1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/PriorityQueue.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/ADT/SetOperations.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/ADT/SmallBitVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallString.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
33#include "llvm/ADT/iterator_range.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/Analysis/AssumptionCache.h"
36#include "llvm/Analysis/CodeMetrics.h"
37#include "llvm/Analysis/ConstantFolding.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/Loads.h"
42#include "llvm/Analysis/LoopAccessAnalysis.h"
43#include "llvm/Analysis/LoopInfo.h"
44#include "llvm/Analysis/MemoryLocation.h"
45#include "llvm/Analysis/OptimizationRemarkEmitter.h"
46#include "llvm/Analysis/ScalarEvolution.h"
47#include "llvm/Analysis/ScalarEvolutionExpressions.h"
48#include "llvm/Analysis/TargetLibraryInfo.h"
49#include "llvm/Analysis/TargetTransformInfo.h"
50#include "llvm/Analysis/ValueTracking.h"
51#include "llvm/Analysis/VectorUtils.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/DerivedTypes.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
68#include "llvm/IR/PatternMatch.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
78#include "llvm/Support/Casting.h"
79#include "llvm/Support/CommandLine.h"
80#include "llvm/Support/Compiler.h"
81#include "llvm/Support/DOTGraphTraits.h"
82#include "llvm/Support/Debug.h"
83#include "llvm/Support/DebugCounter.h"
84#include "llvm/Support/ErrorHandling.h"
85#include "llvm/Support/GraphWriter.h"
86#include "llvm/Support/InstructionCost.h"
87#include "llvm/Support/KnownBits.h"
88#include "llvm/Support/MathExtras.h"
89#include "llvm/Support/raw_ostream.h"
90#include "llvm/Transforms/Utils/InjectTLIMappings.h"
91#include "llvm/Transforms/Utils/Local.h"
92#include "llvm/Transforms/Utils/LoopUtils.h"
93#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

// Short pass name, used for remarks and pass registration.
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

// Debug counter that allows selectively enabling/disabling vectorization of
// individual SLP graphs, e.g. when bisecting a miscompile.
DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");

/// Master switch for the SLP vectorization passes.
static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(Val: true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

/// Enables REVEC mode: vectorization of code that already operates on
/// (narrow) vectors, widening it further.
static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(Val: false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

/// Cost threshold: trees are only vectorized when the modeled gain exceeds
/// this value.
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(Val: 0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(Val: true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> SplitAlternateInstructions(
    "slp-split-alternate-instructions", cl::init(Val: true), cl::Hidden,
    cl::desc("Improve the code quality by splitting alternate instructions"));

/// Upper bound (in bits) on the vector register size assumed by the
/// vectorizer.
static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(Val: 128), cl::Hidden,
                           cl::desc("Attempt to vectorize for this register size in bits"));

/// Hard cap on the vectorization factor; 0 means no cap.
static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(Val: 0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(Val: 100000), cl::Hidden,
                             cl::desc("Limit the size of the SLP scheduling region per block"));

/// Lower bound (in bits) on the vector register size assumed by the
/// vectorizer.
static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(Val: 128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(Val: 12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(Val: 3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead but unlike
// similar limit for operands ordering this is less frequently used, hence
// impact of higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(Val: 2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(Val: 8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    DisableTreeReorder("slp-disable-tree-reorder", cl::init(Val: false), cl::Hidden,
                       cl::desc("Disable tree reordering even if it is "
                                "profitable. Used for testing only."));

static cl::opt<bool>
    ForceStridedLoads("slp-force-strided-loads", cl::init(Val: false), cl::Hidden,
                      cl::desc("Generate strided loads even if they are not "
                               "profitable. Used for testing only."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(Val: false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

/// Enables vectorization of copyable elements.
static cl::opt<bool> VectorizeCopyableElements(
    "slp-copyable-elements", cl::init(Val: true), cl::Hidden,
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));

static cl::opt<unsigned> LoopAwareTripCount(
    "slp-cost-loop-trip-count", cl::init(Val: 2), cl::Hidden,
    cl::desc("Loop trip count, considered by the cost model during "
             "modeling (0=loops are ignored and considered flat code)"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explode.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important thing to filter here are types which are invalid in LLVM
249/// vectors. We also filter target specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
255 if (SLPReVec && isa<FixedVectorType>(Val: Ty))
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(ElemTy: Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
260
261/// Returns the type of the given value/instruction \p V. If it is store,
262/// returns the type of its value operand, for Cmp - the types of the compare
263/// operands and for insertelement - the type os the inserted operand.
264/// Otherwise, just the type of the value is returned.
265static Type *getValueType(Value *V) {
266 if (auto *SI = dyn_cast<StoreInst>(Val: V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(Val: V))
269 return CI->getOperand(i_nocapture: 0)->getType();
270 if (!SLPReVec)
271 if (auto *IE = dyn_cast<InsertElementInst>(Val: V))
272 return IE->getOperand(i_nocapture: 1)->getType();
273 return V->getType();
274}
275
276/// \returns the number of elements for Ty.
277static unsigned getNumElements(Type *Ty) {
278 assert(!isa<ScalableVectorType>(Ty) &&
279 "ScalableVectorType is not supported.");
280 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Ty))
281 return VecTy->getNumElements();
282 return 1;
283}
284
285/// \returns the vector type of ScalarTy based on vectorization factor.
286static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
287 return FixedVectorType::get(ElementType: ScalarTy->getScalarType(),
288 NumElts: VF * getNumElements(Ty: ScalarTy));
289}
290
291/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
292/// which forms type, which splits by \p TTI into whole vector types during
293/// legalization.
294static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
295 Type *Ty, unsigned Sz) {
296 if (!isValidElementType(Ty))
297 return bit_ceil(Value: Sz);
298 // Find the number of elements, which forms full vectors.
299 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
300 if (NumParts == 0 || NumParts >= Sz)
301 return bit_ceil(Value: Sz);
302 return bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts)) * NumParts;
303}
304
305/// Returns the number of elements of the given type \p Ty, not greater than \p
306/// Sz, which forms type, which splits by \p TTI into whole vector types during
307/// legalization.
308static unsigned
309getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
310 unsigned Sz) {
311 if (!isValidElementType(Ty))
312 return bit_floor(Value: Sz);
313 // Find the number of elements, which forms full vectors.
314 unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
315 if (NumParts == 0 || NumParts >= Sz)
316 return bit_floor(Value: Sz);
317 unsigned RegVF = bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts));
318 if (RegVF > Sz)
319 return bit_floor(Value: Sz);
320 return (Sz / RegVF) * RegVF;
321}
322
323static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
324 SmallVectorImpl<int> &Mask) {
325 // The ShuffleBuilder implementation use shufflevector to splat an "element".
326 // But the element have different meaning for SLP (scalar) and REVEC
327 // (vector). We need to expand Mask into masks which shufflevector can use
328 // directly.
329 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
330 for (unsigned I : seq<unsigned>(Size: Mask.size()))
331 for (auto [J, MaskV] : enumerate(First: MutableArrayRef(NewMask).slice(
332 N: I * VecTyNumElements, M: VecTyNumElements)))
333 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
334 : Mask[I] * VecTyNumElements + J;
335 Mask.swap(RHS&: NewMask);
336}
337
338/// \returns the number of groups of shufflevector
339/// A group has the following features
340/// 1. All of value in a group are shufflevector.
341/// 2. The mask of all shufflevector is isExtractSubvectorMask.
342/// 3. The mask of all shufflevector uses all of the elements of the source.
343/// e.g., it is 1 group (%0)
344/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
345/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
346/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
347/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
348/// it is 2 groups (%3 and %4)
349/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
350/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
351/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
352/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
353/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
354/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
355/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
356/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
357/// it is 0 group
358/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
359/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
360/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
361/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  // All values must be shufflevectors to form groups at all.
  if (!all_of(Range&: VL, P: IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  // Each shuffle must extract an exact fraction of the source vector.
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  // Number of shuffles needed to cover all elements of one source.
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  // Walk VL in GroupSize-sized chunks; each chunk must consist of
  // extract-subvector shuffles of the same source that together use every
  // chunk of that source exactly once.
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(Val: VL[I]);
    Value *Src = SV->getOperand(i_nocapture: 0);
    ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
    // Bit K is set when the K-th ShuffleMaskSize-sized chunk of Src is used.
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Range&: Group, P: [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(Val: V);
          // From the same source.
          if (SV->getOperand(i_nocapture: 0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    // Every chunk of the source must be covered by the group.
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
401
402/// \returns a shufflevector mask which is used to vectorize shufflevectors
403/// e.g.,
404/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
405/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
406/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
407/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
408/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
409/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
410/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
411/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
412/// the result is
413/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
414static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
415 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
416 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
417 unsigned SVNumElements =
418 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
419 SmallVector<int> Mask;
420 unsigned AccumulateLength = 0;
421 for (Value *V : VL) {
422 auto *SV = cast<ShuffleVectorInst>(Val: V);
423 for (int M : SV->getShuffleMask())
424 Mask.push_back(Elt: M == PoisonMaskElem ? PoisonMaskElem
425 : AccumulateLength + M);
426 AccumulateLength += SVNumElements;
427 }
428 return Mask;
429}
430
431/// \returns True if the value is a constant (but not globals/constant
432/// expressions).
433static bool isConstant(Value *V) {
434 return isa<Constant>(Val: V) && !isa<ConstantExpr, GlobalValue>(Val: V);
435}
436
437/// Checks if \p V is one of vector-like instructions, i.e. undef,
438/// insertelement/extractelement with constant indices for fixed vector type or
439/// extractvalue instruction.
440static bool isVectorLikeInstWithConstOps(Value *V) {
441 if (!isa<InsertElementInst, ExtractElementInst>(Val: V) &&
442 !isa<ExtractValueInst, UndefValue>(Val: V))
443 return false;
444 auto *I = dyn_cast<Instruction>(Val: V);
445 if (!I || isa<ExtractValueInst>(Val: I))
446 return true;
447 if (!isa<FixedVectorType>(Val: I->getOperand(i: 0)->getType()))
448 return false;
449 if (isa<ExtractElementInst>(Val: I))
450 return isConstant(V: I->getOperand(i: 1));
451 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
452 return isConstant(V: I->getOperand(i: 2));
453}
454
455/// Returns power-of-2 number of elements in a single register (part), given the
456/// total number of elements \p Size and number of registers (parts) \p
457/// NumParts.
458static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
459 return std::min<unsigned>(a: Size, b: bit_ceil(Value: divideCeil(Numerator: Size, Denominator: NumParts)));
460}
461
462/// Returns correct remaining number of elements, considering total amount \p
463/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
464/// and current register (part) \p Part.
/// Returns correct remaining number of elements, considering total amount
/// \p Size, (power-of-2 number) of elements in a single register
/// \p PartNumElems and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  // The final part may be partial; clamp to what is actually left.
  unsigned Remaining = Size - Part * PartNumElems;
  return PartNumElems < Remaining ? PartNumElems : Remaining;
}
469
470#if !defined(NDEBUG)
471/// Print a short descriptor of the instruction bundle suitable for debug output.
472static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
473 std::string Result;
474 raw_string_ostream OS(Result);
475 if (Idx >= 0)
476 OS << "Idx: " << Idx << ", ";
477 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
478 return Result;
479}
480#endif
481
482/// \returns true if all of the instructions in \p VL are in the same block or
483/// false otherwise.
484static bool allSameBlock(ArrayRef<Value *> VL) {
485 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
486 if (It == VL.end())
487 return false;
488 Instruction *I0 = cast<Instruction>(Val: *It);
489 if (all_of(Range&: VL, P: isVectorLikeInstWithConstOps))
490 return true;
491
492 BasicBlock *BB = I0->getParent();
493 for (Value *V : iterator_range(It, VL.end())) {
494 if (isa<PoisonValue>(Val: V))
495 continue;
496 auto *II = dyn_cast<Instruction>(Val: V);
497 if (!II)
498 return false;
499
500 if (BB != II->getParent())
501 return false;
502 }
503 return true;
504}
505
506/// \returns True if all of the values in \p VL are constants (but not
507/// globals/constant expressions).
508static bool allConstant(ArrayRef<Value *> VL) {
509 // Constant expressions and globals can't be vectorized like normal integer/FP
510 // constants.
511 return all_of(Range&: VL, P: isConstant);
512}
513
514/// \returns True if all of the values in \p VL are identical or some of them
515/// are UndefValue.
516static bool isSplat(ArrayRef<Value *> VL) {
517 Value *FirstNonUndef = nullptr;
518 for (Value *V : VL) {
519 if (isa<UndefValue>(Val: V))
520 continue;
521 if (!FirstNonUndef) {
522 FirstNonUndef = V;
523 continue;
524 }
525 if (V != FirstNonUndef)
526 return false;
527 }
528 return FirstNonUndef != nullptr;
529}
530
531/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
532/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
533/// patterns that make it effectively commutative (like equality comparisons
534/// with zero).
535/// In most cases, users should not call this function directly (since \p I and
536/// \p InstWithUses are the same). However, when analyzing interchangeable
537/// instructions, we need to use the converted opcode along with the original
538/// uses.
539/// \param I The instruction to check for commutativity
540/// \param ValWithUses The value whose uses are analyzed for special
541/// patterns
static bool isCommutative(Instruction *I, Value *ValWithUses,
                          bool IsCopyable = false) {
  if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(Val: I))
    return BO->isCommutative() ||
           // A sub is effectively commutative when every use of the result is
           // insensitive to the operand order: an (in)equality test against
           // zero, or an abs() call (with the nsw/int_min_poison caveats
           // below).
           (BO->getOpcode() == Instruction::Sub &&
            ValWithUses->hasUseList() &&
            !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
            all_of(
                Range: ValWithUses->uses(),
                P: [&](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(V: U.getUser(),
                            P: m_ICmp(Pred, L: m_Specific(V: U.get()), R: m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  auto *I = dyn_cast<BinaryOperator>(Val: U.get());
                  return match(V: U.getUser(),
                               P: m_Intrinsic<Intrinsic::abs>(
                                   Op0: m_Specific(V: U.get()), Op1: m_ConstantInt(CI&: Flag))) &&
                         ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
                          Flag->isOne());
                })) ||
           // An fsub is effectively commutative when every use is fabs, which
           // cancels the sign of the result.
           (BO->getOpcode() == Instruction::FSub &&
            ValWithUses->hasUseList() &&
            !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
            all_of(Range: ValWithUses->uses(), P: [](const Use &U) {
              return match(V: U.getUser(),
                           P: m_Intrinsic<Intrinsic::fabs>(Op0: m_Specific(V: U.get())));
            }));
  // All other instructions (including intrinsics): defer to
  // Instruction::isCommutative.
  return I->isCommutative();
}
578
579/// Checks if the operand is commutative. In commutative operations, not all
580/// operands might commutable, e.g. for fmuladd only 2 first operands are
581/// commutable.
582static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
583 bool IsCopyable = false) {
584 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
585 "The instruction is not commutative.");
586 if (isa<CmpInst>(Val: I))
587 return true;
588 if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) {
589 switch (BO->getOpcode()) {
590 case Instruction::Sub:
591 case Instruction::FSub:
592 return true;
593 default:
594 break;
595 }
596 }
597 return I->isCommutableOperand(Op);
598}
599
600/// This is a helper function to check whether \p I is commutative.
601/// This is a convenience wrapper that calls the two-parameter version of
602/// isCommutative with the same instruction for both parameters. This is
603/// the common case where the instruction being checked for commutativity
604/// is the same as the instruction whose uses are analyzed for special
605/// patterns (see the two-parameter version above for details).
606/// \param I The instruction to check for commutativity
607/// \returns true if the instruction is commutative, false otherwise
608static bool isCommutative(Instruction *I) { return isCommutative(I, ValWithUses: I); }
609
610/// \returns number of operands of \p I, considering commutativity. Returns 2
611/// for commutative intrinsics.
612/// \param I The instruction to check for commutativity
613static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
614 if (isa<IntrinsicInst>(Val: I) && isCommutative(I)) {
615 // IntrinsicInst::isCommutative returns true if swapping the first "two"
616 // arguments to the intrinsic produces the same result.
617 constexpr unsigned IntrinsicNumOperands = 2;
618 return IntrinsicNumOperands;
619 }
620 return I->getNumOperands();
621}
622
623template <typename T>
624static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
625 unsigned Offset) {
626 static_assert(std::is_same_v<T, InsertElementInst> ||
627 std::is_same_v<T, ExtractElementInst>,
628 "unsupported T");
629 int Index = Offset;
630 if (const auto *IE = dyn_cast<T>(Inst)) {
631 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
632 if (!VT)
633 return std::nullopt;
634 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
635 if (!CI)
636 return std::nullopt;
637 if (CI->getValue().uge(VT->getNumElements()))
638 return std::nullopt;
639 Index *= VT->getNumElements();
640 Index += CI->getZExtValue();
641 return Index;
642 }
643 return std::nullopt;
644}
645
646/// \returns inserting or extracting index of InsertElement, ExtractElement or
647/// InsertValue instruction, using Offset as base offset for index.
648/// \returns std::nullopt if the index is not an immediate.
649static std::optional<unsigned> getElementIndex(const Value *Inst,
650 unsigned Offset = 0) {
651 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
652 return Index;
653 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
654 return Index;
655
656 int Index = Offset;
657
658 const auto *IV = dyn_cast<InsertValueInst>(Val: Inst);
659 if (!IV)
660 return std::nullopt;
661
662 Type *CurrentType = IV->getType();
663 for (unsigned I : IV->indices()) {
664 if (const auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
665 Index *= ST->getNumElements();
666 CurrentType = ST->getElementType(N: I);
667 } else if (const auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
668 Index *= AT->getNumElements();
669 CurrentType = AT->getElementType();
670 } else {
671 return std::nullopt;
672 }
673 Index += I;
674 }
675 return Index;
676}
677
678/// \returns true if all of the values in \p VL use the same opcode.
679/// For comparison instructions, also checks if predicates match.
680/// PoisonValues are considered matching.
681/// Interchangeable instructions are not considered.
682static bool allSameOpcode(ArrayRef<Value *> VL) {
683 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
684 if (It == VL.end())
685 return true;
686 Instruction *MainOp = cast<Instruction>(Val: *It);
687 unsigned Opcode = MainOp->getOpcode();
688 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
689 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
690 : CmpInst::BAD_ICMP_PREDICATE;
691 return std::all_of(first: It, last: VL.end(), pred: [&](Value *V) {
692 if (auto *CI = dyn_cast<CmpInst>(Val: V))
693 return BasePred == CI->getPredicate();
694 if (auto *I = dyn_cast<Instruction>(Val: V))
695 return I->getOpcode() == Opcode;
696 return isa<PoisonValue>(Val: V);
697 });
698}
699
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace
716
717/// Prepares a use bitset for the given mask either for the first argument or
718/// for the second.
719static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
720 UseMask MaskArg) {
721 SmallBitVector UseMask(VF, true);
722 for (auto [Idx, Value] : enumerate(First&: Mask)) {
723 if (Value == PoisonMaskElem) {
724 if (MaskArg == UseMask::UndefsAsMask)
725 UseMask.reset(Idx);
726 continue;
727 }
728 if (MaskArg == UseMask::FirstArg && Value < VF)
729 UseMask.reset(Idx: Value);
730 else if (MaskArg == UseMask::SecondArg && Value >= VF)
731 UseMask.reset(Idx: Value - VF);
732 }
733 return UseMask;
734}
735
736/// Checks if the given value is actually an undefined constant vector.
737/// Also, if the \p UseMask is not empty, tries to check if the non-masked
738/// elements actually mask the insertelement buildvector, if any.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  // A set bit in the result means the corresponding element is known to be
  // undef (or poison, if IsPoisonOnly).
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(Val: V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(Val: V);
  if (!C) {
    if (!UseMask.empty()) {
      // Non-constant vector: walk an insertelement chain backwards. Each
      // insert of a defined value at a known index clears the corresponding
      // result bit, unless that index is masked out by UseMask.
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Val: Base)) {
        Base = II->getOperand(i_nocapture: 0);
        if (isa<T>(II->getOperand(i_nocapture: 1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(Inst: II);
        if (!Idx) {
          // Unknown insert position - conservatively report nothing undef.
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(Idx: *Idx))
          Res.reset(Idx: *Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        // Recurse into the base vector of the insertelement chain.
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  // Constant vector: inspect every aggregate element directly.
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(Elt: I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(Idx: I))))
        Res.reset(Idx: I);
  }
  return Res;
}
785
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// \returns the shuffle kind (select/one-source/two-source permute), or
/// std::nullopt if \p VL cannot be modeled as a shuffle of at most two
/// vectors.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  // Size = the widest fixed source-vector width among the extracts; indices
  // >= Size are undefined behavior and are skipped below.
  unsigned Size =
      std::accumulate(first: VL.begin(), last: VL.end(), init: 0u, binary_op: [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(Val: V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(a: S, b: VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  // True if at least one extract reads from a source proven not to be poison.
  bool HasNonUndefVec = any_of(Range&: VL, P: [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(Val: V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Val: Vec))
      return false;
    return isGuaranteedNotToBePoison(V: Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(Val: VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(Val: VL[I]);
    if (isa<ScalableVectorType>(Val: EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(V: Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Val: Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(Val: EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(Val: EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(RHS: Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // Fully-undef sources do not count towards the two-operand limit when a
    // known non-poison source exists.
    if (isUndefVector(V: Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      // Elements from the second source are addressed past the first
      // source's lanes, as in a two-operand shufflevector mask.
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
895
896/// \returns True if Extract{Value,Element} instruction extracts element Idx.
897static std::optional<unsigned> getExtractIndex(const Instruction *E) {
898 unsigned Opcode = E->getOpcode();
899 assert((Opcode == Instruction::ExtractElement ||
900 Opcode == Instruction::ExtractValue) &&
901 "Expected extractelement or extractvalue instruction.");
902 if (Opcode == Instruction::ExtractElement) {
903 auto *CI = dyn_cast<ConstantInt>(Val: E->getOperand(i: 1));
904 if (!CI)
905 return std::nullopt;
906 // Check if the index is out of bound - we can get the source vector from
907 // operand 0
908 unsigned Idx = CI->getZExtValue();
909 auto *EE = cast<ExtractElementInst>(Val: E);
910 const unsigned VF = ::getNumElements(Ty: EE->getVectorOperandType());
911 if (Idx >= VF)
912 return std::nullopt;
913 return Idx;
914 }
915 auto *EI = cast<ExtractValueInst>(Val: E);
916 if (EI->getNumIndices() != 1)
917 return std::nullopt;
918 return *EI->idx_begin();
919}
920
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks. (Defined later in the
/// file; forward-declared for use above its definition.)
static bool areAllOperandsNonInsts(Value *V);
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from the different blocks. (Forward declaration.)
static bool isUsedOutsideBlock(Value *V);
/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block. (Forward declaration.)
static bool doesNotNeedToBeScheduled(Value *V);
935
936/// \returns true if \p Opcode is allowed as part of the main/alternate
937/// instruction for SLP vectorization.
938///
939/// Example of unsupported opcode is SDIV that can potentially cause UB if the
940/// "shuffled out" lane would result in division by zero.
941static bool isValidForAlternation(unsigned Opcode) {
942 return !Instruction::isIntDivRem(Opcode);
943}
944
945namespace {
946
/// Helper class that determines VL can use the same opcode.
/// Alternate instruction is supported. In addition, it supports interchangeable
/// instruction. An interchangeable instruction is an instruction that can be
/// converted to another instruction with same semantics. For example, x << 1 is
/// equal to x * 2. x * 1 is equal to x | 0.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast32_t;
  /// Sort SupportedOp because it is used by binary_search.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  static_assert(llvm::is_sorted_constexpr(Range: SupportedOp) &&
                "SupportedOp is not sorted.");
  // One bit per opcode an instruction may be converted to; MainOpBIT stands
  // for "keep the original (possibly unsupported) opcode as-is".
  enum : MaskType {
    ShlBIT = 1,
    AShrBIT = 1 << 1,
    MulBIT = 1 << 2,
    AddBIT = 1 << 3,
    SubBIT = 1 << 4,
    AndBIT = 1 << 5,
    OrBIT = 1 << 6,
    XorBIT = 1 << 7,
    MainOpBIT = 1 << 8,
    LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position. We check the
  /// right-hand side first (1). If the right hand side is not a ConstantInt and
  /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
  /// side (0).
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    (void)SupportedOp;
    auto *BinOp = cast<BinaryOperator>(Val: I);
    if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 1)))
      return {CI, 1};
    // Non-commutative ops only allow the constant on the right-hand side.
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  /// Per-instruction conversion state: which opcodes the tracked instruction
  /// can still be converted to, and which opcodes were actually observed.
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// The bit it sets represents whether MainOp can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
    /// 1]. SeenBefore is used to know what operations have been seen before.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Return false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask will destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      // Narrow Mask to the intersection: only opcodes every added
      // instruction can be converted to remain set.
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(OpcodeInMaskForm: MainOpBIT, InterchangeableMask: MainOpBIT);
    }
    /// Pick the final opcode: prefer the original opcode, then the remaining
    /// candidates in a fixed priority order (Shl first, Xor last).
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    /// Return true if the instruction can be converted to \p Opcode.
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      // Binary opcodes outside SupportedOp are never convertible.
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        return false;
      default:
        break;
      }
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    /// Build the operand list for instruction I rewritten with opcode of
    /// \p To, translating I's constant operand accordingly (e.g. shl's
    /// log-amount becomes mul's power-of-two multiplier).
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      // NOTE(review): CI is dereferenced below without a null check; callers
      // presumably only reach here when the add() bookkeeping guaranteed a
      // constant operand - verify against the trySet/add call sites.
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      Type *RHSType = I->getOperand(i: Pos)->getType();
      Constant *RHS;
      switch (FromOpcode) {
      case Instruction::Shl:
        // x << c == x * (1 << c); x << 0 == identity of any target opcode.
        if (ToOpcode == Instruction::Mul) {
          RHS = ConstantInt::get(
              Ty: RHSType, V: APInt::getOneBitSet(numBits: FromCIValueBitWidth,
                                               BitNo: FromCIValue.getZExtValue()));
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Mul:
        // x * 2^c == x << c; x * 1 == identity of any target opcode.
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          RHS = ConstantInt::get(
              Ty: RHSType, V: APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        } else {
          // x + c == x - (-c) and vice versa.
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          APInt NegatedVal = APInt(FromCIValue);
          NegatedVal.negate();
          RHS = ConstantInt::get(Ty: RHSType, V: NegatedVal);
        }
        break;
      case Instruction::And:
        // x & -1 == identity of any target opcode.
        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      default:
        // Remaining opcodes (AShr/Or/Xor) only convert when the constant is
        // zero, i.e. the operation is an identity.
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      }
      Value *LHS = I->getOperand(i: 1 - Pos);
      // If the target opcode is non-commutative (e.g., shl, sub),
      // force the variable to the left and the constant to the right.
      if (Pos == 1 || !Instruction::isCommutative(Opcode: ToOpcode))
        return SmallVector<Value *>({LHS, RHS});

      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(Opcode: MainOp.I->getOpcode()) &&
           ::isValidForAlternation(Opcode: I->getOpcode());
  }
  // Lazily latch the first instruction that fails to match MainOp as the
  // alternate instruction.
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {}
  /// Add one binary operator to the analysis. Returns false if \p I matches
  /// neither the main nor the (possibly newly initialized) alternate opcode
  /// group.
  bool add(const Instruction *I) {
    assert(isa<BinaryOperator>(I) &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      // Opcodes outside SupportedOp must match exactly.
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    // Compute the set of opcodes this instruction can be converted to, based
    // on its constant operand (identity constants allow any supported
    // opcode).
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        // In-range shift by 0 is an identity; otherwise shl == mul by 2^c.
        if (CIValue.ult(RHS: CIValue.getBitWidth()))
          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      case Instruction::Xor:
        if (CIValue.isZero())
          InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
        break;
      default:
        // AShr/Or by zero are identities.
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  /// Checks if the list of potential opcodes includes \p Opcode.
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(To: I);
  }
};
1252
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the resulting
  /// vectorized output ultimately comes from a shufflevector. For example,
  /// given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///                        <4 x i32> intermediated_1,
  ///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the cost
  /// (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  /// Whether the instruction state represents copyable instructions.
  bool HasCopyables = false;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Checks if the instruction matches either the main or alternate opcode.
  /// \returns
  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
  /// to it
  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
  /// it
  /// - nullptr if \param I cannot be matched or converted to either opcode
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    // NOTE(review): a ZExt is accepted as matching a Select main opcode when
    // not alt-shuffling - presumably for select-of-bool patterns; verify
    // against the vectorization call sites.
    if (MainOp->getOpcode() == Instruction::Select &&
        I->getOpcode() == Instruction::ZExt && !isAltShuffle())
      return MainOp;
    // Prefer AltOp instead of interchangeable instruction of MainOp.
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    if (!I->isBinaryOp())
      return nullptr;
    // Fall back to interchangeable-instruction analysis for binary operators.
    BinOpSameOpcodeHelper Converter(MainOp);
    if (!Converter.add(I) || !Converter.add(I: MainOp))
      return nullptr;
    if (isAltShuffle() && !Converter.hasCandidateOpcode(Opcode: MainOp->getOpcode())) {
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(I: AltOp) &&
          AltConverter.hasCandidateOpcode(Opcode: AltOp->getOpcode()))
        return AltOp;
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      return nullptr;
    return Converter.hasAltOp() ? AltOp : MainOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(Range: MulDiv, Element: getOpcode()) &&
           is_contained(Range: MulDiv, Element: getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(Range: AddSub, Element: getOpcode()) &&
           is_contained(Range: AddSub, Element: getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  /// Checks if the value is a copyable element, i.e. an element that does not
  /// itself carry the main opcode but can be rewritten/duplicated to match it.
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (!HasCopyables)
      return false;
    // Copyables are not supported for alternate shuffles or GEP nodes.
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      return false;
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      return !isa<PoisonValue>(Val: V);
    if (I->getParent() != MainOp->getParent() &&
        (!isVectorLikeInstWithConstOps(V: I) ||
         !isVectorLikeInstWithConstOps(V: MainOp)))
      return true;
    if (I->getOpcode() == MainOp->getOpcode())
      return false;
    if (!I->isBinaryOp())
      return true;
    // A binary op is NOT copyable only if it can be converted to the main
    // opcode without requiring an alternate instruction.
    BinOpSameOpcodeHelper Converter(MainOp);
    return !Converter.add(I) || !Converter.add(I: MainOp) ||
           Converter.hasAltOp() || !Converter.hasCandidateOpcode(Opcode: getOpcode());
  }

  /// Checks if the value is non-schedulable.
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!HasCopyables)
      return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
             doesNotNeedToBeScheduled(V);
    // MainOp for copyables always schedulable to correctly identify
    // non-schedulable copyables.
    if (getMainOp() == V)
      return false;
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        auto *I = dyn_cast<Instruction>(Val: V);
        return !I || isa<PHINode>(Val: I) || I->getParent() != MainOp->getParent() ||
               (doesNotNeedToBeScheduled(V: I) &&
                // If the copyable instructions comes after MainOp
                // (non-schedulable, but used in the block) - cannot vectorize
                // it, will possibly generate use before def.
                !MainOp->comesBefore(Other: I));
      };

      return IsNonSchedulableCopyableElement(V);
    }
    return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
           doesNotNeedToBeScheduled(V);
  }

  /// Checks if the state represents copyable instructions.
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};
1439
1440std::pair<Instruction *, SmallVector<Value *>>
1441convertTo(Instruction *I, const InstructionsState &S) {
1442 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1443 assert(SelectedOp && "Cannot convert the instruction.");
1444 if (I->isBinaryOp()) {
1445 BinOpSameOpcodeHelper Converter(I);
1446 return std::make_pair(x&: SelectedOp, y: Converter.getOperand(I: SelectedOp));
1447 }
1448 return std::make_pair(x&: SelectedOp, y: SmallVector<Value *>(I->operands()));
1449}
1450
1451} // end anonymous namespace
1452
/// Analyzes \p VL and computes its shared main/alternate opcode state.
/// Forward declaration; defined below, needed here by the cmp helpers.
static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
                                       const TargetLibraryInfo &TLI);
1455
1456/// Find an instruction with a specific opcode in VL.
1457/// \param VL Array of values to search through. Must contain only Instructions
1458/// and PoisonValues.
1459/// \param Opcode The instruction opcode to search for
1460/// \returns
1461/// - The first instruction found with matching opcode
1462/// - nullptr if no matching instruction is found
1463static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1464 unsigned Opcode) {
1465 for (Value *V : VL) {
1466 if (isa<PoisonValue>(Val: V))
1467 continue;
1468 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1469 auto *Inst = cast<Instruction>(Val: V);
1470 if (Inst->getOpcode() == Opcode)
1471 return Inst;
1472 }
1473 return nullptr;
1474}
1475
1476/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1477/// compatible instructions or constants, or just some other regular values.
1478static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1479 Value *Op1, const TargetLibraryInfo &TLI) {
1480 return (isConstant(V: BaseOp0) && isConstant(V: Op0)) ||
1481 (isConstant(V: BaseOp1) && isConstant(V: Op1)) ||
1482 (!isa<Instruction>(Val: BaseOp0) && !isa<Instruction>(Val: Op0) &&
1483 !isa<Instruction>(Val: BaseOp1) && !isa<Instruction>(Val: Op1)) ||
1484 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1485 getSameOpcode(VL: {BaseOp0, Op0}, TLI) ||
1486 getSameOpcode(VL: {BaseOp1, Op1}, TLI);
1487}
1488
1489/// \returns true if a compare instruction \p CI has similar "look" and
1490/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1491/// swapped, false otherwise.
1492static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1493 const TargetLibraryInfo &TLI) {
1494 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1495 "Assessing comparisons of different types?");
1496 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1497 CmpInst::Predicate Pred = CI->getPredicate();
1498 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(pred: Pred);
1499
1500 Value *BaseOp0 = BaseCI->getOperand(i_nocapture: 0);
1501 Value *BaseOp1 = BaseCI->getOperand(i_nocapture: 1);
1502 Value *Op0 = CI->getOperand(i_nocapture: 0);
1503 Value *Op1 = CI->getOperand(i_nocapture: 1);
1504
1505 return (BasePred == Pred &&
1506 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1507 (BasePred == SwappedPred &&
1508 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0: Op1, Op1: Op0, TLI));
1509}
1510
1511/// \returns analysis of the Instructions in \p VL described in
1512/// InstructionsState, the Opcode that we suppose the whole list
1513/// could be vectorized even if its structure is diverse.
1514static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1515 const TargetLibraryInfo &TLI) {
1516 // Make sure these are all Instructions.
1517 if (!all_of(Range&: VL, P: IsaPred<Instruction, PoisonValue>))
1518 return InstructionsState::invalid();
1519
1520 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
1521 if (It == VL.end())
1522 return InstructionsState::invalid();
1523
1524 Instruction *MainOp = cast<Instruction>(Val: *It);
1525 unsigned InstCnt = std::count_if(first: It, last: VL.end(), pred: IsaPred<Instruction>);
1526 if ((VL.size() > 2 && !isa<PHINode>(Val: MainOp) && InstCnt < VL.size() / 2) ||
1527 (VL.size() == 2 && InstCnt < 2))
1528 return InstructionsState::invalid();
1529
1530 bool IsCastOp = isa<CastInst>(Val: MainOp);
1531 bool IsBinOp = isa<BinaryOperator>(Val: MainOp);
1532 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
1533 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
1534 : CmpInst::BAD_ICMP_PREDICATE;
1535 Instruction *AltOp = MainOp;
1536 unsigned Opcode = MainOp->getOpcode();
1537 unsigned AltOpcode = Opcode;
1538
1539 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1540 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1541 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1542 UniquePreds.insert(X: BasePred);
1543 UniqueNonSwappedPreds.insert(X: BasePred);
1544 for (Value *V : VL) {
1545 auto *I = dyn_cast<CmpInst>(Val: V);
1546 if (!I)
1547 return false;
1548 CmpInst::Predicate CurrentPred = I->getPredicate();
1549 CmpInst::Predicate SwappedCurrentPred =
1550 CmpInst::getSwappedPredicate(pred: CurrentPred);
1551 UniqueNonSwappedPreds.insert(X: CurrentPred);
1552 if (!UniquePreds.contains(key: CurrentPred) &&
1553 !UniquePreds.contains(key: SwappedCurrentPred))
1554 UniquePreds.insert(X: CurrentPred);
1555 }
1556 // Total number of predicates > 2, but if consider swapped predicates
1557 // compatible only 2, consider swappable predicates as compatible opcodes,
1558 // not alternate.
1559 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1560 }();
1561 // Check for one alternate opcode from another BinaryOperator.
1562 // TODO - generalize to support all operators (types, calls etc.).
1563 Intrinsic::ID BaseID = 0;
1564 SmallVector<VFInfo> BaseMappings;
1565 if (auto *CallBase = dyn_cast<CallInst>(Val: MainOp)) {
1566 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
1567 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
1568 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
1569 return InstructionsState::invalid();
1570 }
1571 bool AnyPoison = InstCnt != VL.size();
1572 // Check MainOp too to be sure that it matches the requirements for the
1573 // instructions.
1574 for (Value *V : iterator_range(It, VL.end())) {
1575 auto *I = dyn_cast<Instruction>(Val: V);
1576 if (!I)
1577 continue;
1578
1579 // Cannot combine poison and divisions.
1580 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1581 // intrinsics/functions only.
1582 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
1583 return InstructionsState::invalid();
1584 unsigned InstOpcode = I->getOpcode();
1585 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
1586 if (BinOpHelper.add(I))
1587 continue;
1588 } else if (IsCastOp && isa<CastInst>(Val: I)) {
1589 Value *Op0 = MainOp->getOperand(i: 0);
1590 Type *Ty0 = Op0->getType();
1591 Value *Op1 = I->getOperand(i: 0);
1592 Type *Ty1 = Op1->getType();
1593 if (Ty0 == Ty1) {
1594 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1595 continue;
1596 if (Opcode == AltOpcode) {
1597 assert(isValidForAlternation(Opcode) &&
1598 isValidForAlternation(InstOpcode) &&
1599 "Cast isn't safe for alternation, logic needs to be updated!");
1600 AltOpcode = InstOpcode;
1601 AltOp = I;
1602 continue;
1603 }
1604 }
1605 } else if (auto *Inst = dyn_cast<CmpInst>(Val: I); Inst && IsCmpOp) {
1606 auto *BaseInst = cast<CmpInst>(Val: MainOp);
1607 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
1608 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
1609 if (Ty0 == Ty1) {
1610 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1611 assert(InstOpcode == AltOpcode &&
1612 "Alternate instructions are only supported by BinaryOperator "
1613 "and CastInst.");
1614 // Check for compatible operands. If the corresponding operands are not
1615 // compatible - need to perform alternate vectorization.
1616 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1617 CmpInst::Predicate SwappedCurrentPred =
1618 CmpInst::getSwappedPredicate(pred: CurrentPred);
1619
1620 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1621 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1622 continue;
1623
1624 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
1625 continue;
1626 auto *AltInst = cast<CmpInst>(Val: AltOp);
1627 if (MainOp != AltOp) {
1628 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
1629 continue;
1630 } else if (BasePred != CurrentPred) {
1631 assert(
1632 isValidForAlternation(InstOpcode) &&
1633 "CmpInst isn't safe for alternation, logic needs to be updated!");
1634 AltOp = I;
1635 continue;
1636 }
1637 CmpInst::Predicate AltPred = AltInst->getPredicate();
1638 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1639 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1640 continue;
1641 }
1642 } else if (InstOpcode == Opcode) {
1643 assert(InstOpcode == AltOpcode &&
1644 "Alternate instructions are only supported by BinaryOperator and "
1645 "CastInst.");
1646 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
1647 if (Gep->getNumOperands() != 2 ||
1648 Gep->getOperand(i_nocapture: 0)->getType() != MainOp->getOperand(i: 0)->getType())
1649 return InstructionsState::invalid();
1650 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
1651 if (!isVectorLikeInstWithConstOps(V: EI))
1652 return InstructionsState::invalid();
1653 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
1654 auto *BaseLI = cast<LoadInst>(Val: MainOp);
1655 if (!LI->isSimple() || !BaseLI->isSimple())
1656 return InstructionsState::invalid();
1657 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
1658 auto *CallBase = cast<CallInst>(Val: MainOp);
1659 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1660 return InstructionsState::invalid();
1661 if (Call->hasOperandBundles() &&
1662 (!CallBase->hasOperandBundles() ||
1663 !std::equal(first1: Call->op_begin() + Call->getBundleOperandsStartIndex(),
1664 last1: Call->op_begin() + Call->getBundleOperandsEndIndex(),
1665 first2: CallBase->op_begin() +
1666 CallBase->getBundleOperandsStartIndex())))
1667 return InstructionsState::invalid();
1668 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
1669 if (ID != BaseID)
1670 return InstructionsState::invalid();
1671 if (!ID) {
1672 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
1673 if (Mappings.size() != BaseMappings.size() ||
1674 Mappings.front().ISA != BaseMappings.front().ISA ||
1675 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1676 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1677 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1678 Mappings.front().Shape.Parameters !=
1679 BaseMappings.front().Shape.Parameters)
1680 return InstructionsState::invalid();
1681 }
1682 }
1683 continue;
1684 }
1685 return InstructionsState::invalid();
1686 }
1687
1688 if (IsBinOp) {
1689 MainOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getMainOpcode());
1690 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1691 AltOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getAltOpcode());
1692 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1693 }
1694 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1695 "Incorrect implementation of allSameOpcode.");
1696 InstructionsState S(MainOp, AltOp);
1697 assert(all_of(VL,
1698 [&](Value *V) {
1699 return isa<PoisonValue>(V) ||
1700 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1701 }) &&
1702 "Invalid InstructionsState.");
1703 return S;
1704}
1705
1706/// \returns true if all of the values in \p VL have the same type or false
1707/// otherwise.
1708static bool allSameType(ArrayRef<Value *> VL) {
1709 Type *Ty = VL.consume_front()->getType();
1710 return all_of(Range&: VL, P: [&](Value *V) { return V->getType() == Ty; });
1711}
1712
1713/// \returns True if in-tree use also needs extract. This refers to
1714/// possible scalar operand in vectorized instruction.
1715static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1716 TargetLibraryInfo *TLI,
1717 const TargetTransformInfo *TTI) {
1718 if (!UserInst)
1719 return false;
1720 unsigned Opcode = UserInst->getOpcode();
1721 switch (Opcode) {
1722 case Instruction::Load: {
1723 LoadInst *LI = cast<LoadInst>(Val: UserInst);
1724 return (LI->getPointerOperand() == Scalar);
1725 }
1726 case Instruction::Store: {
1727 StoreInst *SI = cast<StoreInst>(Val: UserInst);
1728 return (SI->getPointerOperand() == Scalar);
1729 }
1730 case Instruction::Call: {
1731 CallInst *CI = cast<CallInst>(Val: UserInst);
1732 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1733 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
1734 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1735 Arg.value().get() == Scalar;
1736 });
1737 }
1738 default:
1739 return false;
1740 }
1741}
1742
1743/// \returns the AA location that is being access by the instruction.
1744static MemoryLocation getLocation(Instruction *I) {
1745 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1746 return MemoryLocation::get(SI);
1747 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1748 return MemoryLocation::get(LI);
1749 return MemoryLocation();
1750}
1751
1752/// \returns True if the instruction is not a volatile or atomic load/store.
1753static bool isSimple(Instruction *I) {
1754 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1755 return LI->isSimple();
1756 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1757 return SI->isSimple();
1758 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
1759 return !MI->isVolatile();
1760 return true;
1761}
1762
/// Shuffles \p Mask in accordance with the given \p SubMask, i.e. composes the
/// two masks so that afterwards Mask[I] == OldMask[SubMask[I]].
/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
/// one but two input vectors.
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  // An empty SubMask leaves Mask unchanged.
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
       // Check if input scalars were extended to match the size of other node.
       (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
      "SubMask with many inputs support must be larger than the mask.");
  // An empty Mask acts as the identity: the composition is SubMask itself.
  if (Mask.empty()) {
    Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
    return;
  }
  // Compose: NewMask[I] = Mask[SubMask[I]]. Indices at or beyond TermValue
  // (out of range of the shorter mask) are left as poison unless we are
  // extending to two input vectors.
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  int TermValue = std::min(a: Mask.size(), b: SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(RHS&: NewMask);
}
1790
1791/// Order may have elements assigned special value (size) which is out of
1792/// bounds. Such indices only appear on places which correspond to undef values
1793/// (see canReuseExtract for details) and used in order to avoid undef values
1794/// have effect on operands ordering.
1795/// The first loop below simply finds all unused indices and then the next loop
1796/// nest assigns these indices for undef values positions.
1797/// As an example below Order has two undef positions and they have assigned
1798/// values 3 and 7 respectively:
1799/// before: 6 9 5 4 9 2 1 0
1800/// after: 6 3 5 4 7 2 1 0
1801static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1802 const size_t Sz = Order.size();
1803 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1804 SmallBitVector MaskedIndices(Sz);
1805 for (unsigned I = 0; I < Sz; ++I) {
1806 if (Order[I] < Sz)
1807 UnusedIndices.reset(Idx: Order[I]);
1808 else
1809 MaskedIndices.set(I);
1810 }
1811 if (MaskedIndices.none())
1812 return;
1813 assert(UnusedIndices.count() == MaskedIndices.count() &&
1814 "Non-synced masked/available indices.");
1815 int Idx = UnusedIndices.find_first();
1816 int MIdx = MaskedIndices.find_first();
1817 while (MIdx >= 0) {
1818 assert(Idx >= 0 && "Indices must be synced.");
1819 Order[MIdx] = Idx;
1820 Idx = UnusedIndices.find_next(Prev: Idx);
1821 MIdx = MaskedIndices.find_next(Prev: MIdx);
1822 }
1823}
1824
1825/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1826/// Opcode1.
1827static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1828 unsigned Opcode0, unsigned Opcode1) {
1829 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
1830 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1831 for (unsigned Lane : seq<unsigned>(Size: VL.size())) {
1832 if (isa<PoisonValue>(Val: VL[Lane]))
1833 continue;
1834 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1835 OpcodeMask.set(I: Lane * ScalarTyNumElements,
1836 E: Lane * ScalarTyNumElements + ScalarTyNumElements);
1837 }
1838 return OpcodeMask;
1839}
1840
1841/// Replicates the given \p Val \p VF times.
1842static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val,
1843 unsigned VF) {
1844 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1845 "Expected scalar constants.");
1846 SmallVector<Constant *> NewVal(Val.size() * VF);
1847 for (auto [I, V] : enumerate(First&: Val))
1848 std::fill_n(first: NewVal.begin() + I * VF, n: VF, value: V);
1849 return NewVal;
1850}
1851
1852static void inversePermutation(ArrayRef<unsigned> Indices,
1853 SmallVectorImpl<int> &Mask) {
1854 Mask.clear();
1855 const unsigned E = Indices.size();
1856 Mask.resize(N: E, NV: PoisonMaskElem);
1857 for (unsigned I = 0; I < E; ++I)
1858 Mask[Indices[I]] = I;
1859}
1860
1861/// Reorders the list of scalars in accordance with the given \p Mask.
1862static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1863 ArrayRef<int> Mask) {
1864 assert(!Mask.empty() && "Expected non-empty mask.");
1865 SmallVector<Value *> Prev(Scalars.size(),
1866 PoisonValue::get(T: Scalars.front()->getType()));
1867 Prev.swap(RHS&: Scalars);
1868 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1869 if (Mask[I] != PoisonMaskElem)
1870 Scalars[Mask[I]] = Prev[I];
1871}
1872
1873/// Checks if the provided value does not require scheduling. It does not
1874/// require scheduling if this is not an instruction or it is an instruction
1875/// that does not read/write memory and all operands are either not instructions
1876/// or phi nodes or instructions from different blocks.
1877static bool areAllOperandsNonInsts(Value *V) {
1878 auto *I = dyn_cast<Instruction>(Val: V);
1879 if (!I)
1880 return true;
1881 return !mayHaveNonDefUseDependency(I: *I) &&
1882 all_of(Range: I->operands(), P: [I](Value *V) {
1883 auto *IO = dyn_cast<Instruction>(Val: V);
1884 if (!IO)
1885 return true;
1886 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1887 });
1888}
1889
1890/// Checks if the provided value does not require scheduling. It does not
1891/// require scheduling if this is not an instruction or it is an instruction
1892/// that does not read/write memory and all users are phi nodes or instructions
1893/// from the different blocks.
1894static bool isUsedOutsideBlock(Value *V) {
1895 auto *I = dyn_cast<Instruction>(Val: V);
1896 if (!I)
1897 return true;
1898 // Limits the number of uses to save compile time.
1899 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1900 all_of(Range: I->users(), P: [I](User *U) {
1901 auto *IU = dyn_cast<Instruction>(Val: U);
1902 if (!IU)
1903 return true;
1904 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1905 });
1906}
1907
1908/// Checks if the specified value does not require scheduling. It does not
1909/// require scheduling if all operands and all users do not need to be scheduled
1910/// in the current basic block.
1911static bool doesNotNeedToBeScheduled(Value *V) {
1912 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1913}
1914
1915/// Checks if the specified array of instructions does not require scheduling.
1916/// It is so if all either instructions have operands that do not require
1917/// scheduling or their users do not require scheduling since they are phis or
1918/// in other basic blocks.
1919static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1920 return !VL.empty() &&
1921 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1922}
1923
1924/// Returns true if widened type of \p Ty elements with size \p Sz represents
1925/// full vector type, i.e. adding extra element results in extra parts upon type
1926/// legalization.
1927static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1928 unsigned Sz) {
1929 if (Sz <= 1)
1930 return false;
1931 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Val: Ty))
1932 return false;
1933 if (has_single_bit(Value: Sz))
1934 return true;
1935 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
1936 return NumParts > 0 && NumParts < Sz && has_single_bit(Value: Sz / NumParts) &&
1937 Sz % NumParts == 0;
1938}
1939
1940/// Returns number of parts, the type \p VecTy will be split at the codegen
1941/// phase. If the type is going to be scalarized or does not uses whole
1942/// registers, returns 1.
1943static unsigned
1944getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1945 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1946 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
1947 if (NumParts == 0 || NumParts >= Limit)
1948 return 1;
1949 unsigned Sz = getNumElements(Ty: VecTy);
1950 if (NumParts >= Sz || Sz % NumParts != 0 ||
1951 !hasFullVectorsOrPowerOf2(TTI, Ty: VecTy->getElementType(), Sz: Sz / NumParts))
1952 return 1;
1953 return NumParts;
1954}
1955
1956/// Bottom Up SLP Vectorizer.
1957class slpvectorizer::BoUpSLP {
1958 class TreeEntry;
1959 class ScheduleEntity;
1960 class ScheduleData;
1961 class ScheduleCopyableData;
1962 class ScheduleBundle;
1963 class ShuffleCostEstimator;
1964 class ShuffleInstructionBuilder;
1965
  /// If we decide to generate strided load / store, this struct contains all
  /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
  /// and analyzeConstantStrideCandidate. Note that the stride can be given
  /// either as a SCEV or as a Value if it already exists. To get the stride in
  /// bytes, StrideVal (or the value obtained from StrideSCEV) has to be
  /// multiplied by the size of an element of the FixedVectorType.
  struct StridedPtrInfo {
    // Stride as an already materialized IR value; null if only known as SCEV.
    Value *StrideVal = nullptr;
    // Stride as a SCEV expression; used when no IR value exists yet.
    const SCEV *StrideSCEV = nullptr;
    // Vector type of the strided access.
    FixedVectorType *Ty = nullptr;
  };
  // Strided-access info for tree entries chosen for strided load/store
  // generation.
  SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1978
1979public:
1980 /// Tracks the state we can represent the loads in the given sequence.
1981 enum class LoadsState {
1982 Gather,
1983 Vectorize,
1984 ScatterVectorize,
1985 StridedVectorize,
1986 CompressVectorize
1987 };
1988
1989 using ValueList = SmallVector<Value *, 8>;
1990 using InstrList = SmallVector<Instruction *, 16>;
1991 using ValueSet = SmallPtrSet<Value *, 16>;
1992 using StoreList = SmallVector<StoreInst *, 8>;
1993 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
1994 using OrdersType = SmallVector<unsigned, 4>;
1995
  /// Constructs the vectorizer state from the required analysis results and
  /// determines the min/max vector register sizes to use, honoring
  /// command-line overrides.
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    // Ephemeral values (only feeding assumes) must not be vectorized.
    CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }
2022
2023 /// Vectorize the tree that starts with the elements in \p VL.
2024 /// Returns the vectorized root.
2025 Value *vectorizeTree();
2026
2027 /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
2030 Value *
2031 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2032 Instruction *ReductionRoot = nullptr,
2033 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2034 VectorValuesAndScales = {});
2035
2036 /// \returns the cost incurred by unwanted spills and fills, caused by
2037 /// holding live values over call sites.
2038 InstructionCost getSpillCost();
2039
2040 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2041 /// final cost.
2042 InstructionCost
2043 calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {});
2044
2045 /// \returns the vectorization cost of the subtree that starts at \p VL.
2046 /// A negative number means that this is profitable.
2047 InstructionCost getTreeCost(InstructionCost TreeCost,
2048 ArrayRef<Value *> VectorizedVals = {},
2049 InstructionCost ReductionCost = TTI::TCC_Free,
2050 Instruction *RdxRoot = nullptr);
2051
2052 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2053 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2054 void buildTree(ArrayRef<Value *> Roots,
2055 const SmallDenseSet<Value *> &UserIgnoreLst);
2056
2057 /// Construct a vectorizable tree that starts at \p Roots.
2058 void buildTree(ArrayRef<Value *> Roots);
2059
2060 /// Return the scalars of the root node.
2061 ArrayRef<Value *> getRootNodeScalars() const {
2062 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2063 return VectorizableTree.front()->Scalars;
2064 }
2065
  /// Returns the type/is-signed info for the root node in the graph without
  /// casting, or std::nullopt if the root is not a plain vectorized integer
  /// node.
  std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
    const TreeEntry &Root = *VectorizableTree.front();
    // Only plain (non-alt-shuffle) vectorized integer roots are handled.
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    // Prefer the narrowed type recorded by the min-bitwidth analysis, if any.
    auto It = MinBWs.find(Val: &Root);
    if (It != MinBWs.end())
      return std::make_pair(x: IntegerType::get(C&: Root.Scalars.front()->getContext(),
                                           NumBits: It->second.first),
                            y: It->second.second);
    // For an extending root, report the pre-extension source type; SExt means
    // the value is signed.
    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(x: cast<CastInst>(Val: Root.getMainOp())->getSrcTy(),
                            y: Root.getOpcode() == Instruction::SExt);
    return std::nullopt;
  }
2084
2085 /// Checks if the root graph node can be emitted with narrower bitwidth at
2086 /// codegen and returns it signedness, if so.
2087 bool isSignedMinBitwidthRootNode() const {
2088 return MinBWs.at(Val: VectorizableTree.front().get()).second;
2089 }
2090
2091 /// Returns reduction type after minbitdth analysis.
2092 FixedVectorType *getReductionType() const {
2093 if (ReductionBitWidth == 0 ||
2094 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2095 ReductionBitWidth >=
2096 DL->getTypeSizeInBits(
2097 Ty: VectorizableTree.front()->Scalars.front()->getType()))
2098 return getWidenedType(
2099 ScalarTy: VectorizableTree.front()->Scalars.front()->getType(),
2100 VF: VectorizableTree.front()->getVectorFactor());
2101 return getWidenedType(
2102 ScalarTy: IntegerType::get(
2103 C&: VectorizableTree.front()->Scalars.front()->getContext(),
2104 NumBits: ReductionBitWidth),
2105 VF: VectorizableTree.front()->getVectorFactor());
2106 }
2107
2108 /// Returns true if the tree results in one of the reduced bitcasts variants.
2109 bool isReducedBitcastRoot() const {
2110 return VectorizableTree.front()->hasState() &&
2111 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2112 VectorizableTree.front()->CombinedOp ==
2113 TreeEntry::ReducedBitcastBSwap ||
2114 VectorizableTree.front()->CombinedOp ==
2115 TreeEntry::ReducedBitcastLoads ||
2116 VectorizableTree.front()->CombinedOp ==
2117 TreeEntry::ReducedBitcastBSwapLoads) &&
2118 VectorizableTree.front()->State == TreeEntry::Vectorize;
2119 }
2120
2121 /// Returns true if the tree results in the reduced cmp bitcast root.
2122 bool isReducedCmpBitcastRoot() const {
2123 return VectorizableTree.front()->hasState() &&
2124 VectorizableTree.front()->CombinedOp ==
2125 TreeEntry::ReducedCmpBitcast &&
2126 VectorizableTree.front()->State == TreeEntry::Vectorize;
2127 }
2128
2129 /// Builds external uses of the vectorized scalars, i.e. the list of
2130 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2131 /// ExternallyUsedValues contains additional list of external uses to handle
2132 /// vectorization of reductions.
2133 void
2134 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2135
2136 /// Transforms graph nodes to target specific representations, if profitable.
2137 void transformNodes();
2138
  /// Clear the internal data structures that are created by 'buildTree' so the
  /// analysis can be re-run on a fresh bundle of scalars.
  void deleteTree() {
    // Graph nodes and scalar-to-node mappings.
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    // Cached codegen positions.
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    // External uses of the vectorized scalars.
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    // Per-block scheduling regions.
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    // Min-bitwidth analysis results.
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
    CurrentLoopNest.clear();
  }
2174
  /// Returns the number of entries (nodes) in the current vectorizable tree.
  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Returns the base graph size, before any transformations.
  unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2179
2180 /// Perform LICM and CSE on the newly generated gather sequences.
2181 void optimizeGatherSequence();
2182
2183 /// Does this non-empty order represent an identity order? Identity
2184 /// should be represented as an empty order, so this is used to
2185 /// decide if we can canonicalize a computed order. Undef elements
2186 /// (represented as size) are ignored.
2187 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2188 assert(!Order.empty() && "expected non-empty order");
2189 const unsigned Sz = Order.size();
2190 return all_of(Range: enumerate(First&: Order), P: [&](const auto &P) {
2191 return P.value() == P.index() || P.value() == Sz;
2192 });
2193 }
2194
2195 /// Checks if the specified gather tree entry \p TE can be represented as a
2196 /// shuffled vector entry + (possibly) permutation with other gathers. It
2197 /// implements the checks only for possibly ordered scalars (Loads,
2198 /// ExtractElement, ExtractValue), which can be part of the graph.
2199 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2200 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2201 /// node might be ignored.
2202 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2203 bool TopToBottom,
2204 bool IgnoreReorder);
2205
2206 /// Sort loads into increasing pointers offsets to allow greater clustering.
2207 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2208
2209 /// Gets reordering data for the given tree entry. If the entry is vectorized
2210 /// - just return ReorderIndices, otherwise check if the scalars can be
2211 /// reordered and return the most optimal order.
2212 /// \return std::nullopt if ordering is not important, empty order, if
2213 /// identity order is important, or the actual order.
2214 /// \param TopToBottom If true, include the order of vectorized stores and
2215 /// insertelement nodes, otherwise skip them.
2216 /// \param IgnoreReorder true, if the root node order can be ignored.
2217 std::optional<OrdersType>
2218 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2219
2220 /// Checks if it is profitable to reorder the current tree.
2221 /// If the tree does not contain many profitable reordable nodes, better to
2222 /// skip it to save compile time.
2223 bool isProfitableToReorder() const;
2224
2225 /// Reorders the current graph to the most profitable order starting from the
2226 /// root node to the leaf nodes. The best order is chosen only from the nodes
2227 /// of the same size (vectorization factor). Smaller nodes are considered
2228 /// parts of subgraph with smaller VF and they are reordered independently. We
2229 /// can make it because we still need to extend smaller nodes to the wider VF
2230 /// and we can merge reordering shuffles with the widening shuffles.
2231 void reorderTopToBottom();
2232
2233 /// Reorders the current graph to the most profitable order starting from
2234 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2235 /// number of reshuffles if the leaf nodes use the same order. In this case we
2236 /// can merge the orders and just shuffle user node instead of shuffling its
2237 /// operands. Plus, even the leaf nodes have different orders, it allows to
2238 /// sink reordering in the graph closer to the root node and merge it later
2239 /// during analysis.
2240 void reorderBottomToTop(bool IgnoreReorder = false);
2241
2242 /// \return The vector element size in bits to use when vectorizing the
2243 /// expression tree ending at \p V. If V is a store, the size is the width of
2244 /// the stored value. Otherwise, the size is the width of the largest loaded
2245 /// value reaching V. This method is used by the vectorizer to calculate
2246 /// vectorization factors.
2247 unsigned getVectorElementSize(Value *V);
2248
2249 /// Compute the minimum type sizes required to represent the entries in a
2250 /// vectorizable tree.
2251 void computeMinimumValueSizes();
2252
  /// \returns maximum vector register size as set by TTI or overridden by
  /// cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  /// \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }
2262
  /// \returns the minimum vectorization factor for scalars of size \p Sz:
  /// MinVecRegSize / Sz, but never less than 2.
  unsigned getMinVF(unsigned Sz) const {
    return std::max(a: 2U, b: getMinVecRegSize() / Sz);
  }
2266
2267 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2268 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2269 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2270 return MaxVF ? MaxVF : UINT_MAX;
2271 }
2272
2273 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2274 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2275 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2276 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2277 ///
2278 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2279 unsigned canMapToVector(Type *T) const;
2280
2281 /// \returns True if the VectorizableTree is both tiny and not fully
2282 /// vectorizable. We do not vectorize such trees.
2283 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2284
2285 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2286 /// It may happen, if all gather nodes are loads and they cannot be
2287 /// "clusterized". In this case even subgraphs cannot be vectorized more
2288 /// effectively than the base graph.
2289 bool isTreeNotExtendable() const;
2290
2291 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2292 Align Alignment, const int64_t Diff,
2293 const size_t Sz) const;
2294
2295 /// Return true if an array of scalar loads can be replaced with a strided
2296 /// load (with constant stride).
2297 ///
2298 /// It is possible that the load gets "widened". Suppose that originally each
2299 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2300 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2301 /// ...
2302 /// %b + 0 * %s + (w - 1)
2303 ///
2304 /// %b + 1 * %s + 0
2305 /// %b + 1 * %s + 1
2306 /// %b + 1 * %s + 2
2307 /// ...
2308 /// %b + 1 * %s + (w - 1)
2309 /// ...
2310 ///
2311 /// %b + (n - 1) * %s + 0
2312 /// %b + (n - 1) * %s + 1
2313 /// %b + (n - 1) * %s + 2
2314 /// ...
2315 /// %b + (n - 1) * %s + (w - 1)
2316 ///
2317 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2318 ///
2319 /// \param PointerOps list of pointer arguments of loads.
2320 /// \param ElemTy original scalar type of loads.
2321 /// \param Alignment alignment of the first load.
2322 /// \param SortedIndices is the order of PointerOps as returned by
2323 /// `sortPtrAccesses`
  /// \param Diff Pointer difference between the lowest and the highest pointer
2325 /// in `PointerOps` as returned by `getPointersDiff`.
2326 /// \param Ptr0 first pointer in `PointersOps`.
2327 /// \param PtrN last pointer in `PointersOps`.
2328 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2329 /// of `SPtrInfo` necessary to generate the strided load later.
2330 bool analyzeConstantStrideCandidate(
2331 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2332 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2333 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2334
2335 /// Return true if an array of scalar loads can be replaced with a strided
2336 /// load (with run-time stride).
2337 /// \param PointerOps list of pointer arguments of loads.
2338 /// \param ScalarTy type of loads.
  /// \param CommonAlignment common alignment of loads as computed by
2340 /// `computeCommonAlignment<LoadInst>`.
  /// \param SortedIndices is a list of indices computed by this function such
  /// that the sequence `PointerOps[SortedIndices[0]],
  /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
  /// ordered by the coefficient of the stride. For example, if PointerOps is
  /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
  /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
  /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
  /// \param SPtrInfo If the function returns `true`, it also sets all the fields
2349 /// of `SPtrInfo` necessary to generate the strided load later.
2350 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2351 Align CommonAlignment,
2352 SmallVectorImpl<unsigned> &SortedIndices,
2353 StridedPtrInfo &SPtrInfo) const;
2354
2355 /// Checks if the given array of loads can be represented as a vectorized,
2356 /// scatter or just simple gather.
2357 /// \param VL list of loads.
2358 /// \param VL0 main load value.
2359 /// \param Order returned order of load instructions.
2360 /// \param PointerOps returned list of pointer operands.
2361 /// \param BestVF return best vector factor, if recursive check found better
2362 /// vectorization sequences rather than masked gather.
2363 /// \param TryRecursiveCheck used to check if long masked gather can be
  /// represented as a series of loads/insert subvector, if profitable.
2365 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2366 SmallVectorImpl<unsigned> &Order,
2367 SmallVectorImpl<Value *> &PointerOps,
2368 StridedPtrInfo &SPtrInfo,
2369 unsigned *BestVF = nullptr,
2370 bool TryRecursiveCheck = true) const;
2371
2372 /// Registers non-vectorizable sequence of loads
2373 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2374 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2375 }
2376
2377 /// Checks if the given loads sequence is known as not vectorizable
2378 template <typename T>
2379 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2380 return ListOfKnonwnNonVectorizableLoads.contains(V: hash_value(VL));
2381 }
2382
2383 OptimizationRemarkEmitter *getORE() { return ORE; }
2384
  /// This structure holds any data we need about the edges being traversed
  /// during buildTreeRec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print to the given stream.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    /// Debug print to dbgs().
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
    /// Two edges are equal iff they refer to the same operand index of the
    /// same user tree entry.
    bool operator == (const EdgeInfo &Other) const {
      return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
    }

    /// An EdgeInfo is "valid" (truthy) once it has a user tree entry set.
    operator bool() const { return UserTE != nullptr; }
  };
2416 friend struct DenseMapInfo<EdgeInfo>;
2417
  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though it shall
    // be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if all
    // scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in the different lane.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 4;
    /// The same load multiple times. This should have a better score than
    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
    /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
    /// for a vector load and 1.0 for a broadcast.
    static const int ScoreSplatLoads = 3;
    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
    static const int ScoreReversedLoads = 3;
    /// A load candidate for masked gather.
    static const int ScoreMaskedGatherCandidate = 1;
    /// ExtractElementInst from same vector and consecutive indexes.
    static const int ScoreConsecutiveExtracts = 4;
    /// ExtractElementInst from same vector and reversed indices.
    static const int ScoreReversedExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g, add + sub).
    static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// Score if all users are vectorized.
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
    /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
    /// MainAltOps.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(Ty: V1->getType()) ||
          !isValidElementType(Ty: V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(Val: V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.isVectorized(V: U);
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
                                          NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      // If V1 and V2 already share a tree entry, reusing them together is as
      // profitable as a splat load; otherwise score as a failure.
      auto CheckSameEntryOrFail = [&]() {
        if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V: V1); !TEs1.empty()) {
          SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1);
          if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V: V2);
              !TEs2.empty() &&
              any_of(Range&: TEs2, P: [&](TreeEntry *E) { return Set.contains(Ptr: E); }))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(Val: V1);
      auto *LI2 = dyn_cast<LoadInst>(Val: V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int64_t> Dist = getPointersDiff(
            ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
            PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
                  getUnderlyingObject(V: LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(i: *Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases. It is ok for non-power-2 vectorization and may produce
        // better results. It should not affect current vectorization.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(Val: V1);
      auto *C2 = dyn_cast<Constant>(Val: V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Consider constants and buildvector compatible.
      if ((C1 && isa<InsertElementInst>(Val: V2)) ||
          (C2 && isa<InsertElementInst>(Val: V1)))
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        // Compiler can easily combine poison and extractelement <non-poison> or
        // undef and extractelement <poison>. But combining undef +
        // extractelement <non-poison-but-may-produce-poison> requires some
        // extra operations.
        if (isa<UndefValue>(Val: V2))
          return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
                     ? LookAheadHeuristics::ScoreConsecutiveExtracts
                     : LookAheadHeuristics::ScoreSameOpcode;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V: V2,
                  P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
                                                             R: m_Undef())))) {
          // Undefs are always profitable for extractelements.
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(x: Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            if (std::abs(x: Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(Val: V1);
      auto *I2 = dyn_cast<Instruction>(Val: V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        Value *V;
        Value *Cond;
        // ZExt i1 to something must be considered same opcode for select i1
        // cmp, x, y
        // Required to better match the transformation after
        // BoUpSLP::matchesInversedZExtSelect analysis.
        if ((match(V: I1, P: m_ZExt(Op: m_Value(V))) &&
             match(V: I2, P: m_Select(C: m_Value(V&: Cond), L: m_Value(), R: m_Value())) &&
             V->getType() == Cond->getType()) ||
            (match(V: I2, P: m_ZExt(Op: m_Value(V))) &&
             match(V: I1, P: m_Select(C: m_Value(V&: Cond), L: m_Value(), R: m_Value())) &&
             V->getType() == Cond->getType()))
          return LookAheadHeuristics::ScoreSameOpcode;
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(Elt: I1);
        Ops.push_back(Elt: I2);
        InstructionsState S = getSameOpcode(VL: Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Range&: Ops, P: [&S](Value *V) {
              return isa<PoisonValue>(Val: V) ||
                     cast<Instruction>(Val: V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(Val: V2))
        return LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(Val: V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
    /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
    /// of \p U1 and \p U2), except at the beginning of the recursion where
    /// these are set to nullptr.
    ///
    /// For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///     G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from matching
    /// the additions at level 0, then moves on to the loads (level 1). The
    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
    /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
    /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {

      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);

      // If reached MaxLevel,
      //  or if V1 and V2 are not instructions,
      //  or if they are SPLAT,
      //  or if they are not consecutive,
      //  or if profitable to vectorize loads or extractelements, early return
      //  the current cost.
      auto *I1 = dyn_cast<Instruction>(Val: LHS);
      auto *I2 = dyn_cast<Instruction>(Val: RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all possible
      // operand pairs, and keeping track of the best score.
      // If the operand counts differ, fall back to the plain same-opcode score
      // without recursing.
      if (I1->getNumOperands() != I2->getNumOperands())
        return LookAheadHeuristics::ScoreSameOpcode;
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I: I2)
                             ? I2->getNumOperands()
                             : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(V: OpIdx2))
            continue;
          // Recursively calculate the cost at each level
          int TmpScore =
              getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
                                 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(V: MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };
2742 /// A helper data structure to hold the operands of a vector of instructions.
2743 /// This supports a fixed vector length for all operand vectors.
2744 class VLOperands {
2745 /// For each operand we need (i) the value, and (ii) the opcode that it
2746 /// would be attached to if the expression was in a left-linearized form.
2747 /// This is required to avoid illegal operand reordering.
2748 /// For example:
2749 /// \verbatim
2750 /// 0 Op1
2751 /// |/
2752 /// Op1 Op2 Linearized + Op2
2753 /// \ / ----------> |/
2754 /// - -
2755 ///
2756 /// Op1 - Op2 (0 + Op1) - Op2
2757 /// \endverbatim
2758 ///
2759 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2760 ///
2761 /// Another way to think of this is to track all the operations across the
2762 /// path from the operand all the way to the root of the tree and to
2763 /// calculate the operation that corresponds to this path. For example, the
2764 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2765 /// corresponding operation is a '-' (which matches the one in the
2766 /// linearized tree, as shown above).
2767 ///
2768 /// For lack of a better term, we refer to this operation as Accumulated
2769 /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      /// Construct operand data with all fields set explicitly.
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
      /// APO. It is set to 'true' if 'V' is attached to an inverse operation
      /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
      /// (e.g., Add/Mul)
      bool APO = false;
      /// Helper data for the reordering function: set once this operand has
      /// been selected for a lane during reordering.
      bool IsUsed = false;
    };
2785
    /// During operand reordering, we are trying to select the operand at lane
    /// that matches best with the operand at the neighboring lane. Our
    /// selection is based on the type of value we are looking for. For example,
    /// if the neighboring lane has a load, we need to look for a load that is
    /// accessing a consecutive address. These strategies are summarized in the
    /// 'ReorderingMode' enumerator. The mode drives the candidate selection in
    /// getBestOperand().
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };
2799
    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors, indexed as [operand index][lane].
    SmallVector<OperandDataVec, 4> OpsVec;
    /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
    /// is not IntrinsicInst, ArgSize is User::getNumOperands.
    unsigned ArgSize = 0;

    // Analyses forwarded to LookAheadHeuristics when scoring candidates.
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    /// The enclosing vectorizer instance.
    const BoUpSLP &R;
    /// Enclosing loop, if any; used for loop-invariance checks when matching
    /// constants.
    const Loop *L = nullptr;
2813
    /// \returns the (mutable) operand data at \p OpIdx and \p Lane.
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }
2823
2824 /// Clears the used flag for all entries.
2825 void clearUsed() {
2826 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2827 OpIdx != NumOperands; ++OpIdx)
2828 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2829 ++Lane)
2830 OpsVec[OpIdx][Lane].IsUsed = false;
2831 }
2832
    /// Swap the operand at \p OpIdx1 with the one at \p OpIdx2, within the
    /// single lane \p Lane (other lanes are untouched).
    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
    }
2837
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have power-of-2 unique
    /// elements in the lane, it will be vectorized with higher probability
    /// after removing duplicates. Currently the SLP vectorizer supports only
    /// vectorization of the power-of-2 number of unique scalars.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
      // No bonus for non-instructions, for keeping the operand in place, or
      // for extracts (they are handled by other heuristics).
      if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(Val: IdxLaneV))
        return 0;
      // Collect the unique instruction operands at OpIdx across all other
      // lanes, remembering the first lane each value appears in.
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(Size: getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
        if (!isa<Instruction>(Val: OpIdxLnV))
          return 0;
        Uniques.try_emplace(Key: OpIdxLnV, Args&: Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(Val: IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(Val: OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      // Compare how far the unique-scalar count is from a power of two with
      // the current value vs. with the candidate; prefer the choice that keeps
      // the number of unique scalars closer to a power of two.
      return std::min(a: bit_ceil(Value: UniquesCntWithOpIdxLaneV) -
                             UniquesCntWithOpIdxLaneV,
                      b: UniquesCntWithOpIdxLaneV -
                             bit_floor(Value: UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(Idx: IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(Value: UniquesCntWithIdxLaneV)
                  : bit_ceil(Value: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
2880
2881 /// \param Lane lane of the operands under analysis.
2882 /// \param OpIdx operand index in \p Lane lane we're looking the best
2883 /// candidate for.
2884 /// \param Idx operand index of the current candidate value.
2885 /// \returns The additional score for the scalar which users are all
2886 /// vectorized.
2887 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2888 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
2889 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2890 // Do not care about number of uses for vector-like instructions
2891 // (extractelement/extractvalue with constant indices), they are extracts
2892 // themselves and already externally used. Vectorization of such
2893 // instructions does not add extra extractelement instruction, just may
2894 // remove it.
2895 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
2896 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
2897 return LookAheadHeuristics::ScoreAllUserVectorized;
2898 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
2899 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
2900 return 0;
2901 return R.areAllUsersVectorized(I: IdxLaneI)
2902 ? LookAheadHeuristics::ScoreAllUserVectorized
2903 : 0;
2904 }
2905
    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses. Allows better selection of the
    /// instructions with less external uses. (See getLookAheadScore(), which
    /// multiplies the base score by this factor before adding the
    /// external-use bonus.)
    static const int ScoreScaleFactor = 10;
2910
2911 /// \Returns the look-ahead score, which tells us how much the sub-trees
2912 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2913 /// score. This helps break ties in an informed way when we cannot decide on
2914 /// the order of the operands by just considering the immediate
2915 /// predecessors.
2916 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2917 int Lane, unsigned OpIdx, unsigned Idx,
2918 bool &IsUsed, const SmallBitVector &UsedLanes) {
2919 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2920 LookAheadMaxDepth);
2921 // Keep track of the instruction stack as we recurse into the operands
2922 // during the look-ahead score exploration.
2923 int Score =
2924 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2925 /*CurrLevel=*/1, MainAltOps);
2926 if (Score) {
2927 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2928 if (Score <= -SplatScore) {
2929 // Failed score.
2930 Score = 0;
2931 } else {
2932 Score += SplatScore;
2933 // Scale score to see the difference between different operands
2934 // and similar operands but all vectorized/not all vectorized
2935 // uses. It does not affect actual selection of the best
2936 // compatible operand in general, just allows to select the
2937 // operand with all vectorized uses.
2938 Score *= ScoreScaleFactor;
2939 Score += getExternalUseScore(Lane, OpIdx, Idx);
2940 IsUsed = true;
2941 }
2942 }
2943 return Score;
2944 }
2945
    /// Best defined scores per lanes between the passes. Used to choose the
    /// best operand (with the highest score) between the passes.
    /// The key - {Operand Index, Lane}.
    /// The value - the best score between the passes for the lane and the
    /// operand.
    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
        BestScoresPerLanes;
2953
    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
      // are using the score to differentiate between the two.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      // Seed the best score with the score recorded for this {OpIdx, Lane}
      // slot on a previous reordering pass, if any.
      BestOp.Score =
          BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
              .first->second;

      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly (because of non power-of-2 unique scalars, we may
      // want to reestimate the operands again on the following iterations).
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(OpIdx: Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          // On a strictly better score, take the candidate; on a tie, prefer
          // the operand that is already at position OpIdx.
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          // A loop-invariant non-constant is only accepted if nothing better
          // has been found yet (BestOp.Score == 0).
          if (isa<Constant>(Val: Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Val: Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            // Undefs and non-constants may be reconsidered on later passes.
            if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          // A constant is only accepted as a splat stand-in if nothing better
          // has been found yet (BestOp.Score == 0).
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
3063
    /// Helper for reorderOperandVecs.
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about or
    /// less profitable because it already has the most optimal set of operands.
    unsigned getBestLaneToStartReordering() const {
      // Smallest "number of operands that can move" seen so far.
      unsigned Min = UINT_MAX;
      // Number of same-opcode/same-parent operands of the current best lane.
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about or less profitable because it already has the
      // most optimal set of operands. The first unsigned is a counter for
      // voting, the second unsigned is the counter of lanes with instructions
      // with same/alternate opcodes and same parent basic block.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          // Strictly better lane: restart the voting with this lane only.
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          // Same cost as the current best: cast a vote for this operand-order
          // hash. A first occurrence inserts with a count of 1 and remembers
          // the lane; repeats only bump the counter.
          auto [It, Inserted] =
              HashMap.try_emplace(Key: NumFreeOpsHash.Hash, Args: 1, Args&: Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter. Walk the map in reverse
      // insertion order; with the strict '<' comparison, earlier entries of
      // this reversed walk win ties.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(C&: HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
3117
    /// Data structure that helps to reorder operands.
    struct OperandsOrderData {
      /// The best number of operands with the same APOs, which can be
      /// reordered.
      /// UINT_MAX (the default) marks a lane with no usable data, e.g. an
      /// all-undef lane.
      unsigned NumOfAPOs = UINT_MAX;
      /// Number of operands with the same/alternate instruction opcode and
      /// parent.
      unsigned NumOpsWithSameOpcodeParent = 0;
      /// Hash for the actual operands ordering.
      /// Used to count operands, actually their position id and opcode
      /// value. It is used in the voting mechanism to find the lane with the
      /// least number of operands that can freely move about or less profitable
      /// because it already has the most optimal set of operands. Can be
      /// replaced with SmallVector<unsigned> instead but hash code is faster
      /// and requires less memory.
      unsigned Hash = 0;
    };
    /// \returns the maximum number of operands that are allowed to be reordered
    /// for \p Lane and the number of compatible instructions(with the same
    /// parent/opcode). This is used as a heuristic for selecting the first lane
    /// to start operand reordering.
    /// Returns a default-constructed result (NumOfAPOs == UINT_MAX) when every
    /// operand in the lane is an undef value.
    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();
      // Operands with the same APO can be reordered. We therefore need to count
      // how many of them we have for each APO, like this: Cnt[APO] = x.
      // Since we only have two APOs, namely true and false, we can avoid using
      // a map. Instead we can simply count the number of operands that
      // correspond to one of them (in this case the 'true' APO), and calculate
      // the other by subtracting it from the total number of operands.
      // Operands with the same instruction opcode and parent are more
      // profitable since we don't need to move them in many cases, with a high
      // probability such lane already can be vectorized effectively.
      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;
      Instruction *OpcodeI = nullptr;
      BasicBlock *Parent = nullptr;
      unsigned Hash = 0;
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);
        if (OpData.APO)
          ++CntTrue;
        // Use Boyer-Moore majority voting for finding the majority opcode and
        // the number of times it occurs.
        if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
          if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI) ||
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              // No current candidate: adopt this instruction as the candidate.
              NumOpsWithSameOpcodeParent = 1;
              OpcodeI = I;
              Parent = I->getParent();
            } else {
              // Mismatch cancels one vote for the current candidate.
              --NumOpsWithSameOpcodeParent;
            }
          } else {
            // Same opcode and parent: one more vote for the candidate.
            ++NumOpsWithSameOpcodeParent;
          }
        }
        // Mix the operand position and the value kind into the order hash.
        Hash = hash_combine(
            args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
      }
      if (AllUndefs)
        return {};
      OperandsOrderData Data;
      // The dominant APO count is the number of operands free to reorder.
      Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      Data.Hash = Hash;
      return Data;
    }
3188
3189 /// Go through the instructions in VL and append their operands.
3190 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3191 const InstructionsState &S) {
3192 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3193 assert((empty() || all_of(Operands,
3194 [this](const ValueList &VL) {
3195 return VL.size() == getNumLanes();
3196 })) &&
3197 "Expected same number of lanes");
3198 assert(S.valid() && "InstructionsState is invalid.");
3199 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3200 // arguments to the intrinsic produces the same result.
3201 Instruction *MainOp = S.getMainOp();
3202 unsigned NumOperands = MainOp->getNumOperands();
3203 ArgSize = ::getNumberOfPotentiallyCommutativeOps(I: MainOp);
3204 OpsVec.resize(N: ArgSize);
3205 unsigned NumLanes = VL.size();
3206 for (OperandDataVec &Ops : OpsVec)
3207 Ops.resize(N: NumLanes);
3208 for (unsigned Lane : seq<unsigned>(Size: NumLanes)) {
3209 // Our tree has just 3 nodes: the root and two operands.
3210 // It is therefore trivial to get the APO. We only need to check the
3211 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3212 // operand. The LHS operand of both add and sub is never attached to an
3213 // inversese operation in the linearized form, therefore its APO is
3214 // false. The RHS is true only if V is an inverse operation.
3215
3216 // Since operand reordering is performed on groups of commutative
3217 // operations or alternating sequences (e.g., +, -), we can safely tell
3218 // the inverse operations by checking commutativity.
3219 auto *I = dyn_cast<Instruction>(Val: VL[Lane]);
3220 if (!I && isa<PoisonValue>(Val: VL[Lane])) {
3221 for (unsigned OpIdx : seq<unsigned>(Size: NumOperands))
3222 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3223 continue;
3224 }
3225 bool IsInverseOperation = false;
3226 if (S.isCopyableElement(V: VL[Lane])) {
3227 // The value is a copyable element.
3228 IsInverseOperation =
3229 !isCommutative(I: MainOp, ValWithUses: VL[Lane], /*IsCopyable=*/true);
3230 } else {
3231 assert(I && "Expected instruction");
3232 auto [SelectedOp, Ops] = convertTo(I, S);
3233 // We cannot check commutativity by the converted instruction
3234 // (SelectedOp) because isCommutative also examines def-use
3235 // relationships.
3236 IsInverseOperation = !isCommutative(I: SelectedOp, ValWithUses: I);
3237 }
3238 for (unsigned OpIdx : seq<unsigned>(Size: ArgSize)) {
3239 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3240 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3241 }
3242 }
3243 }
3244
    /// \returns the number of operands tracked per lane (ArgSize, i.e. only
    /// the potentially commutative operands appended by appendOperands).
    unsigned getNumOperands() const { return ArgSize; }
3247
    /// \returns the number of lanes (taken from operand 0; appendOperands
    /// resizes every operand to the same number of lanes).
    unsigned getNumLanes() const { return OpsVec[0].size(); }
3250
3251 /// \returns the operand value at \p OpIdx and \p Lane.
3252 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3253 return getData(OpIdx, Lane).V;
3254 }
3255
    /// \returns true if the data structure is empty (no operands appended yet
    /// or clear() was called).
    bool empty() const { return OpsVec.empty(); }
3258
    /// Clears all appended operand data.
    void clear() { OpsVec.clear(); }
3261
    /// \Returns true if there are enough operands identical to \p Op to fill
    /// the whole vector (it is mixed with constants or loop invariant values).
    /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      // Small number of loads - try load matching.
      if (isa<LoadInst>(Val: Op) && getNumLanes() == 2 && getNumOperands() == 2)
        return false;
      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(V: Op);
      // Number of other lanes in which Op itself was found.
      unsigned Cnt = 0;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
        if (Ln == Lane)
          continue;
        // This is set to true if we found a candidate for broadcast at Lane.
        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
          // Only same-APO operands not claimed by an earlier match qualify.
          if (Data.APO != OpAPO || Data.IsUsed)
            continue;
          Value *OpILane = getValue(OpIdx: OpI, Lane);
          bool IsConstantOp = isa<Constant>(Val: OpILane);
          // Consider the broadcast candidate if:
          // 1. Same value is found in one of the operands.
          if (Data.V == Op ||
              // 2. The operand in the given lane is not constant but there is a
              // constant operand in another lane (which can be moved to the
              // given lane). In this case we can represent it as a simple
              // permutation of constant and broadcast.
              (!IsConstantOp &&
               ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
                // 2.1. If we have only 2 lanes, need to check that value in the
                // next lane does not build same opcode sequence.
                (Lns == 2 &&
                 !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI) &&
                 isa<Constant>(Val: Data.V)))) ||
              // 3. The operand in the current lane is loop invariant (can be
              // hoisted out) and another operand is also a loop invariant
              // (though not a constant). In this case the whole vector can be
              // hoisted out.
              // FIXME: need to teach the cost model about this case for better
              // estimation.
              (IsInvariant && !isa<Constant>(Val: Data.V) &&
               !getSameOpcode(VL: {Op, Data.V}, TLI) &&
               L->isLoopInvariant(V: Data.V))) {
            FoundCandidate = true;
            // Mark the slot as consumed only for an exact value match.
            Data.IsUsed = Data.V == Op;
            if (Data.V == Op)
              ++Cnt;
            break;
          }
        }
        if (!FoundCandidate)
          return false;
      }
      // With 2 lanes any per-lane candidate suffices; otherwise require the
      // exact value in more than one other lane.
      return getNumLanes() == 2 || Cnt > 1;
    }
3320
3321 /// Checks if there is at least single compatible operand in lanes other
3322 /// than \p Lane, compatible with the operand \p Op.
3323 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3324 assert(Op == getValue(OpIdx, Lane) &&
3325 "Op is expected to be getValue(OpIdx, Lane).");
3326 bool OpAPO = getData(OpIdx, Lane).APO;
3327 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3328 if (Ln == Lane)
3329 continue;
3330 if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
3331 const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
3332 if (Data.APO != OpAPO || Data.IsUsed)
3333 return true;
3334 Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
3335 return (L && L->isLoopInvariant(V: OpILn)) ||
3336 (getSameOpcode(VL: {Op, OpILn}, TLI) &&
3337 allSameBlock(VL: {Op, OpILn}));
3338 }))
3339 return true;
3340 }
3341 return false;
3342 }
3343
  public:
    /// Initialize with all the operands of the instruction vector \p RootVL.
    /// \p Operands holds one value list per operand index, \p S the
    /// main/alternate opcode state for \p RootVL. The loop (if any) that
    /// contains the main instruction is cached in L for the loop-invariance
    /// checks performed during reordering.
    VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
               const InstructionsState &S, const BoUpSLP &R)
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(BB: S.getMainOp()->getParent())) {
      // Append all the operands of RootVL.
      appendOperands(VL: RootVL, Operands, S);
    }
3353
3354 /// \Returns a value vector with the operands across all lanes for the
3355 /// opearnd at \p OpIdx.
3356 ValueList getVL(unsigned OpIdx) const {
3357 ValueList OpVL(OpsVec[OpIdx].size());
3358 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3359 "Expected same num of lanes across all operands");
3360 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3361 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3362 return OpVL;
3363 }
3364
    // Performs operand reordering for 2 or more operands.
    // The original operands are in OrigOps[OpIdx][Lane].
    // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
    void reorder() {
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();
      // Each operand has its own mode. We are using this mode to help us select
      // the instructions for each lane, so that they match best with the ones
      // we have selected so far.
      SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

      // This is a greedy single-pass algorithm. We are going over each lane
      // once and deciding on the best order right away with no back-tracking.
      // However, in order to increase its effectiveness, we start with the lane
      // that has operands that can move the least. For example, given the
      // following lanes:
      //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
      //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
      //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
      //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
      // we will start at Lane 1, since the operands of the subtraction cannot
      // be reordered. Then we will visit the rest of the lanes in a circular
      // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

      // Find the first lane that we will start our search from.
      unsigned FirstLane = getBestLaneToStartReordering();

      // Initialize the modes.
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
        // Keep track if we have instructions with all the same opcode on one
        // side.
        if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
          // Check if OpLane0 should be broadcast.
          if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
              !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(Val: OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(Val: OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(Val: OpLane0)) {
          // Our best hope is a Splat. It may save some cost in some cases.
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        } else {
          llvm_unreachable("Unexpected value kind.");
        }
      }

      // Check that we don't have same operands. No need to reorder if operands
      // are just perfect diamond or shuffled diamond match. Do not do it only
      // for possible broadcasts or non-power of 2 number of scalars (just for
      // now).
      auto &&SkipReordering = [this]() {
        SmallPtrSet<Value *, 4> UniqueValues;
        ArrayRef<OperandData> Op0 = OpsVec.front();
        for (const OperandData &Data : Op0)
          UniqueValues.insert(Ptr: Data.V);
        // Reordering can be skipped only if every other operand list is a
        // (possibly permuted) copy of operand 0's values.
        for (ArrayRef<OperandData> Op :
             ArrayRef(OpsVec).slice(N: 1, M: getNumOperands() - 1)) {
          if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
                return !UniqueValues.contains(Ptr: Data.V);
              }))
            return false;
        }
        // TODO: Check if we can remove a check for non-power-2 number of
        // scalars after full support of non-power-2 vectorization.
        return UniqueValues.size() != 2 &&
               hasFullVectorsOrPowerOf2(TTI: *R.TTI, Ty: Op0.front().V->getType(),
                                        Sz: UniqueValues.size());
      };

      // If the initial strategy fails for any of the operand indexes, then we
      // perform reordering again in a second pass. This helps avoid assigning
      // high priority to the failed strategy, and should improve reordering for
      // the non-failed operand indexes.
      for (int Pass = 0; Pass != 2; ++Pass) {
        // Check if no need to reorder operands since they're are perfect or
        // shuffled diamond match.
        // Need to do it to avoid extra external use cost counting for
        // shuffled matches, which may cause regressions.
        if (SkipReordering())
          break;
        // Skip the second pass if the first pass did not fail.
        bool StrategyFailed = false;
        // Mark all operand data as free to use.
        clearUsed();
        // We keep the original operand order for the FirstLane, so reorder the
        // rest of the lanes. We are visiting the nodes in a circular fashion,
        // using FirstLane as the center point and increasing the radius
        // distance.
        // Seed the per-operand look-ahead state with the first lane's values;
        // an alternate-opcode value may be appended later (see below).
        SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
        for (unsigned I = 0; I < NumOperands; ++I)
          MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);

        // Tracks which lanes have been visited/assigned so far.
        SmallBitVector UsedLanes(NumLanes);
        UsedLanes.set(FirstLane);
        for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
          // Visit the lane on the right and then the lane on the left.
          for (int Direction : {+1, -1}) {
            int Lane = FirstLane + Direction * Distance;
            if (Lane < 0 || Lane >= (int)NumLanes)
              continue;
            UsedLanes.set(Lane);
            // The neighbour lane we try to match against.
            int LastLane = Lane - Direction;
            assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                   "Out of bounds");
            // Look for a good match for each operand.
            for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
              // Search for the operand that matches SortedOps[OpIdx][Lane-1].
              std::optional<unsigned> BestIdx =
                  getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                                 MainAltOps: MainAltOps[OpIdx], UsedLanes);
              // By not selecting a value, we allow the operands that follow to
              // select a better matching value. We will get a non-null value in
              // the next run of getBestOperand().
              if (BestIdx) {
                // Swap the current operand with the one returned by
                // getBestOperand().
                swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
              } else {
                // Enable the second pass.
                StrategyFailed = true;
              }
              // Try to get the alternate opcode and follow it during analysis.
              if (MainAltOps[OpIdx].size() != 2) {
                OperandData &AltOp = getData(OpIdx, Lane);
                InstructionsState OpS =
                    getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
                if (OpS && OpS.isAltShuffle())
                  MainAltOps[OpIdx].push_back(Elt: AltOp.V);
              }
            }
          }
        }
        // Skip second pass if the strategy did not fail.
        if (!StrategyFailed)
          break;
      }
    }
3507
3508#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3509 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3510 switch (RMode) {
3511 case ReorderingMode::Load:
3512 return "Load";
3513 case ReorderingMode::Opcode:
3514 return "Opcode";
3515 case ReorderingMode::Constant:
3516 return "Constant";
3517 case ReorderingMode::Splat:
3518 return "Splat";
3519 case ReorderingMode::Failed:
3520 return "Failed";
3521 }
3522 llvm_unreachable("Unimplemented Reordering Type");
3523 }
3524
    /// Prints the name of the reordering mode \p RMode to \p OS.
    LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
                                                   raw_ostream &OS) {
      return OS << getModeStr(RMode);
    }
3529
    /// Debug print of \p RMode to the debug stream.
    LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
      printMode(RMode, dbgs());
    }
3534
    /// Stream operator, prints the mode name via printMode.
    friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
      return printMode(RMode, OS);
    }
3538
3539 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3540 const unsigned Indent = 2;
3541 unsigned Cnt = 0;
3542 for (const OperandDataVec &OpDataVec : OpsVec) {
3543 OS << "Operand " << Cnt++ << "\n";
3544 for (const OperandData &OpData : OpDataVec) {
3545 OS.indent(Indent) << "{";
3546 if (Value *V = OpData.V)
3547 OS << *V;
3548 else
3549 OS << "null";
3550 OS << ", APO:" << OpData.APO << "}\n";
3551 }
3552 OS << "\n";
3553 }
3554 return OS;
3555 }
3556
    /// Debug print of all operands to the debug stream.
    LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3559#endif
3560 };
3561
3562 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3563 /// for a pair which have highest score deemed to have best chance to form
3564 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3565 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
3566 /// of the cost, considered to be good enough score.
3567 std::pair<std::optional<int>, int>
3568 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3569 int Limit = LookAheadHeuristics::ScoreFail) const {
3570 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3571 RootLookAheadMaxDepth);
3572 int BestScore = Limit;
3573 std::optional<int> Index;
3574 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
3575 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
3576 RHS: Candidates[I].second,
3577 /*U1=*/nullptr, /*U2=*/nullptr,
3578 /*CurrLevel=*/1, MainAltOps: {});
3579 if (Score > BestScore) {
3580 BestScore = Score;
3581 Index = I;
3582 }
3583 }
3584 return std::make_pair(x&: Index, y&: BestScore);
3585 }
3586
  /// Checks if the instruction is marked for deletion (was passed to
  /// eraseInstruction).
  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
3589
  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// Note: this only records the instruction in DeletedInstructions; the
  /// instruction itself is not touched here.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(V: I);
  }
3596
  /// Remove instructions from the parent function and clear the operands of \p
  /// DeadVals instructions, marking for deletion trivially dead operands.
  /// \param DeadVals values to remove; each must be castable to Instruction.
  /// \param VectorValuesAndScales vectorized values that must not be deleted
  ///        as trivially dead even if their use count drops to zero here.
  template <typename T>
  void removeInstructionsAndOperands(
      ArrayRef<T *> DeadVals,
      ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
          VectorValuesAndScales) {
    SmallVector<WeakTrackingVH> DeadInsts;
    // Phase 1: mark all dead values as deleted up front so the checks below
    // (and isDeleted()) already see them as gone.
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      eraseInstruction(I);
    }
    DenseSet<Value *> Processed;
    // Phase 2: salvage debug info and collect single-user operands that become
    // trivially dead once their user is removed, then drop all references.
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)
        continue;
      auto *I = cast<Instruction>(V);
      salvageDebugInfo(*I);
      ArrayRef<TreeEntry *> Entries = getTreeEntries(V: I);
      for (Use &U : I->operands()) {
        // Skip operands that are the vectorized value of one of this
        // instruction's own tree entries - those are still needed.
        if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
            OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
            wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;
             })))
          DeadInsts.push_back(Elt: OpI);
      }
      I->dropAllReferences();
    }
    // Phase 3: detach the dead instructions from their blocks and drop them
    // from ScalarEvolution's caches.
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())
        continue;
      assert((I->use_empty() || all_of(I->uses(),
                                       [&](Use &U) {
                                         return isDeleted(
                                             cast<Instruction>(U.getUser()));
                                       })) &&
             "trying to erase instruction with users.");
      I->removeFromParent();
      SE->forgetValue(V: I);
    }
    // Process the dead instruction list until empty.
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      Instruction *VI = cast_or_null<Instruction>(Val: V);
      // WeakTrackingVH entries may have been nulled or RAUW'ed meanwhile.
      if (!VI || !VI->getParent())
        continue;
      assert(isInstructionTriviallyDead(VI, TLI) &&
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      // Don't lose the debug info while deleting the instructions.
      salvageDebugInfo(I&: *VI);

      // Null out all of the instruction's operands to see if any operand
      // becomes dead as we go.
      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();
        if (!OpV)
          continue;
        OpU.set(nullptr);

        if (!OpV->use_empty())
          continue;

        // If the operand is an instruction that became dead as we nulled out
        // the operand, and if it is 'trivially' dead, delete it in a future
        // loop iteration. Vector values recorded in VectorValuesAndScales are
        // kept alive.
        if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
          if (!DeletedInstructions.contains(V: OpI) &&
              (!OpI->getType()->isVectorTy() ||
               none_of(
                   VectorValuesAndScales,
                   [&](const std::tuple<WeakTrackingVH, unsigned, bool, bool>
                           &V) { return std::get<0>(t: V) == OpI; })) &&
              isInstructionTriviallyDead(I: OpI, TLI))
            DeadInsts.push_back(Elt: OpI);
      }

      VI->removeFromParent();
      eraseInstruction(I: VI);
      SE->forgetValue(V: VI);
    }
  }
3683
  /// Checks if the instruction was already analyzed for being possible
  /// reduction root.
  bool isAnalyzedReductionRoot(Instruction *I) const {
    return AnalyzedReductionsRoots.count(Ptr: I);
  }
  /// Register given instruction as already analyzed for being possible
  /// reduction root.
  void analyzedReductionRoot(Instruction *I) {
    AnalyzedReductionsRoots.insert(Ptr: I);
  }
  /// Checks if the provided list of reduced values was checked already for
  /// vectorization. The list is keyed by its hash, not by identity.
  bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
    return AnalyzedReductionVals.contains(V: hash_value(S: VL));
  }
  /// Adds the list of reduced values to list of already checked values for the
  /// vectorization. Only the hash of the list is stored.
  void analyzedReductionVals(ArrayRef<Value *> VL) {
    AnalyzedReductionVals.insert(V: hash_value(S: VL));
  }
  /// Clear the list of the analyzed reduction root instructions, the analyzed
  /// reduction value lists and the analyzed min-bitwidth values.
  void clearReductionData() {
    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();
  }
  /// Checks if any of the given values is gathered in one of the nodes.
  bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
    return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
  }
  /// Checks if the given value is gathered in one of the nodes.
  bool isGathered(const Value *V) const {
    return MustGather.contains(Ptr: V);
  }
  /// Checks if the specified value was not scheduled.
  bool isNotScheduled(const Value *V) const {
    return NonScheduledFirst.contains(Ptr: V);
  }
3722
3723 /// Check if the value is vectorized in the tree.
3724 bool isVectorized(const Value *V) const {
3725 assert(V && "V cannot be nullptr.");
3726 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
3727 return any_of(Range&: Entries, P: [&](const TreeEntry *E) {
3728 return !DeletedNodes.contains(Ptr: E) && !TransformedToGatherNodes.contains(Val: E);
3729 });
3730 }
3731
3732 /// Checks if it is legal and profitable to build SplitVectorize node for the
3733 /// given \p VL.
3734 /// \param Op1 first homogeneous scalars.
3735 /// \param Op2 second homogeneous scalars.
3736 /// \param ReorderIndices indices to reorder the scalars.
3737 /// \returns true if the node was successfully built.
3738 bool canBuildSplitNode(ArrayRef<Value *> VL,
3739 const InstructionsState &LocalState,
3740 SmallVectorImpl<Value *> &Op1,
3741 SmallVectorImpl<Value *> &Op2,
3742 OrdersType &ReorderIndices) const;
3743
3744 ~BoUpSLP();
3745
3746private:
3747 /// Determine if a node \p E in can be demoted to a smaller type with a
3748 /// truncation. We collect the entries that will be demoted in ToDemote.
3749 /// \param E Node for analysis
3750 /// \param ToDemote indices of the nodes to be demoted.
3751 bool collectValuesToDemote(
3752 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3753 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3754 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3755 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3756
3757 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3758 /// UserTE, which allow reordering (i.e. the operands can be reordered because
3759 /// they have only one user and reordarable).
3760 /// \param ReorderableGathers List of all gather nodes that require reordering
3761 /// (e.g., gather of extractlements or partially vectorizable loads).
3762 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3763 /// reordering, subset of \p NonVectorized.
3764 void buildReorderableOperands(
3765 TreeEntry *UserTE,
3766 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3767 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3768 SmallVectorImpl<TreeEntry *> &GatherOps);
3769
3770 /// Checks if the given \p TE is a gather node with clustered reused scalars
3771 /// and reorders it per given \p Mask.
3772 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3773
3774 /// Checks if all users of \p I are the part of the vectorization tree.
3775 bool areAllUsersVectorized(
3776 Instruction *I,
3777 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3778
3779 /// Return information about the vector formed for the specified index
3780 /// of a vector of (the same) instruction.
3781 TargetTransformInfo::OperandValueInfo
3782 getOperandInfo(ArrayRef<Value *> Ops) const;
3783
3784 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3785 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3786 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3787 return const_cast<TreeEntry *>(
3788 getOperandEntry(E: const_cast<const TreeEntry *>(E), Idx));
3789 }
3790
3791 /// Gets the root instruction for the given node. If the node is a strided
3792 /// load/store node with the reverse order, the root instruction is the last
3793 /// one.
3794 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3795
3796 /// \returns Cast context for the given graph node.
3797 TargetTransformInfo::CastContextHint
3798 getCastContextHint(const TreeEntry &TE) const;
3799
3800 /// \returns the scale of the given tree entry to the loop iteration.
3801 /// \p Scalar is the scalar value from the entry, if using the parent for the
3802 /// external use.
3803 /// \p U is the user of the vectorized value from the entry, if using the
3804 /// parent for the external use.
3805 unsigned getScaleToLoopIterations(const TreeEntry &TE,
3806 Value *Scalar = nullptr,
3807 Instruction *U = nullptr);
3808
3809 /// Get the loop nest for the given loop \p L.
3810 ArrayRef<const Loop *> getLoopNest(const Loop *L);
3811
3812 /// \returns the cost of the vectorizable entry.
3813 InstructionCost getEntryCost(const TreeEntry *E,
3814 ArrayRef<Value *> VectorizedVals,
3815 SmallPtrSetImpl<Value *> &CheckedExtracts);
3816
3817 /// Estimates spill/reload cost from vector register pressure for \p E at the
3818 /// point of emitting its vector result type \p FinalVecTy.
3819 InstructionCost getVectorSpillReloadCost(const TreeEntry *E,
3820 VectorType *VecTy,
3821 VectorType *FinalVecTy,
3822 TTI::TargetCostKind CostKind) const;
3823
3824 /// This is the recursive part of buildTree.
3825 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3826 unsigned InterleaveFactor = 0);
3827
3828 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3829 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3830 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3831 /// returns false, setting \p CurrentOrder to either an empty vector or a
3832 /// non-identity permutation that allows to reuse extract instructions.
3833 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3834 /// extract order.
3835 bool canReuseExtract(ArrayRef<Value *> VL,
3836 SmallVectorImpl<unsigned> &CurrentOrder,
3837 bool ResizeAllowed = false) const;
3838
3839 /// Vectorize a single entry in the tree.
3840 Value *vectorizeTree(TreeEntry *E);
3841
3842 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3843 /// \p E.
3844 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3845
3846 /// Create a new vector from a list of scalar values. Produces a sequence
3847 /// which exploits values reused across lanes, and arranges the inserts
3848 /// for ease of later optimization.
3849 template <typename BVTy, typename ResTy, typename... Args>
3850 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3851
3852 /// Create a new vector from a list of scalar values. Produces a sequence
3853 /// which exploits values reused across lanes, and arranges the inserts
3854 /// for ease of later optimization.
3855 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3856
3857 /// Returns the instruction in the bundle, which can be used as a base point
3858 /// for scheduling. Usually it is the last instruction in the bundle, except
3859 /// for the case when all operands are external (in this case, it is the first
3860 /// instruction in the list).
3861 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3862
3863 /// Tries to find extractelement instructions with constant indices from fixed
3864 /// vector type and gather such instructions into a bunch, which highly likely
3865 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3866 /// was successful, the matched scalars are replaced by poison values in \p VL
3867 /// for future analysis.
3868 std::optional<TargetTransformInfo::ShuffleKind>
3869 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3870 SmallVectorImpl<int> &Mask) const;
3871
3872 /// Tries to find extractelement instructions with constant indices from fixed
3873 /// vector type and gather such instructions into a bunch, which highly likely
3874 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3875 /// was successful, the matched scalars are replaced by poison values in \p VL
3876 /// for future analysis.
3877 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3878 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3879 SmallVectorImpl<int> &Mask,
3880 unsigned NumParts) const;
3881
3882 /// Checks if the gathered \p VL can be represented as a single register
3883 /// shuffle(s) of previous tree entries.
3884 /// \param TE Tree entry checked for permutation.
3885 /// \param VL List of scalars (a subset of the TE scalar), checked for
3886 /// permutations. Must form single-register vector.
3887 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3888 /// commands to build the mask using the original vector value, without
3889 /// relying on the potential reordering.
3890 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3891 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3892 std::optional<TargetTransformInfo::ShuffleKind>
3893 isGatherShuffledSingleRegisterEntry(
3894 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3895 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3896 bool ForOrder);
3897
3898 /// Checks if the gathered \p VL can be represented as multi-register
3899 /// shuffle(s) of previous tree entries.
3900 /// \param TE Tree entry checked for permutation.
3901 /// \param VL List of scalars (a subset of the TE scalar), checked for
3902 /// permutations.
3903 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3904 /// commands to build the mask using the original vector value, without
3905 /// relying on the potential reordering.
3906 /// \returns per-register series of ShuffleKind, if gathered values can be
3907 /// represented as shuffles of previous tree entries. \p Mask is filled with
3908 /// the shuffle mask (also on per-register base).
3909 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3910 isGatherShuffledEntry(
3911 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3912 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3913 unsigned NumParts, bool ForOrder = false);
3914
3915 /// \returns the cost of gathering (inserting) the values in \p VL into a
3916 /// vector.
3917 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3918 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3919 Type *ScalarTy) const;
3920
3921 /// Set the Builder insert point to one after the last instruction in
3922 /// the bundle
3923 void setInsertPointAfterBundle(const TreeEntry *E);
3924
3925 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3926 /// specified, the starting vector value is poison.
3927 Value *
3928 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3929 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3930
3931 /// \returns whether the VectorizableTree is fully vectorizable and will
3932 /// be beneficial even the tree height is tiny.
3933 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3934
3935 /// Run through the list of all gathered loads in the graph and try to find
3936 /// vector loads/masked gathers instead of regular gathers. Later these loads
  /// are reshuffled to build final gathered nodes.
3938 void tryToVectorizeGatheredLoads(
3939 const SmallMapVector<
3940 std::tuple<BasicBlock *, Value *, Type *>,
3941 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3942 &GatheredLoads);
3943
3944 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3945 /// users of \p TE and collects the stores. It returns the map from the store
3946 /// pointers to the collected stores.
3947 SmallVector<SmallVector<StoreInst *>>
3948 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3949
3950 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3951 /// stores in \p StoresVec can form a vector instruction. If so it returns
3952 /// true and populates \p ReorderIndices with the shuffle indices of the
3953 /// stores when compared to the sorted vector.
3954 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3955 OrdersType &ReorderIndices) const;
3956
3957 /// Iterates through the users of \p TE, looking for scalar stores that can be
3958 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3959 /// their order and builds an order index vector for each store bundle. It
3960 /// returns all these order vectors found.
3961 /// We run this after the tree has formed, otherwise we may come across user
3962 /// instructions that are not yet in the tree.
3963 SmallVector<OrdersType, 1>
3964 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3965
3966 /// Tries to reorder the gathering node for better vectorization
3967 /// opportunities.
3968 void reorderGatherNode(TreeEntry &TE);
3969
3970 /// Checks if the tree represents disjoint or reduction of shl(zext, (0, 8,
3971 /// .., 56))-like pattern.
  /// If the int shifts are unique, also strided, but not ordered, sets
  /// \p Order.
3973 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
3974 /// If the root nodes are loads, sets \p ForLoads to true.
3975 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
3976 bool &ForLoads) const;
3977
3978 /// Checks if the \p SelectTE matches zext+selects, which can be inversed for
3979 /// better codegen in case like zext (icmp ne), select (icmp eq), ....
3980 bool matchesInversedZExtSelect(
3981 const TreeEntry &SelectTE,
3982 SmallVectorImpl<unsigned> &InversedCmpsIndices) const;
3983
  /// Checks if the tree is a reduction of bit selects, like select %cmp, <1,
  /// 2, 4, 8, ..>, zeroinitializer, which can be reduced just to a bitcast of
  /// %cmp to int.
3987 bool matchesSelectOfBits(const TreeEntry &SelectTE) const;
3988
  /// Represents a single node of the SLP graph: a bundle of scalars that is
  /// either vectorized or gathered, together with its operands, ordering and
  /// reuse information.
  class TreeEntry {
  public:
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns Common mask for reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      // Split nodes do not use reorder/reuse masks.
      if (State == TreeEntry::SplitVectorize)
        return {};
      SmallVector<int> Mask;
      inversePermutation(Indices: ReorderIndices, Mask);
      ::addMask(Mask, SubMask: ReuseShuffleIndices);
      return Mask;
    }

    /// \returns The mask for split nodes.
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      unsigned CommonVF = std::max<unsigned>(
          a: CombinedEntriesWithIndices.back().second,
          b: Scalars.size() - CombinedEntriesWithIndices.back().second);
      const unsigned Scale = getNumElements(Ty: Scalars.front()->getType());
      CommonVF *= Scale;
      SmallVector<int> Mask(getVectorFactor() * Scale, PoisonMaskElem);
      for (auto [Idx, I] : enumerate(First: ReorderIndices)) {
        for (unsigned K : seq<unsigned>(Size: Scale)) {
          // Lanes at/after the split point come from the second subnode, which
          // starts at offset CommonVF in the combined vector.
          Mask[Scale * I + K] =
              Scale * Idx + K +
              (Idx >= CombinedEntriesWithIndices.back().second
                   ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
                   : 0);
        }
      }
      return Mask;
    }

    /// Updates (reorders) SplitVectorize node according to the given mask \p
    /// Mask and order \p MaskOrder.
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        // With an empty/mismatched mask, compare the scalars directly.
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
                          binary_pred: [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(Val: V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered, still can
        // treat the vector as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(Indices: ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, SubMask: ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        // Greedily match TE's I-th operand against any not-yet-used operand of
        // this entry (operands may appear in a different order).
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(Idx: K))
            continue;
          if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those, used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    };

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// Do we need to gather this sequence or vectorize it
    /// (either with vector instruction or with scatter/gather
    /// intrinsics for store/load)?
    enum EntryState {
      Vectorize,         ///< The node is regularly vectorized.
      ScatterVectorize,  ///< Masked scatter/gather node.
      StridedVectorize,  ///< Strided loads (and stores)
      CompressVectorize, ///< (Masked) load with compress.
      NeedToGather,      ///< Gather/buildvector node.
      CombinedVectorize, ///< Vectorized node, combined with its user into more
                         ///< complex node like select/cmp to minmax, mul/add to
                         ///< fma, etc. Must be used for the following nodes in
                         ///< the pattern, not the very first one.
      SplitVectorize,    ///< Splits the node into 2 subnodes, vectorizes them
                         ///< independently and then combines back.
    };
    /// Vectorization state of this node, see \a EntryState.
    EntryState State;

    /// List of combined opcodes supported by the vectorizer.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
      FMulAdd,
      ReducedBitcast,
      ReducedBitcastBSwap,
      ReducedBitcastLoads,
      ReducedBitcastBSwapLoads,
      ReducedCmpBitcast,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
    /// to be a pointer and needs to be able to initialize the child iterator.
    /// Thus we need a reference back to the container to translate the indices
    /// to entries.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry.
    EdgeInfo UserTreeIndex;

    /// The index of this treeEntry in VectorizableTree.
    unsigned Idx = 0;

    /// For gather/buildvector/alt opcode nodes, which are combined from
    /// other nodes as a series of insertvector instructions.
    SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;

  private:
    /// The operands of each instruction in each lane Operands[op_index][lane].
    /// Note: This helps avoid the replication of the code that performs the
    /// reordering of operands during buildTreeRec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

    /// Copyable elements of the entry node.
    SmallPtrSet<const Value *, 4> CopyableElements;

    /// MainOp and AltOp are recorded inside. S should be obtained from
    /// newTreeEntry.
    InstructionsState S = InstructionsState::invalid();

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

    /// True if the node does not require scheduling.
    bool DoesNotNeedToSchedule = false;

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(N: OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(N: OpVL.size());
      copy(Range&: OpVL, Out: Operands[OpIdx].begin());
    }

    /// Maps values to their lanes in the node.
    /// Lazily populated cache for \a findLaneForValue.
    mutable SmallDenseMap<Value *, unsigned> ValueToLane;

  public:
    /// Returns interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Marks the node as one that does not require scheduling.
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    /// Returns true if the node is marked as one that does not require
    /// scheduling.
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

    /// Set this bundle's operands from \p Operands.
    void setOperands(ArrayRef<ValueList> Operands) {
      for (unsigned I : seq<unsigned>(Size: Operands.size()))
        setOperand(OpIdx: I, OpVL: Operands[I]);
    }

    /// Reorders operands of the node to the given mask \p Mask.
    void reorderOperands(ArrayRef<int> Mask) {
      for (ValueList &Operand : Operands)
        reorderScalars(Scalars&: Operand, Mask);
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \return the single \p OpIdx operand.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as the recorded main/alternate operation, the key is
    /// \p Op itself. Otherwise the key is the main operation.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Val: Op);
      if (I && getMatchingMainOpOrAltOp(I))
        return Op;
      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    /// Add \p V to the list of copyable elements.
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(Ptr: V);
    }

    /// Returns true if \p V is a copyable element.
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(Ptr: V);
    }

    /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    /// Returns the state of the operations.
    const InstructionsState &getOperations() const { return S; }

    /// When ReuseReorderShuffleIndices is empty it just returns position of \p
    /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
    unsigned findLaneForValue(Value *V) const {
      // Results are cached in ValueToLane; a freshly inserted entry starts at
      // the out-of-range value getVectorFactor() and is filled in below.
      auto Res = ValueToLane.try_emplace(Key: V, Args: getVectorFactor());
      if (!Res.second)
        return Res.first->second;
      unsigned &FoundLane = Res.first->getSecond();
      for (auto *It = find(Range: Scalars, Val: V), *End = Scalars.end(); It != End;
           std::advance(i&: It, n: 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(first: Scalars.begin(), last: It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        // Remap the reordered lane through the reuse indices, if present.
        if (auto *RIt = find(Range: ReuseShuffleIndices, Val: FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(first: ReuseShuffleIndices.begin(), last: RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }

    /// Build a shuffle mask for graph entry which represents a merge of main
    /// and alternate operations.
    void
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    /// Return true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Value: Scalars.size());
      return IsNonPowerOf2;
    }

    /// Return true if this is a node, which tries to vectorize number of
    /// elements, forming whole vectors.
    bool
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, Ty: getValueType(V: Scalars.front()), Sz: Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    /// \returns the scalar at position \p Idx after applying the inverse of
    /// the reorder mask, if any.
    Value *getOrdered(unsigned Idx) const {
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(Indices: ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        break;
      }
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndex: ";
      if (UserTreeIndex)
        dbgs() << UserTreeIndex;
      else
        dbgs() << "<invalid>";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };
4428
4429#ifndef NDEBUG
4430 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4431 InstructionCost VecCost, InstructionCost ScalarCost,
4432 StringRef Banner) const {
4433 dbgs() << "SLP: " << Banner << ":\n";
4434 E->dump();
4435 dbgs() << "SLP: Costs:\n";
4436 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4437 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4438 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4439 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4440 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4441 }
4442#endif
4443
4444 /// Create a new gather TreeEntry
4445 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4446 const InstructionsState &S,
4447 const EdgeInfo &UserTreeIdx,
4448 ArrayRef<int> ReuseShuffleIndices = {}) {
4449 auto Invalid = ScheduleBundle::invalid();
4450 return newTreeEntry(VL, Bundle&: Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4451 }
4452
4453 /// Create a new VectorizableTree entry.
4454 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4455 const InstructionsState &S,
4456 const EdgeInfo &UserTreeIdx,
4457 ArrayRef<int> ReuseShuffleIndices = {},
4458 ArrayRef<unsigned> ReorderIndices = {},
4459 unsigned InterleaveFactor = 0) {
4460 TreeEntry::EntryState EntryState =
4461 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4462 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4463 ReuseShuffleIndices, ReorderIndices);
4464 if (E && InterleaveFactor > 0)
4465 E->setInterleave(InterleaveFactor);
4466 return E;
4467 }
4468
  /// Create a new VectorizableTree entry with the explicitly given state \p
  /// EntryState, register its scalars in the scalar-to-entry maps and link it
  /// to its user via \p UserTreeIdx.
  /// \returns the new entry, or nullptr if no entry must be created (the root
  /// gather node for already gathered loads).
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(Elt: std::make_unique<TreeEntry>(args&: VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    // Remember which entry implements the (user, edge) operand pair.
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          Key: std::make_pair(x: UserTreeIdx.UserTE, y: UserTreeIdx.EdgeIdx), Args&: Last);
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
    assert(
        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
         ReuseShuffleIndices.empty()) &&
        "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(in_start: ReuseShuffleIndices.begin(),
                                     in_end: ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(in_start: VL.begin(), in_end: VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(NumElts: VL.size(), Elt: nullptr);
      transform(Range&: ReorderIndices, d_first: Last->Scalars.begin(),
                F: [VL](unsigned Idx) -> Value * {
                  if (Idx >= VL.size())
                    return UndefValue::get(T: VL.front()->getType());
                  return VL[Idx];
                });
      InstructionsState S = getSameOpcode(VL: Last->Scalars, TLI: *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(in_start: ReorderIndices.begin(), in_end: ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      // Associate each instruction with this split node, at most once per
      // distinct value.
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(Val: V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(Val: V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
          (void)Processed.insert(Ptr: V);
        } else if (Processed.insert(Ptr: V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Elt: Last);
        }
      }
    } else if (!Last->isGather()) {
      // Vectorized node: decide whether it needs scheduling and register its
      // scalars in ScalarToTreeEntries.
      if (isa<PHINode>(Val: S.getMainOp()) ||
          isVectorLikeInstWithConstOps(V: S.getMainOp()) ||
          (!S.areInstructionsWithCopyableElements() &&
           doesNotNeedToSchedule(VL)) ||
          all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(Val: V))
          continue;
        // Copyable elements are tracked separately, not as vectorized scalars.
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          continue;
        }
        auto It = ScalarToTreeEntries.find(Val: V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
          (void)Processed.insert(Ptr: V);
        } else if (Processed.insert(Ptr: V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Elt: Last);
        }
      }
      // Update the scheduler bundle to point to this TreeEntry.
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        // Verify that the number of schedulable unique scalars matches the
        // number of bundle members.
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(Val: V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(Key: V).first->getSecond().insert(X: Last);
        }
      }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(x: std::numeric_limits<unsigned>::max(), y: 1);
      MustGather.insert_range(R&: VL);
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }
4604
4605 /// -- Vectorization State --
4606 /// Holds all of the tree entries.
4607 TreeEntry::VecTreeTy VectorizableTree;
4608
4609#ifndef NDEBUG
4610 /// Debug printer.
4611 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4612 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4613 VectorizableTree[Id]->dump();
4614 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4615 dbgs() << "[[TRANSFORMED TO GATHER]]";
4616 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4617 dbgs() << "[[DELETED NODE]]";
4618 dbgs() << "\n";
4619 }
4620 }
4621#endif
4622
4623 /// Get list of vector entries, associated with the value \p V.
4624 ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
4625 assert(V && "V cannot be nullptr.");
4626 auto It = ScalarToTreeEntries.find(Val: V);
4627 if (It == ScalarToTreeEntries.end())
4628 return {};
4629 return It->getSecond();
4630 }
4631
4632 /// Get list of split vector entries, associated with the value \p V.
4633 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4634 assert(V && "V cannot be nullptr.");
4635 auto It = ScalarsInSplitNodes.find(Val: V);
4636 if (It == ScalarsInSplitNodes.end())
4637 return {};
4638 return It->getSecond();
4639 }
4640
4641 /// Returns first vector node for value \p V, matching values \p VL.
4642 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4643 bool SameVF = false) const {
4644 assert(V && "V cannot be nullptr.");
4645 for (TreeEntry *TE : ScalarToTreeEntries.lookup(Val: V))
4646 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4647 return TE;
4648 return nullptr;
4649 }
4650
4651 /// Contains all the outputs of legality analysis for a list of values to
4652 /// vectorize.
4653 class ScalarsVectorizationLegality {
4654 InstructionsState S;
4655 bool IsLegal;
4656 bool TryToFindDuplicates;
4657 bool TrySplitVectorize;
4658
4659 public:
4660 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4661 bool TryToFindDuplicates = true,
4662 bool TrySplitVectorize = false)
4663 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4664 TrySplitVectorize(TrySplitVectorize) {
4665 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4666 "Inconsistent state");
4667 }
4668 const InstructionsState &getInstructionsState() const { return S; };
4669 bool isLegal() const { return IsLegal; }
4670 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4671 bool trySplitVectorize() const { return TrySplitVectorize; }
4672 };
4673
4674 /// Checks if the specified list of the instructions/values can be vectorized
4675 /// in general.
4676 ScalarsVectorizationLegality
4677 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4678 const EdgeInfo &UserTreeIdx) const;
4679
4680 /// Checks if the specified list of the instructions/values can be vectorized
4681 /// and fills required data before actual scheduling of the instructions.
4682 TreeEntry::EntryState getScalarsVectorizationState(
4683 const InstructionsState &S, ArrayRef<Value *> VL,
4684 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4685 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4686
4687 /// Maps a specific scalar to its tree entry(ies).
4688 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4689
4690 /// List of deleted non-profitable nodes.
4691 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4692
4693 /// List of nodes, transformed to gathered, with their conservative
4694 /// gather/buildvector cost estimation.
4695 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4696
4697 /// Maps the operand index and entry to the corresponding tree entry.
4698 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4699 OperandsToTreeEntry;
4700
4701 /// Scalars, used in split vectorize nodes.
4702 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4703
4704 /// Maps a value to the proposed vectorizable size.
4705 SmallDenseMap<Value *, unsigned> InstrElementSize;
4706
4707 /// A list of scalars that we found that we need to keep as scalars.
4708 ValueSet MustGather;
4709
4710 /// A set of first non-schedulable values.
4711 ValueSet NonScheduledFirst;
4712
4713 /// A map between the vectorized entries and the last instructions in the
4714 /// bundles. The bundles are built in use order, not in the def order of the
4715 /// instructions. So, we cannot rely directly on the last instruction in the
4716 /// bundle being the last instruction in the program order during
4717 /// vectorization process since the basic blocks are affected, need to
4718 /// pre-gather them before.
4719 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4720
4721 /// Keeps the mapping between the last instructions and their insertion
4722 /// points, which is an instruction-after-the-last-instruction.
4723 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4724
4725 /// List of gather nodes, depending on other gather/vector nodes, which should
4726 /// be emitted after the vector instruction emission process to correctly
4727 /// handle order of the vector instructions and shuffles.
4728 SetVector<const TreeEntry *> PostponedGathers;
4729
4730 using ValueToGatherNodesMap =
4731 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4732 ValueToGatherNodesMap ValueToGatherNodes;
4733
4734 /// A list of the load entries (node indices), which can be vectorized using
4735 /// strided or masked gather approach, but attempted to be represented as
4736 /// contiguous loads.
4737 SetVector<unsigned> LoadEntriesToVectorize;
4738
4739 /// true if graph nodes transforming mode is on.
4740 bool IsGraphTransformMode = false;
4741
4742 /// The index of the first gathered load entry in the VectorizeTree.
4743 std::optional<unsigned> GatheredLoadsEntriesFirst;
4744
4745 /// Maps compress entries to their mask data for the final codegen.
4746 SmallDenseMap<const TreeEntry *,
4747 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4748 CompressEntryToData;
4749
4750 /// The loop nest, used to check if only a single loop nest is vectorized, not
4751 /// multiple, to avoid side-effects from the loop-aware cost model.
4752 SmallVector<const Loop *> CurrentLoopNest;
4753
4754 /// Maps the loops to their loop nests.
4755 SmallDenseMap<const Loop *, SmallVector<const Loop *>> LoopToLoopNest;
4756
4757 /// Maps the loops to their scale factor, which is built as a multiplication
4758 /// of the tripcounts of the loops in the loop nest.
4759 SmallDenseMap<const Loop *, unsigned> LoopToScaleFactor;
4760
  /// This POD struct describes one external user in the vectorized tree.
  /// An external user is a scalar from a vectorized bundle that is also used
  /// outside of the tree and thus must be extracted from the vector value.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
        : Scalar(S), User(U), E(E), Lane(L) {}

    /// Which scalar in our function.
    Value *Scalar = nullptr;

    /// Which user that uses the scalar. May be nullptr (see UserList usage:
    /// the scalar is used later, after vectorization).
    llvm::User *User = nullptr;

    /// Vector node, the value is part of.
    const TreeEntry &E;

    /// Which lane does the scalar belong to.
    unsigned Lane;
  };
4778 using UserList = SmallVector<ExternalUser, 16>;
4779
4780 /// Checks if two instructions may access the same memory.
4781 ///
4782 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4783 /// is invariant in the calling loop.
4784 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4785 Instruction *Inst2) {
4786 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4787 // First check if the result is already in the cache.
4788 AliasCacheKey Key = std::make_pair(x&: Inst1, y&: Inst2);
4789 auto Res = AliasCache.try_emplace(Key);
4790 if (!Res.second)
4791 return Res.first->second;
4792 bool Aliased = isModOrRefSet(MRI: BatchAA.getModRefInfo(I: Inst2, OptLoc: Loc1));
4793 // Store the result in the cache.
4794 Res.first->getSecond() = Aliased;
4795 return Aliased;
4796 }
4797
4798 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4799
4800 /// Cache for alias results.
4801 /// TODO: consider moving this to the AliasAnalysis itself.
4802 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4803
4804 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4805 // globally through SLP because we don't perform any action which
4806 // invalidates capture results.
4807 BatchAAResults BatchAA;
4808
4809 /// Temporary store for deleted instructions. Instructions will be deleted
4810 /// eventually when the BoUpSLP is destructed. The deferral is required to
4811 /// ensure that there are no incorrect collisions in the AliasCache, which
4812 /// can happen if a new instruction is allocated at the same address as a
4813 /// previously deleted instruction.
4814 DenseSet<Instruction *> DeletedInstructions;
4815
4816 /// Set of the instruction, being analyzed already for reductions.
4817 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4818
4819 /// Set of hashes for the list of reduction values already being analyzed.
4820 DenseSet<size_t> AnalyzedReductionVals;
4821
  /// Values already analyzed for minimal bitwidth and found to be
  /// non-profitable.
4824 DenseSet<Value *> AnalyzedMinBWVals;
4825
4826 /// A list of values that need to extracted out of the tree.
4827 /// This list holds pairs of (Internal Scalar : External User). External User
4828 /// can be nullptr, it means that this Internal Scalar will be used later,
4829 /// after vectorization.
4830 UserList ExternalUses;
4831
  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
4834 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4835
  /// A list of scalars to be extracted without a specific user because of too
  /// many uses.
4838 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4839
4840 /// Values used only by @llvm.assume calls.
4841 SmallPtrSet<const Value *, 32> EphValues;
4842
4843 /// Holds all of the instructions that we gathered, shuffle instructions and
4844 /// extractelements.
4845 SetVector<Instruction *> GatherShuffleExtractSeq;
4846
4847 /// A list of blocks that we are going to CSE.
4848 DenseSet<BasicBlock *> CSEBlocks;
4849
4850 /// List of hashes of vector of loads, which are known to be non vectorizable.
4851 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4852
  /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
  /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
  /// single instruction, while ScheduleBundle represents a batch of
  /// instructions going to be grouped together. ScheduleCopyableData models an
  /// extra user for "copyable" instructions.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    /// Discriminator for LLVM-style RTTI (isa<>/dyn_cast<> et al.).
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled (or considered as
    /// scheduled in the dry-run).
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    /// Gets/sets the scheduling priority.
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    /// Returns true if the entity is ready for scheduling, dispatching to the
    /// concrete subclass.
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->isReady();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->isReady();
      return cast<ScheduleBundle>(Val: this)->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->hasValidDependencies();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->hasValidDependencies();
      return cast<ScheduleBundle>(Val: this)->hasValidDependencies();
    }
    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getUnscheduledDeps();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->getUnscheduledDeps();
      return cast<ScheduleBundle>(Val: this)->unscheduledDepsInBundle();
    }
    /// Increments the number of unscheduled dependencies.
    /// Only valid for ScheduleData/ScheduleCopyableData; a bundle aggregates
    /// its members' counters instead (the cast below asserts on bundles).
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(Val: this)->incrementUnscheduledDeps(Incr);
    }
    /// Gets the number of dependencies.
    /// Only valid for ScheduleData/ScheduleCopyableData.
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(Val: this)->getDependencies();
    }
    /// Gets the instruction.
    /// Only valid for ScheduleData/ScheduleCopyableData.
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(Val: this)->getInst();
    }

    /// Gets/sets if the bundle is scheduled.
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    // Base class: every ScheduleEntity trivially "is a" ScheduleEntity.
    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->dump(OS);
      return cast<ScheduleBundle>(this)->dump(OS);
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  };
4947
4948#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Prints the scheduling entity \p SE to the output stream \p OS.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
    SE.dump(OS);
    return OS;
  }
4954#endif
4955
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    /// (Re)initializes the node for instruction \p I within scheduling region
    /// \p BlockSchedulingRegionID, dropping all previously calculated
    /// dependencies.
    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 &&
             "Expected valid number of unscheduled deps");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears all direct dependencies only, except for control and memory
    /// dependencies.
    /// Required for copyable elements to correctly handle control/memory deps
    /// and avoid extra recalculation of such deps.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    /// Gets the number of dependencies.
    int getDependencies() const { return Dependencies; }
    /// Initializes the number of dependencies.
    void initDependencies() { Dependencies = 0; }
    /// Increments the number of dependencies.
    void incDependencies() { Dependencies++; }

    /// Gets scheduling region ID.
    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Gets the instruction.
    Instruction *getInst() const { return Inst; }

    /// Gets the list of memory dependencies.
    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    /// Adds a memory dependency.
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Elt: Dep);
    }
    /// Gets the list of control dependencies.
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    /// Adds a control dependency.
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Elt: Dep);
    }
    /// Gets/sets the next load/store instruction in the block.
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

    /// Prints the wrapped instruction to \p OS.
    void dump(raw_ostream &OS) const { OS << *Inst; }

    LLVM_DUMP_METHOD void dump() const {
      dump(OS&: dbgs());
      dbgs() << '\n';
    }

  private:
    /// The instruction this scheduling data wraps.
    Instruction *Inst = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on. Allowing such nodes to be scheduled below this one could introduce
    /// a runtime fault which didn't exist in the original program.
    /// ex: this is a load or udiv following a readonly call which inf loops
    SmallVector<ScheduleData *> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// The number of dependencies. Constitutes of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = InvalidDeps;
  };
5110
5111#ifndef NDEBUG
  /// Prints the schedule data \p SD to the output stream \p OS.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(OS);
    return OS;
  }
5117#endif
5118
  /// Represents a batch of instructions which are going to be combined into a
  /// vector instruction and thus must be scheduled as a single unit.
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;
    /// Private; only used by invalid() to build the invalid-bundle marker.
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }

        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    /// Returns ScheduleData::InvalidDeps if any member's dependencies are not
    /// calculated yet.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return all_of(Range: Bundle, P: [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the bundle of scheduling data, associated with the current
    /// instruction.
    ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
    /// Adds an instruction to the bundle.
    void add(ScheduleEntity *SD) { Bundle.push_back(Elt: SD); }

    /// Gets/sets the associated tree entry.
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    /// Returns a marker bundle used to signal a failed bundling attempt.
    static ScheduleBundle invalid() { return {false}; }

    /// Returns true if this bundle is valid.
    operator bool() const { return IsValid; }

#ifndef NDEBUG
    void dump(raw_ostream &OS) const {
      if (!*this) {
        OS << "[]";
        return;
      }
      OS << '[';
      interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
        if (isa<ScheduleCopyableData>(SD))
          OS << "<Copyable>";
        OS << *SD->getInst();
      });
      OS << ']';
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // NDEBUG
  };
5217
5218#ifndef NDEBUG
  /// Prints the schedule bundle \p Bundle to the output stream \p OS.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(OS);
    return OS;
  }
5224#endif
5225
  /// Contains all scheduling relevant data for the copyable instruction.
  /// It models the virtual instructions, supposed to replace the original
  /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
  /// %1], where %1 = add, then the ScheduleCopyableData models virtual
  /// instruction %virt = add %0, 0.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The source schedule data for the instruction.
    Instruction *Inst = nullptr;
    /// The edge information for the instruction.
    const EdgeInfo EI;
    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;
    /// Bundle, this data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    /// Gets the number of dependencies.
    int getDependencies() const { return Dependencies; }
    /// Initializes the number of dependencies.
    void initDependencies() { Dependencies = 0; }
    /// Increments the number of dependencies.
    void incDependencies() { Dependencies++; }

    /// Gets scheduling region ID.
    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Gets the instruction.
    Instruction *getInst() const { return Inst; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    /// Gets the edge information.
    const EdgeInfo &getEdgeInfo() const { return EI; }

    /// Gets the bundle.
    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

  private:
    /// The number of dependencies. Equals ScheduleData::InvalidDeps until the
    /// dependency information is calculated. These nodes always have only a
    /// single dependency.
    int Dependencies = ScheduleData::InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
5340
5341#ifndef NDEBUG
  /// Prints the copyable schedule data \p SD to the output stream \p OS.
  friend inline raw_ostream &
  operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
    SD.dump(OS);
    return OS;
  }
5347#endif
5348
5349 friend struct GraphTraits<BoUpSLP *>;
5350 friend struct DOTGraphTraits<BoUpSLP *>;
5351
5352 /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions which are not memory read/write
  /// instructions and whose operands are either constants, or arguments, or
  /// phis, or instructions from other blocks, or whose users are phis or from
  /// other blocks. The resulting vector instructions can be placed at the
  /// beginning of the basic block without scheduling (if operands do not need
  /// to be scheduled) or at the end of the block (if users are outside of the
  /// block). This saves some compile time and memory used by the
5360 /// compiler.
5361 /// ScheduleData is assigned for each instruction in between the boundaries of
5362 /// the tree entry, even for those, which are not part of the graph. It is
5363 /// required to correctly follow the dependencies between the instructions and
5364 /// their correct scheduling. The ScheduleData is not allocated for the
5365 /// instructions, which do not require scheduling, like phis, nodes with
5366 /// extractelements/insertelements only or nodes with instructions, with
5367 /// uses/operands outside of the block.
5368 struct BlockScheduling {
    /// Creates the scheduling state for basic block \p BB.
    /// NOTE(review): ChunkSize is seeded with the block's instruction count;
    /// ChunkSize/ChunkPos are declared outside this view — presumably a slab
    /// allocator for ScheduleData, verify against the rest of the struct.
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5371
    /// Resets all per-region state and opens a new (empty) scheduling region
    /// by bumping SchedulingRegionID.
    void clear() {
      ScheduledBundles.clear();
      ScheduledBundlesList.clear();
      ScheduleCopyableDataMap.clear();
      ScheduleCopyableDataMapByInst.clear();
      ScheduleCopyableDataMapByInstUser.clear();
      ScheduleCopyableDataMapByUsers.clear();
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }
5397
5398 ScheduleData *getScheduleData(Instruction *I) {
5399 if (!I)
5400 return nullptr;
5401 if (BB != I->getParent())
5402 // Avoid lookup if can't possibly be in map.
5403 return nullptr;
5404 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
5405 if (SD && isInSchedulingRegion(SD: *SD))
5406 return SD;
5407 return nullptr;
5408 }
5409
    /// Convenience overload: returns the ScheduleData for \p V if it is an
    /// instruction in the current scheduling region, otherwise nullptr.
    ScheduleData *getScheduleData(Value *V) {
      return getScheduleData(I: dyn_cast<Instruction>(Val: V));
    }
5413
5414 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5415 /// operand number) and value.
5416 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5417 const Value *V) const {
5418 if (ScheduleCopyableDataMap.empty())
5419 return nullptr;
5420 auto It = ScheduleCopyableDataMap.find(Val: std::make_pair(x: EI, y&: V));
5421 if (It == ScheduleCopyableDataMap.end())
5422 return nullptr;
5423 ScheduleCopyableData *SD = It->getSecond().get();
5424 if (!isInSchedulingRegion(SD: *SD))
5425 return nullptr;
5426 return SD;
5427 }
5428
5429 /// Returns the ScheduleCopyableData for the given user \p User, operand
5430 /// number and operand \p V.
5431 SmallVector<ScheduleCopyableData *>
5432 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5433 const Value *V) {
5434 if (ScheduleCopyableDataMapByInstUser.empty())
5435 return {};
5436 const auto It = ScheduleCopyableDataMapByInstUser.find(
5437 Val: std::make_pair(x: std::make_pair(x&: User, y&: OperandIdx), y&: V));
5438 if (It == ScheduleCopyableDataMapByInstUser.end())
5439 return {};
5440 SmallVector<ScheduleCopyableData *> Res;
5441 for (ScheduleCopyableData *SD : It->getSecond()) {
5442 if (isInSchedulingRegion(SD: *SD))
5443 Res.push_back(Elt: SD);
5444 }
5445 return Res;
5446 }
5447
    /// Returns true if all operands of the given instruction \p User are
    /// replaced by copyable data.
    /// \param User The user instruction.
    /// \param Op The operand, which might be replaced by the copyable data.
    /// \param SLP The SLP tree.
    /// \param NumOps The number of operands used. If the instruction uses the
    /// same operand several times, check for the first use, then the second,
    /// etc.
    bool areAllOperandsReplacedByCopyableData(Instruction *User,
                                              Instruction *Op, BoUpSLP &SLP,
                                              unsigned NumOps) const {
      assert(NumOps > 0 && "No operands");
      if (ScheduleCopyableDataMap.empty())
        return false;
      // Uses of commutative users (and cmps) cannot be matched positionally
      // because their operands may be reordered; those uses are counted here
      // and resolved in the second loop below.
      SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
      ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(V: User);
      if (Entries.empty())
        return false;
      unsigned CurNumOps = 0;
      for (const Use &U : User->operands()) {
        if (U.get() != Op)
          continue;
        ++CurNumOps;
        // Check all tree entries, if they have operands replaced by copyable
        // data.
        for (TreeEntry *TE : Entries) {
          unsigned Inc = 0;
          bool IsNonSchedulableWithParentPhiNode =
              TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->hasState() &&
              TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
              TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
            for (Value *V : ParentTE->Scalars) {
              auto *PHI = dyn_cast<PHINode>(Val: V);
              if (!PHI)
                continue;
              if (ParentsUniqueUsers.insert(Ptr: PHI).second &&
                  is_contained(Range: PHI->incoming_values(), Element: User))
                ++Inc;
            }
          } else {
            Inc = count(Range&: TE->Scalars, Element: User);
          }

          // Check if the user is commutative.
          // The commutatives are handled later, as their operands can be
          // reordered.
          // Same applies even for non-commutative cmps, because we can invert
          // their predicate potentially and, thus, reorder the operands.
          bool IsCommutativeUser =
              ::isCommutative(I: User) &&
              ::isCommutableOperand(I: User, ValWithUses: User, Op: U.getOperandNo());
          if (!IsCommutativeUser) {
            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(I: User);
            IsCommutativeUser =
                ::isCommutative(I: MainOp, ValWithUses: User) &&
                ::isCommutableOperand(I: MainOp, ValWithUses: User, Op: U.getOperandNo());
          }
          // The commutative user with the same operands can be safely
          // considered as non-commutative, operands reordering does not change
          // the semantics.
          assert(
              (!IsCommutativeUser ||
               (((::isCommutative(User) &&
                  ::isCommutableOperand(User, User, 0) &&
                  ::isCommutableOperand(User, User, 1)) ||
                 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 0) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 1))))) &&
              "Expected commutative user with 2 first commutable operands");
          bool IsCommutativeWithSameOps =
              IsCommutativeUser && User->getOperand(i: 0) == User->getOperand(i: 1);
          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
              !isa<CmpInst>(Val: User)) {
            // Non-reorderable use: the operand position is fixed, so the
            // copyable data for this exact edge must exist (once the use
            // count matches NumOps).
            EdgeInfo EI(TE, U.getOperandNo());
            if (CurNumOps != NumOps || getScheduleCopyableData(EI, V: Op))
              continue;
            return false;
          }
          PotentiallyReorderedEntriesCount.try_emplace(Key: TE, Args: 0)
              .first->getSecond() += Inc;
        }
      }
      if (PotentiallyReorderedEntriesCount.empty())
        return true;
      // Check the commutative/cmp entries.
      for (auto &P : PotentiallyReorderedEntriesCount) {
        SmallPtrSet<Value *, 4> ParentsUniqueUsers;
        bool IsNonSchedulableWithParentPhiNode =
            P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
            P.first->UserTreeIndex.UserTE->hasState() &&
            P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
            P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
        auto *It = find(Range&: P.first->Scalars, Val: User);
        do {
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(first: P.first->Scalars.begin(), last: It);
          assert(Lane >= 0 && "Lane is not found");
          // Stores may be reordered; map through ReorderIndices to get the
          // actual lane.
          if (isa<StoreInst>(Val: User) && !P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
            // NOTE(review): this local `User` shadows the parameter, so the
            // find() two lines below searches for the parent phi scalar, not
            // the original user — confirm this is intentional.
            Value *User = ParentTE->Scalars[Lane];
            if (!ParentsUniqueUsers.insert(Ptr: User).second) {
              It =
                  find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
              continue;
            }
          }
          // For each potentially commutative operand position holding \p Op,
          // consume one counted use if matching copyable data exists.
          for (unsigned OpIdx :
               seq<unsigned>(Size: ::getNumberOfPotentiallyCommutativeOps(
                   I: P.first->getMainOp()))) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EI: EdgeInfo(P.first, OpIdx), V: Op))
              --P.getSecond();
          }
          // If parent node is schedulable, it will be handled correctly.
          It = find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
        } while (It != P.first->Scalars.end());
      }
      // NOTE(review): each remaining count is expected to equal NumOps - 1 —
      // presumably one use per entry stays unmatched by construction; confirm
      // against callers.
      return all_of(Range&: PotentiallyReorderedEntriesCount,
                    P: [&](const std::pair<const TreeEntry *, unsigned> &P) {
                      return P.second == NumOps - 1;
                    });
    }
5585
5586 SmallVector<ScheduleCopyableData *>
5587 getScheduleCopyableData(const Instruction *I) const {
5588 if (ScheduleCopyableDataMapByInst.empty())
5589 return {};
5590 const auto It = ScheduleCopyableDataMapByInst.find(Val: I);
5591 if (It == ScheduleCopyableDataMapByInst.end())
5592 return {};
5593 SmallVector<ScheduleCopyableData *> Res;
5594 for (ScheduleCopyableData *SD : It->getSecond()) {
5595 if (isInSchedulingRegion(SD: *SD))
5596 Res.push_back(Elt: SD);
5597 }
5598 return Res;
5599 }
5600
5601 SmallVector<ScheduleCopyableData *>
5602 getScheduleCopyableDataUsers(const Instruction *User) const {
5603 if (ScheduleCopyableDataMapByUsers.empty())
5604 return {};
5605 const auto It = ScheduleCopyableDataMapByUsers.find(Val: User);
5606 if (It == ScheduleCopyableDataMapByUsers.end())
5607 return {};
5608 SmallVector<ScheduleCopyableData *> Res;
5609 for (ScheduleCopyableData *SD : It->getSecond()) {
5610 if (isInSchedulingRegion(SD: *SD))
5611 Res.push_back(Elt: SD);
5612 }
5613 return Res;
5614 }
5615
    /// Registers a new ScheduleCopyableData for the copyable element \p I
    /// used through the edge \p EI, and records it in all lookup maps:
    /// by (edge, value), by instruction, by (user, operand) pair and by user.
    /// Returns the newly created data.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(Key: std::make_pair(x: EI, y&: I),
                           Args: std::make_unique<ScheduleCopyableData>(
                               args&: SchedulingRegionID, args&: I, args: EI, args&: Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(Elt: CD);
      if (EI.UserTE) {
        // Walk every lane of the user entry where \p I appears as the
        // operand and register the (user, operand) -> copyable data mapping
        // once per unique user instruction.
        ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
        const auto *It = find(Range&: Op, Val: I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(first: Op.begin(), last: It);
          assert(Lane >= 0 && "Lane not set");
          // Stores may be reordered; map the lane through ReorderIndices to
          // find the actual user scalar.
          if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
          if (!Visited.insert(Ptr: In).second) {
            It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              .try_emplace(Key: std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I))
              .first->getSecond()
              .push_back(Elt: CD);
          ScheduleCopyableDataMapByUsers.try_emplace(Key: I)
              .first->getSecond()
              .insert(X: CD);
          // Remove extra deps for users, becoming non-immediate users of the
          // instruction. It may happen, if the chain of same copyable elements
          // appears in the tree.
          if (In == I) {
            EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
            if (ScheduleCopyableData *UserCD =
                    getScheduleCopyableData(EI: UserEI, V: In))
              ScheduleCopyableDataMapByUsers[I].remove(X: UserCD);
          }
          It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
        } while (It != Op.end());
      } else {
        // No user tree entry: register the copyable data keyed by the
        // instruction itself.
        ScheduleCopyableDataMapByUsers.try_emplace(Key: I).first->getSecond().insert(
            X: CD);
      }
      return *CD;
    }
5671
5672 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5673 auto *I = dyn_cast<Instruction>(Val: V);
5674 if (!I)
5675 return {};
5676 auto It = ScheduledBundles.find(Val: I);
5677 if (It == ScheduledBundles.end())
5678 return {};
5679 return It->getSecond();
5680 }
5681
5682 /// Returns true if the entity is in the scheduling region.
5683 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5684 if (const auto *Data = dyn_cast<ScheduleData>(Val: &SD))
5685 return Data->getSchedulingRegionID() == SchedulingRegionID;
5686 if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: &SD))
5687 return CD->getSchedulingRegionID() == SchedulingRegionID;
5688 return all_of(Range: cast<ScheduleBundle>(Val: SD).getBundle(),
5689 P: [&](const ScheduleEntity *BundleMember) {
5690 return isInSchedulingRegion(SD: *BundleMember);
5691 });
5692 }
5693
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      // Processes one member of a scheduled bundle: decrements the
      // unscheduled-deps counters of everything depending on it (def-use,
      // memory and control dependencies) and inserts newly-ready entities
      // into \p ReadyList.
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ArrayRef<ScheduleBundle *> Bundles) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            SmallVector<ScheduleBundle *, 1> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              // Copyable data belongs to exactly one bundle.
              CopyableBundle.push_back(Elt: &CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(V: Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP: gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(!isa<ScheduleCopyableData>(Data) &&
                   "Expected non-copyable data");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
          }
        };

        // Decrements deps for operand \p I of \p User at operand index
        // \p OpIdx, preferring copyable-data models over plain ScheduleData.
        auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
                                      Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OperandIdx: OpIdx, V: I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (!Bundles.empty()) {
          auto *In = BundleMember->getInst();
          // Count uses of each instruction operand.
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (isa<ScheduleCopyableData>(Val: BundleMember)) {
            // Copyable data is used only once (uses itself).
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
                auto Res = OperandsUses.try_emplace(Key: I, Args: 0);
                ++Res.first->getSecond();
                ++TotalOpCount;
              }
            }
          }
          // Decrement the unscheduled counter and insert to ready list if
          // ready. \p Checked prevents decrementing the same (entity, operand
          // index) pair twice across the bundles loop below.
          auto DecrUnschedForInst =
              [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
                  SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
                      &Checked) {
                if (!ScheduleCopyableDataMap.empty()) {
                  const EdgeInfo EI = {UserTE, OpIdx};
                  if (ScheduleCopyableData *CD =
                          getScheduleCopyableData(EI, V: I)) {
                    if (!Checked.insert(V: std::make_pair(x&: CD, y&: OpIdx)).second)
                      return;
                    DecrUnsched(CD, /*IsControl=*/false);
                    return;
                  }
                }
                auto It = OperandsUses.find(Val: I);
                assert(It != OperandsUses.end() && "Operand not found");
                if (It->second > 0) {
                  if (ScheduleData *OpSD = getScheduleData(I)) {
                    if (!Checked.insert(V: std::make_pair(x&: OpSD, y&: OpIdx)).second)
                      return;
                    --It->getSecond();
                    assert(TotalOpCount > 0 && "No more operands to decrement");
                    --TotalOpCount;
                    DecrUnsched(OpSD, /*IsControl=*/false);
                  } else {
                    // No ScheduleData for the operand: still consume the use
                    // count so the early-exit below stays accurate.
                    --It->getSecond();
                    assert(TotalOpCount > 0 && "No more operands to decrement");
                    --TotalOpCount;
                  }
                }
              };

          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
          for (ScheduleBundle *Bundle : Bundles) {
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            // Need to search for the lane since the tree entry can be
            // reordered.
            auto *It = find(Range&: Bundle->getTreeEntry()->Scalars, Val: In);
            bool IsNonSchedulableWithParentPhiNode =
                Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                Bundle->getTreeEntry()->UserTreeIndex &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
                    TreeEntry::SplitVectorize &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
                    Instruction::PHI;
            do {
              int Lane =
                  std::distance(first: Bundle->getTreeEntry()->Scalars.begin(), last: It);
              assert(Lane >= 0 && "Lane not set");
              if (isa<StoreInst>(Val: In) &&
                  !Bundle->getTreeEntry()->ReorderIndices.empty())
                Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
              assert(Lane < static_cast<int>(
                                Bundle->getTreeEntry()->Scalars.size()) &&
                     "Couldn't find extract lane");

              // Since vectorization tree is being built recursively this
              // assertion ensures that the tree entry has all operands set
              // before reaching this code. Couple of exceptions known at the
              // moment are extracts where their second (immediate) operand is
              // not added. Since immediates do not affect scheduler behavior
              // this is considered okay.
              assert(
                  In &&
                  (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
                   In->getNumOperands() ==
                       Bundle->getTreeEntry()->getNumOperands() ||
                   (isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
                                             Instruction::Select) ||
                   Bundle->getTreeEntry()->isCopyableElement(In)) &&
                  "Missed TreeEntry operands?");

              // Count the number of unique phi nodes, which are the parent for
              // parent entry, and exit, if all the unique phis are processed.
              if (IsNonSchedulableWithParentPhiNode) {
                const TreeEntry *ParentTE =
                    Bundle->getTreeEntry()->UserTreeIndex.UserTE;
                Value *User = ParentTE->Scalars[Lane];
                if (!ParentsUniqueUsers.insert(Ptr: User).second) {
                  It = std::find(first: std::next(x: It),
                                 last: Bundle->getTreeEntry()->Scalars.end(), val: In);
                  continue;
                }
              }

              // Decrement deps for each instruction operand of this lane.
              for (unsigned OpIdx :
                   seq<unsigned>(Size: Bundle->getTreeEntry()->getNumOperands()))
                if (auto *I = dyn_cast<Instruction>(
                        Val: Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                  LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
                                    << *I << "\n");
                  DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
                }
              // If parent node is schedulable, it will be handled correctly.
              if (Bundle->getTreeEntry()->isCopyableElement(V: In))
                break;
              It = std::find(first: std::next(x: It),
                             last: Bundle->getTreeEntry()->Scalars.end(), val: In);
            } while (It != Bundle->getTreeEntry()->Scalars.end());
          }
        } else {
          // If BundleMember is a stand-alone instruction, no operand reordering
          // has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands()) {
            if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
              LLVM_DEBUG(dbgs()
                         << "SLP: check for readiness (def): " << *I << "\n");
              DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
            }
          }
        }
        // Handle the memory dependencies.
        auto *SD = dyn_cast<ScheduleData>(Val: BundleMember);
        if (!SD)
          return;
        SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
        for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
          if (!VisitedMemory.insert(Ptr: MemoryDep).second)
            continue;
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
                            << *MemoryDep << "\n");
          DecrUnsched(MemoryDep);
        }
        // Handle the control dependencies.
        SmallPtrSet<const ScheduleData *, 4> VisitedControl;
        for (ScheduleData *Dep : SD->getControlDependencies()) {
          if (!VisitedControl.insert(Ptr: Dep).second)
            continue;
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs()
                     << "SLP: check for readiness (ctrl): " << *Dep << "\n");
          DecrUnsched(Dep, /*IsControl=*/true);
        }
      };
      if (auto *SD = dyn_cast<ScheduleData>(Val: Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
        // Wrap the plain ScheduleData in temporary single-member bundles,
        // one per tree entry the instruction belongs to, so that
        // ProcessBundleMember can resolve operands through the TreeEntry.
        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
        SmallVector<ScheduleBundle *> Bundles;
        Instruction *In = SD->getInst();
        ArrayRef<TreeEntry *> Entries = R.getTreeEntries(V: In);
        if (!Entries.empty()) {
          for (TreeEntry *TE : Entries) {
            if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(Val: In) &&
                In->getNumOperands() != TE->getNumOperands())
              continue;
            auto &BundlePtr =
                PseudoBundles.emplace_back(Args: std::make_unique<ScheduleBundle>());
            BundlePtr->setTreeEntry(TE);
            BundlePtr->add(SD);
            Bundles.push_back(Elt: BundlePtr.get());
          }
        }
        ProcessBundleMember(SD, Bundles);
      } else {
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Val: Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
        // A member may belong to several bundles; process it only after all
        // of its bundles are scheduled. Copyable data belongs to exactly one
        // bundle, so it is always ready here.
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              if (isa<ScheduleCopyableData>(Val: SD))
                return true;
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          ArrayRef<ScheduleBundle *> SDBundles;
          if (!isa<ScheduleCopyableData>(Val: SD))
            SDBundles = getScheduleBundles(V: SD->getInst());
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, isa<ScheduleCopyableData>(Val: SD) ? &Bundle
                                                                    : SDBundles);
          }
        }
      }
    }
5962
    /// Verify basic self consistency properties of the data structure.
    void verify() {
      // No region was built yet — nothing to check.
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      // Walk every instruction in the region; bundles take precedence over
      // plain ScheduleData.
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
        if (!Bundles.empty()) {
          for (ScheduleBundle *Bundle : Bundles) {
            assert(isInSchedulingRegion(*Bundle) &&
                   "primary schedule data not in window?");
            Bundle->verify();
          }
          continue;
        }
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(*SD) &&
               "primary schedule data not in window?");
        SD->verify();
      }

      assert(all_of(ReadyInsts,
                    [](const ScheduleEntity *Bundle) {
                      return Bundle->isReady();
                    }) &&
             "item in ready list not ready?");
    }
5996
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      // Guards against inserting the same bundle once per member instruction.
      SmallPtrSet<ScheduleBundle *, 16> Visited;
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->hasValidDependencies() && SD->isReady()) {
          if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
              !Bundles.empty()) {
            // Vectorized instruction: its bundles, not the raw ScheduleData,
            // go into the ready list.
            for (ScheduleBundle *Bundle : Bundles) {
              if (!Visited.insert(Ptr: Bundle).second)
                continue;
              if (Bundle->hasValidDependencies() && Bundle->isReady()) {
                ReadyList.insert(Bundle);
                LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
                                  << *Bundle << "\n");
              }
            }
            continue;
          }
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP: initially in ready list: " << *SD << "\n");
        }
      }
    }
6023
6024 /// Build a bundle from the ScheduleData nodes corresponding to the
6025 /// scalar instruction for each lane.
6026 /// \param VL The list of scalar instructions.
6027 /// \param S The state of the instructions.
6028 /// \param EI The edge in the SLP graph or the user node/operand number.
6029 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
6030 const InstructionsState &S, const EdgeInfo &EI);
6031
6032 /// Checks if a bundle of instructions can be scheduled, i.e. has no
6033 /// cyclic dependencies. This is only a dry-run, no instructions are
6034 /// actually moved at this stage.
6035 /// \returns the scheduling bundle. The returned Optional value is not
6036 /// std::nullopt if \p VL is allowed to be scheduled.
6037 std::optional<ScheduleBundle *>
6038 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
6039 const InstructionsState &S, const EdgeInfo &EI);
6040
6041 /// Allocates schedule data chunk.
6042 ScheduleData *allocateScheduleDataChunks();
6043
6044 /// Extends the scheduling region so that V is inside the region.
6045 /// \returns true if the region size is within the limit.
6046 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
6047
6048 /// Initialize the ScheduleData structures for new instructions in the
6049 /// scheduling region.
6050 void initScheduleData(Instruction *FromI, Instruction *ToI,
6051 ScheduleData *PrevLoadStore,
6052 ScheduleData *NextLoadStore);
6053
6054 /// Updates the dependency information of a bundle and of all instructions/
6055 /// bundles which depend on the original bundle.
6056 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
6057 BoUpSLP *SLP,
6058 ArrayRef<ScheduleData *> ControlDeps = {});
6059
6060 /// Sets all instruction in the scheduling region to un-scheduled.
6061 void resetSchedule();
6062
6063 BasicBlock *BB;
6064
6065 /// Simple memory allocation for ScheduleData.
6066 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
6067
6068 /// The size of a ScheduleData array in ScheduleDataChunks.
6069 int ChunkSize;
6070
6071 /// The allocator position in the current chunk, which is the last entry
6072 /// of ScheduleDataChunks.
6073 int ChunkPos;
6074
6075 /// Attaches ScheduleData to Instruction.
6076 /// Note that the mapping survives during all vectorization iterations, i.e.
6077 /// ScheduleData structures are recycled.
6078 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6079
6080 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
6081 /// number) and the operand instruction, represented as copyable element.
6082 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6083 std::unique_ptr<ScheduleCopyableData>>
6084 ScheduleCopyableDataMap;
6085
    /// Represents mapping between instruction and all related
    /// ScheduleCopyableData (for all uses in the tree, represented as a
    /// copyable element). The SLP tree may contain several representations
    /// of the same instruction.
6090 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6091 ScheduleCopyableDataMapByInst;
6092
    /// Represents mapping between user value and operand number, the operand
    /// value and all related ScheduleCopyableData. The relation is 1:n,
    /// because the same user may reference the same operand in different tree
    /// entries and the operand may be modelled by different copyable data
    /// elements.
6097 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
6098 SmallVector<ScheduleCopyableData *>>
6099 ScheduleCopyableDataMapByInstUser;
6100
6101 /// Represents mapping between instruction and all related
6102 /// ScheduleCopyableData. It represents the mapping between the actual
6103 /// instruction and the last copyable data element in the chain. E.g., if
6104 /// the graph models the following instructions:
6105 /// %0 = non-add instruction ...
6106 /// ...
6107 /// %4 = add %3, 1
6108 /// %5 = add %4, 1
6109 /// %6 = insertelement poison, %0, 0
6110 /// %7 = insertelement %6, %5, 1
6111 /// And the graph is modeled as:
6112 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6113 /// -> [1, 0] -> [%1, 0]
6114 ///
6115 /// this map will map %0 only to the copyable element <1>, which is the last
6116 /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
6117 /// keep the map to <0>, not the %0.
6118 SmallDenseMap<const Instruction *,
6119 SmallSetVector<ScheduleCopyableData *, 4>>
6120 ScheduleCopyableDataMapByUsers;
6121
6122 /// Attaches ScheduleBundle to Instruction.
6123 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6124 ScheduledBundles;
6125 /// The list of ScheduleBundles.
6126 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6127
6128 /// The ready-list for scheduling (only used for the dry-run).
6129 SetVector<ScheduleEntity *> ReadyInsts;
6130
6131 /// The first instruction of the scheduling region.
6132 Instruction *ScheduleStart = nullptr;
6133
6134 /// The first instruction _after_ the scheduling region.
6135 Instruction *ScheduleEnd = nullptr;
6136
6137 /// The first memory accessing instruction in the scheduling region
6138 /// (can be null).
6139 ScheduleData *FirstLoadStoreInRegion = nullptr;
6140
6141 /// The last memory accessing instruction in the scheduling region
6142 /// (can be null).
6143 ScheduleData *LastLoadStoreInRegion = nullptr;
6144
6145 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6146 /// region? Used to optimize the dependence calculation for the
6147 /// common case where there isn't.
6148 bool RegionHasStackSave = false;
6149
6150 /// The current size of the scheduling region.
6151 int ScheduleRegionSize = 0;
6152
6153 /// The maximum size allowed for the scheduling region.
6154 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6155
6156 /// The ID of the scheduling region. For a new vectorization iteration this
6157 /// is incremented which "removes" all ScheduleData from the region.
6158 /// Make sure that the initial SchedulingRegionID is greater than the
6159 /// initial SchedulingRegionID in ScheduleData (which is 0).
6160 int SchedulingRegionID = 1;
6161 };
6162
6163 /// Attaches the BlockScheduling structures to basic blocks.
6164 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6165
6166 /// Performs the "real" scheduling. Done before vectorization is actually
6167 /// performed in a basic block.
6168 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6169
6170 /// List of users to ignore during scheduling and that don't need extracting.
6171 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6172
6173 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6174 /// sorted SmallVectors of unsigned.
6175 struct OrdersTypeDenseMapInfo {
6176 static OrdersType getEmptyKey() {
6177 OrdersType V;
6178 V.push_back(Elt: ~1U);
6179 return V;
6180 }
6181
6182 static OrdersType getTombstoneKey() {
6183 OrdersType V;
6184 V.push_back(Elt: ~2U);
6185 return V;
6186 }
6187
6188 static unsigned getHashValue(const OrdersType &V) {
6189 return static_cast<unsigned>(hash_combine_range(R: V));
6190 }
6191
6192 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6193 return LHS == RHS;
6194 }
6195 };
6196
6197 // Analysis and block reference.
     // Non-owning pointers to analyses supplied by the pass driver; F is the
     // function being vectorized (also used as an insertion point for orphaned
     // instructions in the destructor).
6198 Function *F;
6199 ScalarEvolution *SE;
6200 TargetTransformInfo *TTI;
6201 TargetLibraryInfo *TLI;
6202 LoopInfo *LI;
6203 DominatorTree *DT;
6204 AssumptionCache *AC;
6205 DemandedBits *DB;
6206 const DataLayout *DL;
6207 OptimizationRemarkEmitter *ORE;
6208
6209 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6210 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6211
6212 /// Instruction builder to construct the vectorized tree.
6213 IRBuilder<TargetFolder> Builder;
6214
6215 /// A map of scalar integer values to the smallest bit width with which they
6216 /// can legally be represented. The values map to (width, signed) pairs,
6217 /// where "width" indicates the minimum bit width and "signed" is True if the
6218 /// value must be signed-extended, rather than zero-extended, back to its
6219 /// original width.
     /// NOTE(review): despite the wording above, the map is keyed by TreeEntry*,
     /// not by individual scalar values.
6220 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6221
6222 /// Final size of the reduced vector, if the current graph represents the
6223 /// input for the reduction and it was possible to narrow the size of the
6224 /// reduction.
     /// 0 means "no narrowed reduction width recorded".
6225 unsigned ReductionBitWidth = 0;
6226
6227 /// Canonical graph size before the transformations.
6228 unsigned BaseGraphSize = 1;
6229
6230 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6231 /// type sizes, used in the tree.
6232 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6233
6234 /// Indices of the vectorized nodes, which supposed to be the roots of the new
6235 /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
6236 DenseSet<unsigned> ExtraBitWidthNodes;
6237};
6238
// DenseMapInfo specialization allowing BoUpSLP::EdgeInfo (a (TreeEntry*,
// operand-index) pair) to be used as a DenseMap/DenseSet key. Sentinels and
// hashing are delegated to the component DenseMapInfo implementations.
6239template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6240 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6241 using SecondInfo = DenseMapInfo<unsigned>;
6242 static BoUpSLP::EdgeInfo getEmptyKey() {
6243 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6244 SecondInfo::getEmptyKey());
6245 }
6246
6247 static BoUpSLP::EdgeInfo getTombstoneKey() {
6248 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6249 SecondInfo::getTombstoneKey());
6250 }
6251
     // Combine the hashes of the user entry pointer and the edge index.
6252 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6253 return detail::combineHashValue(a: FirstInfo::getHashValue(PtrVal: Val.UserTE),
6254 b: SecondInfo::getHashValue(Val: Val.EdgeIdx));
6255 }
6256
6257 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6258 const BoUpSLP::EdgeInfo &RHS) {
6259 return LHS == RHS;
6260 }
6261};
6262
// GraphTraits specialization so the SLP graph (BoUpSLP's VectorizableTree)
// can be walked and rendered by the generic graph utilities / GraphWriter.
6263template <> struct llvm::GraphTraits<BoUpSLP *> {
6264 using TreeEntry = BoUpSLP::TreeEntry;
6265
6266 /// NodeRef has to be a pointer per the GraphWriter.
6267 using NodeRef = TreeEntry *;
6268
6269 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6270
6271 /// Add the VectorizableTree to the index iterator to be able to return
6272 /// TreeEntry pointers.
6273 struct ChildIteratorType
6274      : public iterator_adaptor_base<
6275            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6276 ContainerTy &VectorizableTree;
6277
6278 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
6279 ContainerTy &VT)
6280      : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
6281
     // Dereference yields the user TreeEntry of the edge.
6282 NodeRef operator*() { return I->UserTE; }
6283 };
6284
     // Entry node is the root of the vectorizable tree (index 0).
6285 static NodeRef getEntryNode(BoUpSLP &R) {
6286 return R.VectorizableTree[0].get();
6287 }
6288
     // Children are iterated over the node's single UserTreeIndex edge
     // (begin = &UserTreeIndex, end = one past it), i.e. each node exposes at
     // most one child: its user entry.
6289 static ChildIteratorType child_begin(NodeRef N) {
6290 return {&N->UserTreeIndex, N->Container};
6291 }
6292
6293 static ChildIteratorType child_end(NodeRef N) {
6294 return {&N->UserTreeIndex + 1, N->Container};
6295 }
6296
6297 /// For the node iterator we just need to turn the TreeEntry iterator into a
6298 /// TreeEntry* iterator so that it dereferences to NodeRef.
6299 class nodes_iterator {
6300 using ItTy = ContainerTy::iterator;
6301 ItTy It;
6302
6303 public:
6304 nodes_iterator(const ItTy &It2) : It(It2) {}
6305 NodeRef operator*() { return It->get(); }
6306 nodes_iterator operator++() {
6307 ++It;
6308 return *this;
6309 }
6310 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6311 };
6312
6313 static nodes_iterator nodes_begin(BoUpSLP *R) {
6314 return nodes_iterator(R->VectorizableTree.begin());
6315 }
6316
6317 static nodes_iterator nodes_end(BoUpSLP *R) {
6318 return nodes_iterator(R->VectorizableTree.end());
6319 }
6320
6321 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6322};
6323
// DOT printing support for the SLP graph: node labels list the entry index
// and its scalars; gather nodes are drawn red, scatter/strided/compress
// vectorized nodes blue.
6324template <>
6325struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6326 using TreeEntry = BoUpSLP::TreeEntry;
6327
6328 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6329
     // Label: "<Idx>." followed by one line per scalar; scalars that also have
     // external users (present in R->ExternalUses) are tagged "<extract>".
6330 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6331 std::string Str;
6332 raw_string_ostream OS(Str);
6333 OS << Entry->Idx << ".\n";
6334 if (isSplat(VL: Entry->Scalars))
6335 OS << "<splat> ";
6336 for (auto *V : Entry->Scalars) {
6337 OS << *V;
6338 if (llvm::any_of(Range: R->ExternalUses, P: [&](const BoUpSLP::ExternalUser &EU) {
6339 return EU.Scalar == V;
6340 }))
6341 OS << " <extract>";
6342 OS << "\n";
6343 }
6344 return Str;
6345 }
6346
6347 static std::string getNodeAttributes(const TreeEntry *Entry,
6348 const BoUpSLP *) {
6349 if (Entry->isGather())
6350 return "color=red";
6351 if (Entry->State == TreeEntry::ScatterVectorize ||
6352 Entry->State == TreeEntry::StridedVectorize ||
6353 Entry->State == TreeEntry::CompressVectorize)
6354 return "color=blue";
6355 return "";
6356 }
6357};
6358
// Destructor: physically erases all instructions queued in DeletedInstructions
// and cleans up scalar code that became dead as a consequence.
// Phase 1: make every queued instruction erasable (re-parent orphans, collect
// operands that will become trivially dead, drop operand references so
// mutually-referencing dead instructions can be erased in any order).
// Phase 2: erase them. Finally, recursively delete the collected dead feeders.
6359BoUpSLP::~BoUpSLP() {
6360 SmallVector<WeakTrackingVH> DeadInsts;
6361 for (auto *I : DeletedInstructions) {
6362 if (!I->getParent()) {
6363 // Temporarily insert instruction back to erase them from parent and
6364 // memory later.
6365 if (isa<PHINode>(Val: I))
6366 // Phi nodes must be the very first instructions in the block.
6367 I->insertBefore(BB&: F->getEntryBlock(),
6368 InsertPos: F->getEntryBlock().getFirstNonPHIIt());
6369 else
6370 I->insertBefore(InsertPos: F->getEntryBlock().getTerminator()->getIterator());
6371 continue;
6372 }
     // Record operands whose only user is I and that would be trivially dead
     // once I is gone; they are deleted recursively below.
6373 for (Use &U : I->operands()) {
6374 auto *Op = dyn_cast<Instruction>(Val: U.get());
6375 if (Op && !DeletedInstructions.count(V: Op) && Op->hasOneUser() &&
6376 wouldInstructionBeTriviallyDead(I: Op, TLI))
6377 DeadInsts.emplace_back(Args&: Op);
6378 }
     // Break use-def links now so the erase loop below never sees a deleted
     // instruction that still has users among other deleted instructions.
6379 I->dropAllReferences();
6380 }
6381 for (auto *I : DeletedInstructions) {
6382 assert(I->use_empty() &&
6383 "trying to erase instruction with users.");
6384 I->eraseFromParent();
6385 }
6386
6387 // Cleanup any dead scalar code feeding the vectorized instructions
6388 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6389
6390#ifdef EXPENSIVE_CHECKS
6391 // If we could guarantee that this call is not extremely slow, we could
6392 // remove the ifdef limitation (see PR47712).
6393 assert(!verifyFunction(*F, &dbgs()));
6394#endif
6395}
6396
6397/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6398/// contains original mask for the scalars reused in the node. Procedure
6399/// transform this mask in accordance with the given \p Mask.
6400static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6401 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6402 "Expected non-empty mask.");
6403 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6404 Prev.swap(RHS&: Reuses);
6405 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6406 if (Mask[I] != PoisonMaskElem)
6407 Reuses[Mask[I]] = Prev[I];
6408}
6409
6410/// Reorders the given \p Order according to the given \p Mask. \p Order - is
6411/// the original order of the scalars. Procedure transforms the provided order
6412/// in accordance with the given \p Mask. If the resulting \p Order is just an
6413/// identity order, \p Order is cleared.
6414static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6415 bool BottomOrder = false) {
6416 assert(!Mask.empty() && "Expected non-empty mask.");
6417 unsigned Sz = Mask.size();
6418 if (BottomOrder) {
     // Bottom-up variant: compose the previous order with Mask directly.
     // An empty Order is treated as the identity permutation.
6419 SmallVector<unsigned> PrevOrder;
6420 if (Order.empty()) {
6421 PrevOrder.resize(N: Sz);
6422 std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
6423 } else {
6424 PrevOrder.swap(RHS&: Order);
6425 }
     // Sz acts as the "undefined position" placeholder below.
6426 Order.assign(NumElts: Sz, Elt: Sz);
6427 for (unsigned I = 0; I < Sz; ++I)
6428 if (Mask[I] != PoisonMaskElem)
6429 Order[I] = PrevOrder[Mask[I]];
     // If the composed order is the identity (modulo undefined slots), drop it.
6430 if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
6431 return Data.value() == Sz || Data.index() == Data.value();
6432 })) {
6433 Order.clear();
6434 return;
6435 }
6436 fixupOrderingIndices(Order);
6437 return;
6438 }
     // Top-down variant: work on the inverse permutation, reorder it with Mask,
     // then invert back into Order.
6439 SmallVector<int> MaskOrder;
6440 if (Order.empty()) {
6441 MaskOrder.resize(N: Sz);
6442 std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
6443 } else {
6444 inversePermutation(Indices: Order, Mask&: MaskOrder);
6445 }
6446 reorderReuses(Reuses&: MaskOrder, Mask);
6447 if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
6448 Order.clear();
6449 return;
6450 }
6451 Order.assign(NumElts: Sz, Elt: Sz);
6452 for (unsigned I = 0; I < Sz; ++I)
6453 if (MaskOrder[I] != PoisonMaskElem)
6454 Order[MaskOrder[I]] = I;
6455 fixupOrderingIndices(Order);
6456}
6457
// Tries to derive an ordering for a gather node from already-vectorized tree
// entries / extractelement sources it can reuse, so the gather becomes a
// (cheaper) shuffle of existing vectors. Returns std::nullopt when no useful
// reuse-based order exists.
6458std::optional<BoUpSLP::OrdersType>
6459BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6460 bool TopToBottom, bool IgnoreReorder) {
6461 assert(TE.isGather() && "Expected gather node only.");
6462 // Try to find subvector extract/insert patterns and reorder only such
6463 // patterns.
6464 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6465 Type *ScalarTy = GatheredScalars.front()->getType();
6466 size_t NumScalars = GatheredScalars.size();
6467 if (!isValidElementType(Ty: ScalarTy))
6468 return std::nullopt;
6469 auto *VecTy = getWidenedType(ScalarTy, VF: NumScalars);
6470 unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: NumScalars);
6471 SmallVector<int> ExtractMask;
6472 SmallVector<int> Mask;
6473 SmallVector<SmallVector<const TreeEntry *>> Entries;
     // Probe both reuse sources: extractelement operands and matching
     // already-built tree entries.
6474 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
6475 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
6476 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
6477 isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
6478 /*ForOrder=*/true);
6479 // No shuffled operands - ignore.
6480 if (GatherShuffles.empty() && ExtractShuffles.empty())
6481 return std::nullopt;
     // CurrentOrder[I] == NumScalars marks a not-yet-assigned (undef) position.
6482 OrdersType CurrentOrder(NumScalars, NumScalars);
6483 if (GatherShuffles.size() == 1 &&
6484 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6485 Entries.front().front()->isSame(VL: TE.Scalars)) {
6486 // If the full matched node in whole tree rotation - no need to consider the
6487 // matching order, rotating the whole tree.
6488 if (TopToBottom)
6489 return std::nullopt;
6490 // No need to keep the order for the same user node.
6491 if (Entries.front().front()->UserTreeIndex.UserTE ==
6492 TE.UserTreeIndex.UserTE)
6493 return std::nullopt;
6494 // No need to keep the order for the matched root node, if it can be freely
6495 // reordered.
6496 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6497 return std::nullopt;
6498 // If shuffling 2 elements only and the matching node has reverse reuses -
6499 // no need to count order, both work fine.
6500 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6501 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6502 any_of(Range: enumerate(First: Entries.front().front()->ReuseShuffleIndices),
6503 P: [](const auto &P) {
6504 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6505 }))
6506 return std::nullopt;
6507
6508 // Perfect match in the graph, will reuse the previously vectorized
6509 // node. Cost is 0.
6510 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
6511 return CurrentOrder;
6512 }
     // True if all non-poison elements of Mask point at the same source lane.
6513 auto IsSplatMask = [](ArrayRef<int> Mask) {
6514 int SingleElt = PoisonMaskElem;
6515 return all_of(Range&: Mask, P: [&](int I) {
6516 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6517 SingleElt = I;
6518 return I == PoisonMaskElem || I == SingleElt;
6519 });
6520 };
6521 // Exclusive broadcast mask - ignore.
6522 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6523 (Entries.size() != 1 ||
6524 Entries.front().front()->ReorderIndices.empty())) ||
6525 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6526 return std::nullopt;
     // Per vector-register part, translate the shuffle mask into ordering
     // assignments. Parts that turn out to need >= 2 source vectors are marked
     // in ShuffledSubMasks and their slice of CurrentOrder is reset to "undef".
6527 SmallBitVector ShuffledSubMasks(NumParts);
6528 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6529 ArrayRef<int> Mask, int PartSz, int NumParts,
6530 function_ref<unsigned(unsigned)> GetVF) {
6531 for (int I : seq<int>(Begin: 0, End: NumParts)) {
6532 if (ShuffledSubMasks.test(Idx: I))
6533 continue;
6534 const int VF = GetVF(I);
6535 if (VF == 0)
6536 continue;
6537 unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
6538 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
6539 // Shuffle of at least 2 vectors - ignore.
6540 if (any_of(Range&: Slice, P: not_equal_to(Arg&: NumScalars))) {
6541 llvm::fill(Range&: Slice, Value&: NumScalars);
6542 ShuffledSubMasks.set(I);
6543 continue;
6544 }
6545 // Try to include as much elements from the mask as possible.
6546 int FirstMin = INT_MAX;
6547 int SecondVecFound = false;
6548 for (int K : seq<int>(Size: Limit)) {
6549 int Idx = Mask[I * PartSz + K];
6550 if (Idx == PoisonMaskElem) {
6551 Value *V = GatheredScalars[I * PartSz + K];
6552 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
6553 SecondVecFound = true;
6554 break;
6555 }
6556 continue;
6557 }
6558 if (Idx < VF) {
6559 if (FirstMin > Idx)
6560 FirstMin = Idx;
6561 } else {
6562 SecondVecFound = true;
6563 break;
6564 }
6565 }
     // Rebase indices to the start of the part containing the minimum.
6566 FirstMin = (FirstMin / PartSz) * PartSz;
6567 // Shuffle of at least 2 vectors - ignore.
6568 if (SecondVecFound) {
6569 llvm::fill(Range&: Slice, Value&: NumScalars);
6570 ShuffledSubMasks.set(I);
6571 continue;
6572 }
6573 for (int K : seq<int>(Size: Limit)) {
6574 int Idx = Mask[I * PartSz + K];
6575 if (Idx == PoisonMaskElem)
6576 continue;
6577 Idx -= FirstMin;
6578 if (Idx >= PartSz) {
6579 SecondVecFound = true;
6580 break;
6581 }
6582 if (CurrentOrder[I * PartSz + Idx] >
6583 static_cast<unsigned>(I * PartSz + K) &&
6584 CurrentOrder[I * PartSz + Idx] !=
6585 static_cast<unsigned>(I * PartSz + Idx))
6586 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6587 }
6588 // Shuffle of at least 2 vectors - ignore.
6589 if (SecondVecFound) {
6590 llvm::fill(Range&: Slice, Value&: NumScalars);
6591 ShuffledSubMasks.set(I);
6592 continue;
6593 }
6594 }
6595 };
6596 int PartSz = getPartNumElems(Size: NumScalars, NumParts);
6597 if (!ExtractShuffles.empty())
6598 TransformMaskToOrder(
6599 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6600 if (!ExtractShuffles[I])
6601 return 0U;
     // VF of an extract part = widest source vector among its
     // extractelement scalars (after undoing reuse/reorder remapping).
6602 unsigned VF = 0;
6603 unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
6604 for (unsigned Idx : seq<unsigned>(Size: Sz)) {
6605 int K = I * PartSz + Idx;
6606 if (ExtractMask[K] == PoisonMaskElem)
6607 continue;
6608 if (!TE.ReuseShuffleIndices.empty())
6609 K = TE.ReuseShuffleIndices[K];
6610 if (K == PoisonMaskElem)
6611 continue;
6612 if (!TE.ReorderIndices.empty())
6613 K = std::distance(first: TE.ReorderIndices.begin(),
6614 last: find(Range: TE.ReorderIndices, Val: K));
6615 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
6616 if (!EI)
6617 continue;
6618 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
6619 ->getElementCount()
6620 .getKnownMinValue());
6621 }
6622 return VF;
6623 });
6624 // Check special corner case - single shuffle of the same entry.
6625 if (GatherShuffles.size() == 1 && NumParts != 1) {
6626 if (ShuffledSubMasks.any())
6627 return std::nullopt;
6628 PartSz = NumScalars;
6629 NumParts = 1;
6630 }
6631 if (!Entries.empty())
6632 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6633 if (!GatherShuffles[I])
6634 return 0U;
6635 return std::max(a: Entries[I].front()->getVectorFactor(),
6636 b: Entries[I].back()->getVectorFactor());
6637 });
     // Reject when everything was shuffled away or the order is mostly undef
     // (entries still equal to the NumScalars placeholder).
6638 unsigned NumUndefs = count(Range&: CurrentOrder, Element: NumScalars);
6639 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6640 return std::nullopt;
6641 return std::move(CurrentOrder);
6642}
6643
// Returns true if two pointers may be treated as compatible bases for
// vectorized memory access analysis: same underlying object, and each is
// either not a GEP or a simple single-index GEP whose indices are constants
// or share the same opcode (unless \p CompareOpcodes disables that check).
6644static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6645 const TargetLibraryInfo &TLI,
6646 bool CompareOpcodes = true) {
6647 if (getUnderlyingObject(V: Ptr1, MaxLookup: RecursionMaxDepth) !=
6648 getUnderlyingObject(V: Ptr2, MaxLookup: RecursionMaxDepth))
6649 return false;
6650 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
6651 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
     // getNumOperands() == 2 means pointer operand + exactly one index.
6652 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6653 (!GEP2 || GEP2->getNumOperands() == 2) &&
6654 (((!GEP1 || isConstant(V: GEP1->getOperand(i_nocapture: 1))) &&
6655 (!GEP2 || isConstant(V: GEP2->getOperand(i_nocapture: 1)))) ||
6656 !CompareOpcodes ||
6657 (GEP1 && GEP2 &&
6658 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)));
6659}
6660
6661/// Calculates minimal alignment as a common alignment.
     /// \tparam T instruction type (e.g. LoadInst/StoreInst) providing getAlign().
     /// Takes the minimum of getAlign() over all values in \p VL.
     /// NOTE(review): consume_front() appears to yield the first element while
     /// advancing VL, so the loop covers the remaining values - confirm against
     /// the ArrayRef API in this tree.
6662template <typename T>
6663static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6664 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6665 for (Value *V : VL)
6666 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6667 return CommonAlignment;
6668}
6669
6670/// Check if \p Order represents reverse order.
6671static bool isReverseOrder(ArrayRef<unsigned> Order) {
6672 assert(!Order.empty() &&
6673 "Order is empty. Please check it before using isReverseOrder.");
6674 unsigned Sz = Order.size();
6675 return all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
6676 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6677 });
6678}
6679
6680/// Checks if the provided list of pointers \p Pointers represents the strided
6681/// pointers for type ElemTy. If they are not, nullptr is returned.
6682/// Otherwise, SCEV* of the stride value is returned.
6683/// If `PointerOps` can be rearanged into the following sequence:
6684/// ```
6685/// %x + c_0 * stride,
6686/// %x + c_1 * stride,
6687/// %x + c_2 * stride
6688/// ...
6689/// ```
6690/// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
6691/// and the SCEV of the `stride` will be returned.
6692static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6693 const DataLayout &DL, ScalarEvolution &SE,
6694 SmallVectorImpl<unsigned> &SortedIndices,
6695 SmallVectorImpl<int64_t> &Coeffs) {
6696 assert(Coeffs.size() == PointerOps.size() &&
6697 "Coeffs vector needs to be of correct size");
6698 SmallVector<const SCEV *> SCEVs;
6699 const SCEV *PtrSCEVLowest = nullptr;
6700 const SCEV *PtrSCEVHighest = nullptr;
6701 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6702 // addresses).
6703 for (Value *Ptr : PointerOps) {
6704 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
6705 if (!PtrSCEV)
6706 return nullptr;
6707 SCEVs.push_back(Elt: PtrSCEV);
6708 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6709 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6710 continue;
6711 }
6712 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
6713 if (isa<SCEVCouldNotCompute>(Val: Diff))
6714 return nullptr;
6715 if (Diff->isNonConstantNegative()) {
6716 PtrSCEVLowest = PtrSCEV;
6717 continue;
6718 }
6719 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
6720 if (isa<SCEVCouldNotCompute>(Val: Diff1))
6721 return nullptr;
6722 if (Diff1->isNonConstantNegative()) {
6723 PtrSCEVHighest = PtrSCEV;
6724 continue;
6725 }
6726 }
6727 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6728 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
6729 if (isa<SCEVCouldNotCompute>(Val: Dist))
6730 return nullptr;
6731 int Size = DL.getTypeStoreSize(Ty: ElemTy);
     // Given Dist and a Multiplier, tries to factor Dist as
     // Multiplier * X and returns X; nullptr if no exact factorization exists.
6732 auto TryGetStride = [&](const SCEV *Dist,
6733 const SCEV *Multiplier) -> const SCEV * {
6734 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
6735 if (M->getOperand(i: 0) == Multiplier)
6736 return M->getOperand(i: 1);
6737 if (M->getOperand(i: 1) == Multiplier)
6738 return M->getOperand(i: 0);
6739 return nullptr;
6740 }
6741 if (Multiplier == Dist)
6742 return SE.getConstant(Ty: Dist->getType(), V: 1);
6743 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
6744 };
6745 // Stride_in_elements = Dist / element_size * (num_elems - 1).
6746 const SCEV *Stride = nullptr;
6747 if (Size != 1 || SCEVs.size() > 2) {
6748 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
6749 Stride = TryGetStride(Dist, Sz);
6750 if (!Stride)
6751 return nullptr;
6752 }
     // Only a non-constant (run-time) stride is interesting here; constant
     // strides are handled by other paths. Note: with Size == 1 and just two
     // pointers, Stride stays null and we bail out.
6753 if (!Stride || isa<SCEVConstant>(Val: Stride))
6754 return nullptr;
6755 // Iterate through all pointers and check if all distances are
6756 // unique multiple of Stride.
6757 using DistOrdPair = std::pair<int64_t, int>;
6758 auto Compare = llvm::less_first();
6759 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6760 int Cnt = 0;
6761 bool IsConsecutive = true;
6762 for (const auto [Idx, PtrSCEV] : enumerate(First&: SCEVs)) {
6763 unsigned Dist = 0;
6764 if (PtrSCEV != PtrSCEVLowest) {
6765 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
6766 const SCEV *Coeff = TryGetStride(Diff, Stride);
6767 if (!Coeff)
6768 return nullptr;
6769 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
6770 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
6771 return nullptr;
     // Verify the factorization exactly: Ptr == Lowest + Stride * Coeff.
6772 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6773 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
6774 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
6775 ->isZero())
6776 return nullptr;
6777 Dist = SC->getAPInt().getZExtValue();
6778 } else {
6779 Coeffs[Idx] = 0;
6780 }
6781 // If the strides are not the same or repeated, we can't vectorize.
6782 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6783 return nullptr;
6784 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
6785 if (!Res.second)
6786 return nullptr;
6787 // Consecutive order if the inserted element is the last one.
6788 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
6789 ++Cnt;
6790 }
6791 if (Offsets.size() != SCEVs.size())
6792 return nullptr;
6793 SortedIndices.clear();
6794 if (!IsConsecutive) {
6795 // Fill SortedIndices array only if it is non-consecutive.
6796 SortedIndices.resize(N: PointerOps.size());
6797 Cnt = 0;
6798 for (const std::pair<int64_t, int> &Pair : Offsets) {
6799 SortedIndices[Cnt] = Pair.second;
6800 ++Cnt;
6801 }
6802 }
6803 return Stride;
6804}
6805
6806static std::pair<InstructionCost, InstructionCost>
6807getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6808 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6809 Type *ScalarTy, VectorType *VecTy);
6810
6811/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6812/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
6813/// subvector pattern.
6814static InstructionCost
6815getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6816 VectorType *Tp, ArrayRef<int> Mask = {},
6817 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6818 int Index = 0, VectorType *SubTp = nullptr,
6819 ArrayRef<const Value *> Args = {}) {
     // Destination type follows the mask width when a mask is supplied.
6820 VectorType *DstTy = Tp;
6821 if (!Mask.empty())
6822 DstTy = FixedVectorType::get(ElementType: Tp->getScalarType(), NumElts: Mask.size());
6823
6824 if (Kind != TTI::SK_PermuteTwoSrc)
6825 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
6826 Args);
     // For two-source permutes, detect masks that are really an insert-subvector
     // and cost them as SK_InsertSubvector, which targets model more cheaply.
6827 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6828 int NumSubElts;
6829 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
6830 Mask, NumSrcElts, NumSubElts, Index)) {
6831 if (Index + NumSubElts > NumSrcElts &&
6832 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6833 return TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy, SrcTy: Tp, Mask,
6834 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
6835 }
6836 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
6837 Args);
6838}
6839
6840/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6841/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6842/// instead of a scalar.
6843static InstructionCost
6844getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6845 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6846 bool Extract, TTI::TargetCostKind CostKind,
6847 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6848 assert(!isa<ScalableVectorType>(Ty) &&
6849 "ScalableVectorType is not supported.");
6850 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6851 getNumElements(Ty) &&
6852 "Incorrect usage.");
6853 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
6854 assert(SLPReVec && "Only supported by REVEC.");
6855 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6856 // of CreateInsertElement.
     // REVEC path: each demanded "element" is itself a small vector, so charge
     // subvector insert/extract shuffles at the element's subvector offset.
6857 unsigned ScalarTyNumElements = VecTy->getNumElements();
6858 InstructionCost Cost = 0;
6859 for (unsigned I : seq(Size: DemandedElts.getBitWidth())) {
6860 if (!DemandedElts[I])
6861 continue;
6862 if (Insert)
6863 Cost += getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: Ty, Mask: {}, CostKind,
6864 Index: I * ScalarTyNumElements, SubTp: VecTy);
6865 if (Extract)
6866 Cost += getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: Ty, Mask: {}, CostKind,
6867 Index: I * ScalarTyNumElements, SubTp: VecTy);
6868 }
6869 return Cost;
6870 }
     // Plain scalar element type: defer to the generic TTI estimate.
6871 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6872 CostKind, ForPoisonSrc, VL);
6873}
6874
6875/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6876/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6877static InstructionCost getVectorInstrCost(
6878 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6879 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6880 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6881 if (Opcode == Instruction::ExtractElement) {
6882 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
6883 assert(SLPReVec && "Only supported by REVEC.");
6884 assert(isa<VectorType>(Val) && "Val must be a vector type.");
     // REVEC path: extracting a whole subvector, charged as an
     // SK_ExtractSubvector shuffle at the element's subvector offset.
6885 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
6886 Tp: cast<VectorType>(Val), Mask: {}, CostKind,
6887 Index: Index * VecTy->getNumElements(), SubTp: VecTy);
6888 }
6889 }
     // Everything else: generic TTI per-element instruction cost.
6890 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6891 ScalarUserAndIdx);
6892}
6893
6894/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6895/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6896static InstructionCost getExtractWithExtendCost(
6897 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6898 VectorType *VecTy, unsigned Index,
6899 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
6900 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Val: Dst)) {
6901 assert(SLPReVec && "Only supported by REVEC.");
     // REVEC path: cost = subvector extract shuffle + cast of the extracted
     // subvector to the destination type.
6902 auto *SubTp =
6903 getWidenedType(ScalarTy: VecTy->getElementType(), VF: ScalarTy->getNumElements());
6904 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: VecTy, Mask: {}, CostKind,
6905 Index: Index * ScalarTy->getNumElements(), SubTp) +
6906 TTI.getCastInstrCost(Opcode, Dst, Src: SubTp, CCH: TTI::CastContextHint::None,
6907 CostKind);
6908 }
6909 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6910}
6911
6912/// Creates subvector insert. Generates shuffle using \p Generator or
6913/// using default shuffle.
     /// Inserts \p V (subvector) into \p Vec starting at element \p Index and
     /// returns the resulting value. When \p Generator is provided it receives
     /// (Vec, V, Mask) and is responsible for emitting the combining shuffle.
6914static Value *createInsertVector(
6915 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6916 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
     // Inserting poison into poison is a no-op.
6917 if (isa<PoisonValue>(Val: Vec) && isa<PoisonValue>(Val: V))
6918 return Vec;
6919 const unsigned SubVecVF = getNumElements(Ty: V->getType());
6920 // Create shuffle, insertvector requires that index is multiple of
6921 // the subvector length.
6922 const unsigned VecVF = getNumElements(Ty: Vec->getType());
6923 SmallVector<int> Mask(VecVF, PoisonMaskElem);
     // Destination is poison: a single one-source shuffle of V suffices.
6924 if (isa<PoisonValue>(Val: Vec)) {
6925 auto *Begin = std::next(x: Mask.begin(), n: Index);
6926 std::iota(first: Begin, last: std::next(x: Begin, n: SubVecVF), value: 0);
6927 Vec = Builder.CreateShuffleVector(V, Mask);
6928 return Vec;
6929 }
     // Two-source mask: identity over Vec, with [Index, Index + SubVecVF)
     // taken from V (indices VecVF..VecVF + SubVecVF - 1 address V's lanes).
6930 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
6931 std::iota(first: std::next(x: Mask.begin(), n: Index),
6932 last: std::next(x: Mask.begin(), n: Index + SubVecVF), value: VecVF);
6933 if (Generator)
6934 return Generator(Vec, V, Mask);
6935 // 1. Resize V to the size of Vec.
6936 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6937 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: SubVecVF), value: 0);
6938 V = Builder.CreateShuffleVector(V, Mask: ResizeMask);
6939 // 2. Insert V into Vec.
6940 return Builder.CreateShuffleVector(V1: Vec, V2: V, Mask);
6941}
6942
6943/// Generates subvector extract using \p Generator or using default shuffle.
6944static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6945 unsigned SubVecVF, unsigned Index) {
6946 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6947 std::iota(first: Mask.begin(), last: Mask.end(), value: Index);
6948 return Builder.CreateShuffleVector(V: Vec, Mask);
6949}
6950
6951/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6952/// with \p Order.
6953/// \return true if the mask represents strided access, false - otherwise.
     /// CompressMask[I] becomes the element distance (in ScalarTy units) of the
     /// I-th (ordered) pointer from the first ordered pointer.
6954static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6955 ArrayRef<unsigned> Order, Type *ScalarTy,
6956 const DataLayout &DL, ScalarEvolution &SE,
6957 SmallVectorImpl<int> &CompressMask) {
6958 const unsigned Sz = PointerOps.size();
6959 CompressMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
6960 // The first element always set.
6961 CompressMask[0] = 0;
6962 // Check if the mask represents strided access.
     // Tri-state: engaged 0 = stride not yet determined, engaged non-zero =
     // candidate stride, disengaged (reset) = proven non-strided. The loop
     // keeps filling CompressMask even after the stride check fails.
6963 std::optional<unsigned> Stride = 0;
6964 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6965 for (unsigned I : seq<unsigned>(Begin: 1, End: Sz)) {
6966 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6967 std::optional<int64_t> OptPos =
6968 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
6969 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6970 return false;
6971 unsigned Pos = static_cast<unsigned>(*OptPos);
6972 CompressMask[I] = Pos;
6973 if (!Stride)
6974 continue;
6975 if (*Stride == 0) {
     // First non-zero distance fixes the candidate stride.
6976 *Stride = Pos;
6977 continue;
6978 }
     // Strided access requires Pos == Stride * I for every position.
6979 if (Pos != *Stride * I)
6980 Stride.reset();
6981 }
6982 return Stride.has_value();
6983}
6984
/// Checks if the \p VL can be transformed to a (masked)load + compress or
/// (masked) interleaved load.
///
/// On success the out-parameters describe the selected form:
/// \param AreAllUsersVectorized returns true if every user of the given
///        scalar is part of the vectorized graph (no extract needed).
/// \param IsMasked set to true if the wide load must be emitted as a masked
///        load (reading the full span unconditionally is not provably safe).
/// \param InterleaveFactor non-zero iff an interleaved load was selected, in
///        which case it holds the interleave factor.
/// \param CompressMask shuffle mask compressing the wide load down to the
///        requested elements (composed with \p Order when it is non-empty).
/// \param LoadVecTy type of the wide vector load covering the whole span.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const size_t Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Indices: Order, Mask);
  // Check external uses. Each scalar with users outside the graph would need
  // an extractelement; if that extract is not strictly cheaper than keeping
  // the scalar load, the transformation cannot pay off.
  for (const auto [I, V] : enumerate(First&: VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    InstructionCost ExtractCost =
        TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
                               Index: Mask.empty() ? I : Mask[I]);
    InstructionCost ScalarCost =
        TTI.getInstructionCost(U: cast<Instruction>(Val: V), CostKind);
    if (ExtractCost <= ScalarCost)
      return false;
  }
  // First and last pointers in sorted order delimit the loaded span.
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL, SE);
  if (!Diff)
    return false;
  const size_t MaxRegSize =
      TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between elements.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  // The wide load covers the inclusive range [Ptr0, PtrN], i.e. *Diff + 1
  // elements.
  LoadVecTy = getWidenedType(ScalarTy, VF: *Diff + 1);
  auto *LI = cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]);
  Align CommonAlignment = LI->getAlign();
  // If the whole span cannot be proven dereferenceable, a masked load is
  // required instead of a plain wide load.
  IsMasked = !isSafeToLoadUnconditionally(
      V: Ptr0, Ty: LoadVecTy, Alignment: CommonAlignment, DL,
      ScanFrom: cast<LoadInst>(Val: Order.empty() ? VL.back() : VL[Order.back()]), AC: &AC, DT: &DT,
      TLI: &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(DataType: LoadVecTy, Alignment: CommonAlignment,
                                         AddressSpace: LI->getPointerAddressSpace()))
    return false;
  // TODO: perform the analysis of each scalar load for better
  // safe-load-unconditionally analysis.
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  SmallVector<Value *> OrderedPointerOps(PointerOps);
  if (!Order.empty())
    reorderScalars(Scalars&: OrderedPointerOps, Mask);
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, Ptrs: OrderedPointerOps, BasePtr: OrderedPointerOps.front(),
                  Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy: LoadVecTy);
  // The cost of scalar loads.
  InstructionCost ScalarLoadsCost =
      std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
                      binary_op: [&](InstructionCost C, Value *V) {
                        return C + TTI.getInstructionCost(U: cast<Instruction>(Val: V),
                                                          CostKind);
                      }) +
      ScalarGEPCost;
  // Baseline: keep the scalar loads and build the vector with inserts.
  APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
  InstructionCost GatherCost =
      getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
                               /*Insert=*/true,
                               /*Extract=*/false, CostKind) +
      ScalarLoadsCost;
  InstructionCost LoadCost = 0;
  if (IsMasked) {
    LoadCost = TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                    CommonAlignment,
                                    LI->getPointerAddressSpace()),
        CostKind);
  } else {
    LoadCost =
        TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
                            AddressSpace: LI->getPointerAddressSpace(), CostKind);
  }
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for potential segmented(interleaved) loads.
    // Prefer the type padded to a full register multiple, but only if the
    // padded span is also provably safe to load; otherwise fall back to the
    // exact span.
    VectorType *AlignedLoadVecTy = getWidenedType(
        ScalarTy, VF: getFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: *Diff + 1));
    if (!isSafeToLoadUnconditionally(V: Ptr0, Ty: AlignedLoadVecTy, Alignment: CommonAlignment,
                                     DL, ScanFrom: cast<LoadInst>(Val: VL.back()), AC: &AC, DT: &DT,
                                     TLI: &TLI))
      AlignedLoadVecTy = LoadVecTy;
    // For a strided compress mask, CompressMask[1] equals the constant
    // stride, i.e. the candidate interleave factor.
    if (TTI.isLegalInterleavedAccessType(VTy: AlignedLoadVecTy, Factor: CompressMask[1],
                                         Alignment: CommonAlignment,
                                         AddrSpace: LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost =
          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                              Opcode: Instruction::Load, VecTy: AlignedLoadVecTy,
                              Factor: CompressMask[1], Indices: {}, Alignment: CommonAlignment,
                              AddressSpace: LI->getPointerAddressSpace(), CostKind, UseMaskForCond: IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        return true;
      }
    }
  }
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: LoadVecTy, Mask: CompressMask, CostKind);
  // Fold the requested order into the compress mask so that a single shuffle
  // performs both the compression and the reordering.
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Size: Sz)) {
      NewMask[I] = CompressMask[Mask[I]];
    }
    CompressMask.swap(RHS&: NewMask);
  }
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}
7116
7117/// Checks if the \p VL can be transformed to a (masked)load + compress or
7118/// (masked) interleaved load.
7119static bool
7120isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
7121 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
7122 const DataLayout &DL, ScalarEvolution &SE,
7123 AssumptionCache &AC, const DominatorTree &DT,
7124 const TargetLibraryInfo &TLI,
7125 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7126 bool IsMasked;
7127 unsigned InterleaveFactor;
7128 SmallVector<int> CompressMask;
7129 VectorType *LoadVecTy;
7130 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7131 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7132 CompressMask, LoadVecTy);
7133}
7134
7135/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7136/// PointerOps:
7137/// 1. Target with strided load support is detected.
7138/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7139/// potential stride <= MaxProfitableLoadStride and the potential stride is
7140/// power-of-2 (to avoid perf regressions for the very small number of loads)
7141/// and max distance > number of loads, or potential stride is -1.
7142/// 3. The loads are ordered, or number of unordered loads <=
7143/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7144/// to avoid extra costs for very expensive shuffles).
7145/// 4. Any pointer operand is an instruction with the users outside of the
7146/// current graph (for masked gathers extra extractelement instructions
7147/// might be required).
7148bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
7149 Align Alignment, const int64_t Diff,
7150 const size_t Sz) const {
7151 if (Diff % (Sz - 1) != 0)
7152 return false;
7153
7154 // Try to generate strided load node.
7155 auto IsAnyPointerUsedOutGraph = any_of(Range&: PointerOps, P: [&](Value *V) {
7156 return isa<Instruction>(Val: V) && any_of(Range: V->users(), P: [&](User *U) {
7157 return !isVectorized(V: U) && !MustGather.contains(Ptr: U);
7158 });
7159 });
7160
7161 const uint64_t AbsoluteDiff = std::abs(i: Diff);
7162 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
7163 if (IsAnyPointerUsedOutGraph ||
7164 (AbsoluteDiff > Sz &&
7165 (Sz > MinProfitableStridedLoads ||
7166 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7167 AbsoluteDiff % Sz == 0 && has_single_bit(Value: AbsoluteDiff / Sz)))) ||
7168 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7169 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7170 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7171 return false;
7172 if (!TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment))
7173 return false;
7174 return true;
7175 }
7176 return false;
7177}
7178
/// Checks whether the pointers in \p PointerOps (sorted order given by
/// \p SortedIndices, total span \p Diff) form equally sized groups of
/// consecutive elements with a common constant stride between group starts,
/// i.e. can be lowered to a strided load. Groups larger than one element are
/// widened into a single larger integer element each. On success records the
/// constant stride and the (possibly widened) vector type in \p SPtrInfo.
bool BoUpSLP::analyzeConstantStrideCandidate(
    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
  const size_t Sz = PointerOps.size();
  SmallVector<int64_t> SortedOffsetsFromBase(Sz);
  // Go through `PointerOps` in sorted order and record offsets from
  // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
  // sortPtrAccesses only validates getPointersDiff for pairs relative to
  // PointerOps[0]. This is safe since only offset differences are used below.
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *Ptr =
        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
    std::optional<int64_t> Offset =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
    assert(Offset && "sortPtrAccesses should have validated this pointer");
    SortedOffsetsFromBase[I] = *Offset;
  }

  // The code below checks that `SortedOffsetsFromBase` looks as follows:
  // ```
  // [
  //   (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
  //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // secon group
  //   ...
  //   (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
  //   GroupSize - 1}), // last group
  // ]
  // ```
  // The distance between consecutive elements within each group should all be
  // the same `StrideWithinGroup`. The distance between the first elements of
  // consecutive groups should all be the same `StrideBetweenGroups`.

  int64_t StrideWithinGroup =
      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
  // Determine size of the first group. Later we will check that all other
  // groups have the same size.
  auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
    return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
           StrideWithinGroup;
  };
  auto Indices = seq<unsigned>(Begin: 1, End: Sz);
  auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
  // If no break in the stride pattern is found, the whole bundle is a single
  // group.
  unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;

  unsigned VecSz = Sz;
  Type *NewScalarTy = ScalarTy;

  // Quick detour: at this point we can say what the type of strided load would
  // be if all the checks pass. Check if this type is legal for the target.
  bool NeedsWidening = Sz != GroupSize;
  if (NeedsWidening) {
    // Widening is only done when the groups evenly partition the bundle and
    // each group consists of adjacent elements (unit stride within a group),
    // so that a whole group can be loaded as one wider integer element.
    if (Sz % GroupSize != 0)
      return false;

    if (StrideWithinGroup != 1)
      return false;
    VecSz = Sz / GroupSize;
    NewScalarTy = Type::getIntNTy(
        C&: SE->getContext(),
        N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * GroupSize);
  }

  // Legality/profitability check for the candidate strided load.
  if (!isStridedLoad(PointerOps, ScalarTy: NewScalarTy, Alignment, Diff, Sz: VecSz))
    return false;

  // Without widening the stride is simply the element-to-element distance;
  // with widening it is replaced by the group-to-group distance below.
  int64_t StrideIntVal = StrideWithinGroup;
  if (NeedsWidening) {
    // Continue with checking the "shape" of `SortedOffsetsFromBase`.
    // Check that the strides between groups are all the same.
    unsigned CurrentGroupStartIdx = GroupSize;
    int64_t StrideBetweenGroups =
        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
    StrideIntVal = StrideBetweenGroups;
    for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
              SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
          StrideBetweenGroups)
        return false;
    }

    // Verify that the group starting at StartIdx runs for exactly GroupSize
    // elements before the stride pattern breaks.
    auto CheckGroup = [=](const unsigned StartIdx) -> bool {
      auto Indices = seq<unsigned>(Begin: StartIdx + 1, End: Sz);
      auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
      unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
      return GroupEndIdx - StartIdx == GroupSize;
    };
    for (unsigned I = 0; I < Sz; I += GroupSize) {
      if (!CheckGroup(I))
        return false;
    }
  }

  // Record the result: stride as a constant of the pointer index type, plus
  // the (possibly widened) vector type.
  Type *StrideTy = DL->getIndexType(PtrTy: Ptr0->getType());
  SPtrInfo.StrideVal = ConstantInt::getSigned(Ty: StrideTy, V: StrideIntVal);
  SPtrInfo.Ty = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
  return true;
}
7277
/// Checks whether \p PointerOps can be lowered to a strided load whose stride
/// is only known at run time (an SCEV). Pointers are partitioned into groups
/// by their constant SCEV offset; every group must have the same run-time
/// stride and the same multiset of coefficients as the first group. On
/// success fills \p SortedIndices with the interleaved sorted order and
/// records the stride SCEV and the widened vector type in \p SPtrInfo.
bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ScalarTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
                                       StridedPtrInfo &SPtrInfo) const {
  // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
  // is constant, we partition `PointerOps` sequence into subsequences of
  // pointers with the same offset. For each offset we record values from
  // `PointerOps` and their indicies in `PointerOps`.
  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
      OffsetToPointerOpIdxMap;
  for (auto [Idx, Ptr] : enumerate(First&: PointerOps)) {
    const SCEV *PtrSCEV = SE->getSCEV(V: Ptr);
    if (!PtrSCEV)
      return false;

    // Extract the constant addend (if any) of the pointer SCEV; pointers
    // without a constant addend are grouped under offset 0.
    const auto *Add = dyn_cast<SCEVAddExpr>(Val: PtrSCEV);
    int64_t Offset = 0;
    if (Add) {
      // `Offset` is non-zero.
      for (int I : seq<int>(Size: Add->getNumOperands())) {
        const auto *SC = dyn_cast<SCEVConstant>(Val: Add->getOperand(i: I));
        if (!SC)
          continue;
        Offset = SC->getAPInt().getSExtValue();
        // NOTE(review): rejects constants at INT64_MAX - 1 and above -
        // presumably a guard against overflow in later offset arithmetic;
        // confirm the exact rationale.
        if (Offset >= std::numeric_limits<int64_t>::max() - 1) {
          Offset = 0;
          continue;
        }
        break;
      }
    }
    OffsetToPointerOpIdxMap[Offset].first.push_back(Elt: Ptr);
    OffsetToPointerOpIdxMap[Offset].second.push_back(Elt: Idx);
  }
  unsigned NumOffsets = OffsetToPointerOpIdxMap.size();

  // Quick detour: at this point we can say what the type of strided load would
  // be if all the checks pass. Check if this type is legal for the target.
  const unsigned Sz = PointerOps.size();
  unsigned VecSz = Sz;
  Type *NewScalarTy = ScalarTy;
  if (NumOffsets > 1) {
    // Each group of NumOffsets adjacent elements is re-interpreted as one
    // wider integer element; the groups must evenly partition the bundle.
    if (Sz % NumOffsets != 0)
      return false;
    VecSz = Sz / NumOffsets;
    NewScalarTy = Type::getIntNTy(
        C&: SE->getContext(),
        N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * NumOffsets);
  }
  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
  if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(Ty: StridedLoadTy) ||
      !TTI->isLegalStridedLoadStore(DataType: StridedLoadTy, Alignment: CommonAlignment))
    return false;

  // Check if the offsets are contiguous and that each group has the required
  // size.
  SmallVector<int64_t> SortedOffsetsV(NumOffsets);
  for (auto [Idx, MapPair] : enumerate(First&: OffsetToPointerOpIdxMap)) {
    if (MapPair.second.first.size() != VecSz)
      return false;
    SortedOffsetsV[Idx] = MapPair.first;
  }
  sort(C&: SortedOffsetsV);

  if (NumOffsets > 1) {
    for (int I : seq<int>(Begin: 1, End: SortedOffsetsV.size())) {
      if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
        return false;
    }
  }

  // Introduce some notation for the explanations below. Let `PointerOps_j`
  // denote the subsequence of `PointerOps` with offsets equal to
  // `SortedOffsetsV[j]`. Let `SortedIndices_j` be a such that the sequence
  // ```
  // PointerOps_j[SortedIndices_j[0]],
  // PointerOps_j[SortedIndices_j[1]],
  // PointerOps_j[SortedIndices_j[2]],
  // ...
  // ```
  // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
  // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
  // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
  // The entire sorted `PointerOps` looks like this:
  // ```
  // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
  // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
  // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
  // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
  //
  // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
  // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
  // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
  // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
  //
  // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
  // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
  // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
  // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
  // ...
  // ...
  // ...
  // PointerOps_0[SortedIndices_0[VecSz - 1]] =
  // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
  // PointerOps_1[SortedIndices_1[VecSz - 1]] =
  // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
  // PointerOps_2[SortedIndices_2[VecSz - 1]] =
  // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
  // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
  // ```
  // In order to be able to generate a strided load, we need the following
  // checks to pass:
  //
  // (1) for each `PointerOps_j` check that the distance
  // between adjacent pointers are all equal to the same value (stride).
  // (2) for each `PointerOps_j` check that coefficients calculated by
  // `calculateRtStride` are all the same.
  //
  // As we do that, also calculate SortedIndices. Since we should not modify
  // `SortedIndices` unless we know that all the checks succeed, record the
  // indicies into `SortedIndicesDraft`.
  SmallVector<unsigned> SortedIndicesDraft(Sz);

  // Given sorted indices for a particular offset (as calculated by
  // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
  // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
  // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
  // \param `IndicesInAllPointerOps` vector of indices of the
  // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
  // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
  // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
  auto UpdateSortedIndices =
      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
          ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
        // An empty vector from calculateRtStride means identity order.
        if (SortedIndicesForOffset.empty()) {
          SortedIndicesForOffset.resize(N: IndicesInAllPointerOps.size());
          std::iota(first: SortedIndicesForOffset.begin(),
                    last: SortedIndicesForOffset.end(), value: 0);
        }
        for (const auto [Num, Idx] : enumerate(First&: SortedIndicesForOffset)) {
          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
              IndicesInAllPointerOps[Idx];
        }
      };

  // Use the group with the lowest offset to establish the reference stride
  // and coefficient multiset for all other groups.
  int64_t LowestOffset = SortedOffsetsV[0];
  ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;

  SmallVector<int64_t> Coeffs0(VecSz);
  SmallVector<unsigned> SortedIndicesForOffset0;
  const SCEV *Stride0 = calculateRtStride(PointerOps: PointerOps0, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
                                          SortedIndices&: SortedIndicesForOffset0, Coeffs&: Coeffs0);
  if (!Stride0)
    return false;
  unsigned NumCoeffs0 = Coeffs0.size();
  if (NumCoeffs0 * NumOffsets != Sz)
    return false;
  sort(C&: Coeffs0);

  ArrayRef<unsigned> IndicesInAllPointerOps0 =
      OffsetToPointerOpIdxMap[LowestOffset].second;
  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);

  // Now that we know what the common stride and coefficients has to be check
  // the remaining `PointerOps_j`.
  SmallVector<int64_t> Coeffs;
  SmallVector<unsigned> SortedIndicesForOffset;
  for (int J : seq<int>(Begin: 1, End: NumOffsets)) {
    Coeffs.clear();
    Coeffs.resize(N: VecSz);
    SortedIndicesForOffset.clear();

    int64_t Offset = SortedOffsetsV[J];
    ArrayRef<Value *> PointerOpsForOffset =
        OffsetToPointerOpIdxMap[Offset].first;
    ArrayRef<unsigned> IndicesInAllPointerOps =
        OffsetToPointerOpIdxMap[Offset].second;
    const SCEV *StrideWithinGroup =
        calculateRtStride(PointerOps: PointerOpsForOffset, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
                          SortedIndices&: SortedIndicesForOffset, Coeffs);

    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
      return false;
    if (Coeffs.size() != NumCoeffs0)
      return false;
    sort(C&: Coeffs);
    if (Coeffs != Coeffs0)
      return false;

    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
  }

  // All checks passed; publish the computed order and stride.
  SortedIndices.clear();
  SortedIndices = std::move(SortedIndicesDraft);
  SPtrInfo.StrideSCEV = Stride0;
  SPtrInfo.Ty = StridedLoadTy;
  return true;
}
7484
7485BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
7486 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7487 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7488 unsigned *BestVF, bool TryRecursiveCheck) const {
7489 // Check that a vectorized load would load the same memory as a scalar
7490 // load. For example, we don't want to vectorize loads that are smaller
7491 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
7492 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7493 // from such a struct, we read/write packed bits disagreeing with the
7494 // unvectorized version.
7495 if (BestVF)
7496 *BestVF = 0;
7497 if (areKnownNonVectorizableLoads(VL))
7498 return LoadsState::Gather;
7499 Type *ScalarTy = VL0->getType();
7500
7501 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
7502 return LoadsState::Gather;
7503
7504 // Make sure all loads in the bundle are simple - we can't vectorize
7505 // atomic or volatile loads.
7506 PointerOps.clear();
7507 const size_t Sz = VL.size();
7508 PointerOps.resize(N: Sz);
7509 auto *POIter = PointerOps.begin();
7510 for (Value *V : VL) {
7511 auto *L = dyn_cast<LoadInst>(Val: V);
7512 if (!L || !L->isSimple())
7513 return LoadsState::Gather;
7514 *POIter = L->getPointerOperand();
7515 ++POIter;
7516 }
7517
7518 Order.clear();
7519 // Check the order of pointer operands or that all pointers are the same.
7520 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
7521
7522 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
7523 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7524 if (!IsSorted) {
7525 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, SortedIndices&: Order,
7526 SPtrInfo))
7527 return LoadsState::StridedVectorize;
7528
7529 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
7530 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
7531 return LoadsState::Gather;
7532
7533 if (!all_of(Range&: PointerOps, P: [&](Value *P) {
7534 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
7535 }))
7536 return LoadsState::Gather;
7537
7538 } else {
7539 Value *Ptr0;
7540 Value *PtrN;
7541 if (Order.empty()) {
7542 Ptr0 = PointerOps.front();
7543 PtrN = PointerOps.back();
7544 } else {
7545 Ptr0 = PointerOps[Order.front()];
7546 PtrN = PointerOps[Order.back()];
7547 }
7548 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7549 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7550 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
7551 std::optional<int64_t> Diff0 =
7552 getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr0, DL: *DL, SE&: *SE);
7553 std::optional<int64_t> DiffN =
7554 getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
7555 assert(Diff0 && DiffN &&
7556 "sortPtrAccesses should have validated these pointers");
7557 int64_t Diff = *DiffN - *Diff0;
7558 // Check that the sorted loads are consecutive.
7559 if (static_cast<uint64_t>(Diff) == Sz - 1)
7560 return LoadsState::Vectorize;
7561 if (isMaskedLoadCompress(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
7562 TLI: *TLI, AreAllUsersVectorized: [&](Value *V) {
7563 return areAllUsersVectorized(
7564 I: cast<Instruction>(Val: V), VectorizedVals: UserIgnoreList);
7565 }))
7566 return LoadsState::CompressVectorize;
7567 Align Alignment =
7568 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
7569 ->getAlign();
7570 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, SortedIndices: Order,
7571 Diff, Ptr0, PtrN, SPtrInfo))
7572 return LoadsState::StridedVectorize;
7573 }
7574 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
7575 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
7576 return LoadsState::Gather;
7577 // Correctly identify compare the cost of loads + shuffles rather than
7578 // strided/masked gather loads. Returns true if vectorized + shuffles
7579 // representation is better than just gather.
7580 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7581 unsigned *BestVF,
7582 bool ProfitableGatherPointers) {
7583 if (BestVF)
7584 *BestVF = 0;
7585 // Compare masked gather cost and loads + insert subvector costs.
7586 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7587 auto [ScalarGEPCost, VectorGEPCost] =
7588 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(),
7589 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7590 // Estimate the cost of masked gather GEP. If not a splat, roughly
7591 // estimate as a buildvector, otherwise estimate as splat.
7592 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
7593 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7594 VectorType *PtrVecTy = getWidenedType(ScalarTy: PtrScalarTy, VF: Sz);
7595 if (static_cast<unsigned>(count_if(
7596 Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7597 any_of(Range&: PointerOps, P: [&](Value *V) {
7598 return getUnderlyingObject(V) !=
7599 getUnderlyingObject(V: PointerOps.front());
7600 }))
7601 VectorGEPCost += getScalarizationOverhead(TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy,
7602 DemandedElts, /*Insert=*/true,
7603 /*Extract=*/false, CostKind);
7604 else
7605 VectorGEPCost +=
7606 getScalarizationOverhead(
7607 TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, DemandedElts: APInt::getOneBitSet(numBits: Sz, BitNo: 0),
7608 /*Insert=*/true, /*Extract=*/false, CostKind) +
7609 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: PtrVecTy, Mask: {}, CostKind);
7610 // The cost of scalar loads.
7611 InstructionCost ScalarLoadsCost =
7612 std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
7613 binary_op: [&](InstructionCost C, Value *V) {
7614 return C + TTI.getInstructionCost(
7615 U: cast<Instruction>(Val: V), CostKind);
7616 }) +
7617 ScalarGEPCost;
7618 // The cost of masked gather.
7619 InstructionCost MaskedGatherCost =
7620 TTI.getMemIntrinsicInstrCost(
7621 MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7622 cast<LoadInst>(Val: VL0)->getPointerOperand(),
7623 /*VariableMask=*/false, CommonAlignment),
7624 CostKind) +
7625 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7626 InstructionCost GatherCost =
7627 getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
7628 /*Insert=*/true,
7629 /*Extract=*/false, CostKind) +
7630 ScalarLoadsCost;
7631 // The list of loads is small or perform partial check already - directly
7632 // compare masked gather cost and gather cost.
7633 constexpr unsigned ListLimit = 4;
7634 if (!TryRecursiveCheck || VL.size() < ListLimit)
7635 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7636
7637 // FIXME: The following code has not been updated for non-power-of-2
7638 // vectors (and not whole registers). The splitting logic here does not
7639 // cover the original vector if the vector factor is not a power of two.
7640 if (!hasFullVectorsOrPowerOf2(TTI, Ty: ScalarTy, Sz: VL.size()))
7641 return false;
7642
7643 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
7644 unsigned MinVF = getMinVF(Sz: 2 * Sz);
7645 DemandedElts.clearAllBits();
7646 // Iterate through possible vectorization factors and check if vectorized +
7647 // shuffles is better than just gather.
7648 for (unsigned VF =
7649 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VL.size() - 1);
7650 VF >= MinVF;
7651 VF = getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VF - 1)) {
7652 SmallVector<LoadsState> States;
7653 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7654 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
7655 SmallVector<unsigned> Order;
7656 SmallVector<Value *> PointerOps;
7657 LoadsState LS = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
7658 PointerOps, SPtrInfo, BestVF,
7659 /*TryRecursiveCheck=*/false);
7660 // Check that the sorted loads are consecutive.
7661 if (LS == LoadsState::Gather) {
7662 if (BestVF) {
7663 DemandedElts.setAllBits();
7664 break;
7665 }
7666 DemandedElts.setBits(loBit: Cnt, hiBit: Cnt + VF);
7667 continue;
7668 }
7669 // If need the reorder - consider as high-cost masked gather for now.
7670 if ((LS == LoadsState::Vectorize ||
7671 LS == LoadsState::StridedVectorize ||
7672 LS == LoadsState::CompressVectorize) &&
7673 !Order.empty() && !isReverseOrder(Order))
7674 LS = LoadsState::ScatterVectorize;
7675 States.push_back(Elt: LS);
7676 }
7677 if (DemandedElts.isAllOnes())
7678 // All loads gathered - try smaller VF.
7679 continue;
7680 // Can be vectorized later as a serie of loads/insertelements.
7681 InstructionCost VecLdCost = 0;
7682 if (!DemandedElts.isZero()) {
7683 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
7684 /*Insert=*/true,
7685 /*Extract=*/false, CostKind) +
7686 ScalarGEPCost;
7687 for (unsigned Idx : seq<unsigned>(Size: VL.size()))
7688 if (DemandedElts[Idx])
7689 VecLdCost +=
7690 TTI.getInstructionCost(U: cast<Instruction>(Val: VL[Idx]), CostKind);
7691 }
7692 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7693 for (auto [I, LS] : enumerate(First&: States)) {
7694 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
7695 InstructionCost VectorGEPCost =
7696 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7697 ? 0
7698 : getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
7699 BasePtr: LI0->getPointerOperand(),
7700 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy,
7701 VecTy: SubVecTy)
7702 .second;
7703 if (LS == LoadsState::ScatterVectorize) {
7704 if (static_cast<unsigned>(
7705 count_if(Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) <
7706 PointerOps.size() - 1 ||
7707 any_of(Range&: PointerOps, P: [&](Value *V) {
7708 return getUnderlyingObject(V) !=
7709 getUnderlyingObject(V: PointerOps.front());
7710 }))
7711 VectorGEPCost += getScalarizationOverhead(
7712 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getAllOnes(numBits: VF),
7713 /*Insert=*/true, /*Extract=*/false, CostKind);
7714 else
7715 VectorGEPCost +=
7716 getScalarizationOverhead(
7717 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getOneBitSet(numBits: VF, BitNo: 0),
7718 /*Insert=*/true, /*Extract=*/false, CostKind) +
7719 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: SubVecTy, Mask: {},
7720 CostKind);
7721 }
7722 switch (LS) {
7723 case LoadsState::Vectorize:
7724 VecLdCost +=
7725 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
7726 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
7727 OpdInfo: TTI::OperandValueInfo()) +
7728 VectorGEPCost;
7729 break;
7730 case LoadsState::StridedVectorize:
7731 VecLdCost += TTI.getMemIntrinsicInstrCost(
7732 MICA: MemIntrinsicCostAttributes(
7733 Intrinsic::experimental_vp_strided_load,
7734 SubVecTy, LI0->getPointerOperand(),
7735 /*VariableMask=*/false, CommonAlignment),
7736 CostKind) +
7737 VectorGEPCost;
7738 break;
7739 case LoadsState::CompressVectorize:
7740 VecLdCost += TTI.getMemIntrinsicInstrCost(
7741 MICA: MemIntrinsicCostAttributes(
7742 Intrinsic::masked_load, SubVecTy,
7743 CommonAlignment, LI0->getPointerAddressSpace()),
7744 CostKind) +
7745 ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SubVecTy,
7746 Mask: {}, CostKind);
7747 break;
7748 case LoadsState::ScatterVectorize:
7749 VecLdCost += TTI.getMemIntrinsicInstrCost(
7750 MICA: MemIntrinsicCostAttributes(
7751 Intrinsic::masked_gather, SubVecTy,
7752 LI0->getPointerOperand(),
7753 /*VariableMask=*/false, CommonAlignment),
7754 CostKind) +
7755 VectorGEPCost;
7756 break;
7757 case LoadsState::Gather:
7758 // Gathers are already calculated - ignore.
7759 continue;
7760 }
7761 SmallVector<int> ShuffleMask(VL.size());
7762 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
7763 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7764 if (I > 0)
7765 VecLdCost +=
7766 ::getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
7767 CostKind, Index: I * VF, SubTp: SubVecTy);
7768 }
7769 // If masked gather cost is higher - better to vectorize, so
7770 // consider it as a gather node. It will be better estimated
7771 // later.
7772 if (MaskedGatherCost >= VecLdCost &&
7773 VecLdCost - GatherCost < -SLPCostThreshold) {
7774 if (BestVF)
7775 *BestVF = VF;
7776 return true;
7777 }
7778 }
7779 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7780 };
7781 // TODO: need to improve analysis of the pointers, if not all of them are
7782 // GEPs or have > 2 operands, we end up with a gather node, which just
7783 // increases the cost.
7784 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
7785 bool ProfitableGatherPointers =
7786 L && Sz > 2 && static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
7787 return L->isLoopInvariant(V);
7788 })) <= Sz / 2;
7789 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [](Value *P) {
7790 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
7791 return (!GEP && doesNotNeedToBeScheduled(V: P)) ||
7792 (GEP && GEP->getNumOperands() == 2 &&
7793 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
7794 })) {
7795 // Check if potential masked gather can be represented as series
7796 // of loads + insertsubvectors.
7797 // If masked gather cost is higher - better to vectorize, so
7798 // consider it as a gather node. It will be better estimated
7799 // later.
7800 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7801 ProfitableGatherPointers))
7802 return LoadsState::ScatterVectorize;
7803 }
7804
7805 return LoadsState::Gather;
7806}
7807
/// Tries to cluster-sort the pointer operands \p VL (pointer \p VL[I] comes
/// from basic block \p BBs[I]): pointers are grouped by a
/// (parent block, underlying object) base and, within each base, into groups
/// whose members have a known constant distance to each other. On success
/// fills \p SortedIndices with a permutation placing related pointers next to
/// one another and returns true; returns false when no useful clustering was
/// found.
static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {
  assert(
      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
  // Ptr into, sort and return the sorted indices with values next to one
  // another.
  SmallMapVector<
      std::pair<BasicBlock *, Value *>,
      SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
      Bases;
  // Seed the map with the first pointer so the loop below only has to process
  // VL[1..].
  Bases
      .try_emplace(Key: std::make_pair(
          x: BBs.front(), y: getUnderlyingObject(V: VL.front(), MaxLookup: RecursionMaxDepth)))
      .first->second.emplace_back().emplace_back(Args: VL.front(), Args: 0U, Args: 0U);

  SortedIndices.clear();
  for (auto [Cnt, Ptr] : enumerate(First: VL.drop_front())) {
    auto Key = std::make_pair(x: BBs[Cnt + 1],
                              y: getUnderlyingObject(V: Ptr, MaxLookup: RecursionMaxDepth));
    // Try to join an existing group under the same base; this succeeds iff the
    // distance to the group's representative pointer is a known constant.
    bool Found = any_of(Range&: Bases.try_emplace(Key).first->second,
                        P: [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
                          std::optional<int64_t> Diff =
                              getPointersDiff(ElemTy, std::get<0>(Base.front()),
                                              ElemTy, Ptr, DL, SE,
                                              /*StrictCheck=*/true);
                          if (!Diff)
                            return false;

                          Base.emplace_back(Ptr, *Diff, Cnt + 1);
                          return true;
                        });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;

      // Not found already - add a new Base
      Bases.find(Key)->second.emplace_back().emplace_back(Args: Ptr, Args: 0, Args: Cnt + 1);
    }
  }

  // Every pointer landed under its own base - nothing to cluster.
  if (Bases.size() == VL.size())
    return false;

  // A single base with one group, or with only singleton groups, would yield
  // the identity order - not worth sorting.
  if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
                            Bases.front().second.size() == VL.size()))
    return false;

  // For each of the bases sort the pointers by Offset and check if any of the
  // base become consecutively allocated.
  auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
    // Walk the underlying-object chains of both pointers one step at a time
    // until one chain reaches a value already visited by the other; this gives
    // a deterministic order between pointers with different bases.
    SmallPtrSet<Value *, 13> FirstPointers;
    SmallPtrSet<Value *, 13> SecondPointers;
    Value *P1 = Ptr1;
    Value *P2 = Ptr2;
    unsigned Depth = 0;
    while (!FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1)) {
      if (P1 == P2 || Depth > RecursionMaxDepth)
        return false;
      FirstPointers.insert(Ptr: P1);
      SecondPointers.insert(Ptr: P2);
      P1 = getUnderlyingObject(V: P1, /*MaxLookup=*/1);
      P2 = getUnderlyingObject(V: P2, /*MaxLookup=*/1);
      ++Depth;
    }
    assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
           "Unable to find matching root.");
    return FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1);
  };
  for (auto &Base : Bases) {
    for (auto &Vec : Base.second) {
      if (Vec.size() > 1) {
        // Sort each group by offset and require the offsets to be strictly
        // consecutive, i.e. Offset[i] == Offset[0] + i after sorting.
        stable_sort(Range&: Vec, C: llvm::less_second());
        int64_t InitialOffset = std::get<1>(t&: Vec[0]);
        bool AnyConsecutive =
            all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
              return std::get<1>(P.value()) ==
                     int64_t(P.index()) + InitialOffset;
            });
        // Fill SortedIndices array only if it looks worth-while to sort the
        // ptrs.
        if (!AnyConsecutive)
          return false;
      }
    }
    stable_sort(Range&: Base.second, C: [&](const auto &V1, const auto &V2) {
      return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
    });
  }

  // Emit the original indices in the new (clustered) order.
  for (auto &T : Bases)
    for (const auto &Vec : T.second)
      for (const auto &P : Vec)
        SortedIndices.push_back(Elt: std::get<2>(t: P));

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}
7912
7913std::optional<BoUpSLP::OrdersType>
7914BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7915 assert(TE.isGather() && "Expected gather node only.");
7916 Type *ScalarTy = TE.Scalars[0]->getType();
7917
7918 SmallVector<Value *> Ptrs;
7919 Ptrs.reserve(N: TE.Scalars.size());
7920 SmallVector<BasicBlock *> BBs;
7921 BBs.reserve(N: TE.Scalars.size());
7922 for (Value *V : TE.Scalars) {
7923 auto *L = dyn_cast<LoadInst>(Val: V);
7924 if (!L || !L->isSimple())
7925 return std::nullopt;
7926 Ptrs.push_back(Elt: L->getPointerOperand());
7927 BBs.push_back(Elt: L->getParent());
7928 }
7929
7930 BoUpSLP::OrdersType Order;
7931 if (!LoadEntriesToVectorize.contains(key: TE.Idx) &&
7932 clusterSortPtrAccesses(VL: Ptrs, BBs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
7933 return std::move(Order);
7934 return std::nullopt;
7935}
7936
/// Check if two insertelement instructions \p VU and \p V belong to the same
/// buildvector sequence: chains of single-use insertelements into vectors of
/// the same type that never assign the same element index twice.
/// \p GetBaseOperand abstracts how the "previous" vector operand of an
/// insertelement is obtained.
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic blocks.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
  std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  SmallBitVector ReusedIdx(
      cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    // One chain exhausted and the other reached the opposite starting insert:
    // they form a single buildvector, provided that insert is single-use.
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    // Advance IE1 one step down its chain, tracking visited element indices;
    // a repeated index or a multi-use intermediate insert stops the walk.
    // (The local Idx1 intentionally shadows the outer one.)
    if (IE1 && IE1 != V) {
      unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
      ReusedIdx.set(Idx1);
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
    }
    // Symmetrically advance IE2 down its chain.
    if (IE2 && IE2 != VU) {
      unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
7988
7989/// Checks if the specified instruction \p I is an alternate operation for
7990/// the given \p MainOp and \p AltOp instructions.
7991static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7992 Instruction *AltOp,
7993 const TargetLibraryInfo &TLI);
7994
/// \returns the preferred order of scalars for node \p TE, or std::nullopt if
/// no (profitable) reordering was found. \p TopToBottom is set when called
/// from the top-to-bottom reordering phase; \p IgnoreReorder presumably means
/// the root order may be dropped because the whole graph can be rotated --
/// NOTE(review): confirm against the call sites, which are outside this view.
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(VL: TE.Scalars))
      return std::nullopt;
    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that reuse mask is "clustered", i.e. each scalar values
    // is used once in each submask of size <number_of_scalars>.
    // Example: 4 scalar values.
    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
    //                           element 3 is used twice in the second submask.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(Order: *CurrentOrder);
        inversePermutation(Indices: *CurrentOrder, Mask);
        ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        // Expand the per-cluster order to the full vector factor, cluster by
        // cluster.
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    // A 2-scalar node reused into a 4-wide vector that still fits into a
    // single register - reordering is not expected to help.
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
                                                  VF: 2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
                                                     VF: Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
      else
        inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
      ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
      SmallBitVector UsedVals(NumParts);
      // Each Sz-sized chunk of the combined mask must reference a single
      // distinct value (with a limited number of undefs) for the reorder to be
      // expressible.
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(a: Sz, b: VF - I);
        if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
                   P: [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(Range: TE.Scalars, P: [Sz](Value *V) {
          if (isa<PoisonValue>(Val: V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
      else
        inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
      // Remap each reuse index to the position dictated by the scalar's
      // extract index.
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
        Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they are
    // always of VF size.
    OrdersType ResOrder(VF);
    std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
      reorderOrder(Order&: CurrentOrder, Mask: SubMask);
      transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
      std::advance(i&: It, n: Sz);
    }
    if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  // Strided node with an empty or reversed order and no binary-op user: keep
  // the current order.
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       !Instruction::isBinaryOp(Opcode: TE.UserTreeIndex.UserTE->getOpcode())) &&
      (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
    return std::nullopt;
  // Split nodes and vectorized memory/extract (or, top-to-bottom,
  // store/insert) nodes keep their precomputed ReorderIndices.
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    // Derive the order from the main/alternate shuffle mask of the node.
    TE.buildAltOpShuffleMask(
        IsAltOp: [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, MainOp: TE.getMainOp(), AltOp: TE.getAltOp(), TLI: *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (unsigned I : seq<unsigned>(Size: VF)) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      ResOrder[Mask[I] % VF] = I;
    }
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    // For each scalar, find the head of the buildvector chain of single-use
    // insertelements it feeds (if any).
    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(t&: UserBVHead, u: TE.Scalars)) {
      if (isa<Constant>(Val: V) || !V->hasNUsesOrMore(N: 1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(Val: *V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
      }
      I = BVHead;
    }

    // Orders blocks by dominator-tree DFS-in number; unreachable blocks sort
    // last.
    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      if (!DT->isReachableFromEntry(A: BB1))
        return false;
      if (!DT->isReachableFromEntry(A: BB2))
        return true;
      auto *NodeA = DT->getNode(BB: BB1);
      auto *NodeB = DT->getNode(BB: BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    // Orders two phi scalars by their first users (inserts first, then
    // extracts, comparing positions/indices within each kind).
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
        return false;
      if (isa<PoisonValue>(Val: V1))
        return true;
      if (isa<PoisonValue>(Val: V2))
        return false;
      if (V1->getNumUses() < V2->getNumUses())
        return true;
      if (V1->getNumUses() > V2->getNumUses())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(Other: UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(Val: EE1->getOperand(i_nocapture: 0));
        auto *Inst2 = dyn_cast<Instruction>(Val: EE2->getOperand(i_nocapture: 0));
        auto *P1 = dyn_cast<Argument>(Val: EE1->getOperand(i_nocapture: 0));
        auto *P2 = dyn_cast<Argument>(Val: EE2->getOperand(i_nocapture: 0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(i_nocapture: 0) == EE2->getOperand(i_nocapture: 0))
          return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
          return Inst1->comesBefore(Other: Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
    stable_sort(Range&: Phis, C: PHICompare);
    if (isIdentityOrder(Order: Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(Val: TE.getMainOp())) &&
      allSameType(VL: TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
        all_of(Range: TE.Scalars, P: [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(Val: V);
          return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(VL: TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, the whole graph
    // might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
        count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
      const auto *It = find_if_not(Range: TE.Scalars, P: isConstant);
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Indices: Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
            Op0: PoisonValue::get(T: Ty), Op1: *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
            Op0: PoisonValue::get(T: Ty), Op1: *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(VL: TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if can include the order of vectorized loads. For masked gathers do
    // extra analysis later, so include such nodes into a special list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      StridedPtrInfo SPtrInfo;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(VL: TE.Scalars, VL0: TE.Scalars.front(),
                                         Order&: CurrentOrder, PointerOps, SPtrInfo);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
          Res == LoadsState::CompressVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been auditted for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
        return CurrentOrder;
  }
  return std::nullopt;
}
8353
8354/// Checks if the given mask is a "clustered" mask with the same clusters of
8355/// size \p Sz, which are not identity submasks.
8356static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
8357 unsigned Sz) {
8358 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
8359 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
8360 return false;
8361 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8362 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
8363 if (Cluster != FirstCluster)
8364 return false;
8365 }
8366 return true;
8367}
8368
/// Reorders node \p TE with reused scalars according to \p Mask: the reuses
/// mask is always updated; additionally, for gather nodes whose reuses form a
/// repeated non-identity cluster, the scalars themselves are reordered so the
/// reuses mask becomes a sequence of identity submasks.
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reused no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
                                                   VF: Sz) ||
      !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
    return;
  SmallVector<int> NewMask;
  // Combine the node's current reorder with the (already updated) reuses mask.
  inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
  addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  // Only the first cluster is needed since all clusters repeat it.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(Indices: NewOrder, Mask&: NewMask);
  reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(i&: It, n: Sz))
    std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
}
8395
8396static void combineOrders(MutableArrayRef<unsigned> Order,
8397 ArrayRef<unsigned> SecondaryOrder) {
8398 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8399 "Expected same size of orders");
8400 size_t Sz = Order.size();
8401 SmallBitVector UsedIndices(Sz);
8402 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
8403 if (Order[Idx] != Sz)
8404 UsedIndices.set(Order[Idx]);
8405 }
8406 if (SecondaryOrder.empty()) {
8407 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8408 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8409 Order[Idx] = Idx;
8410 } else {
8411 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8412 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8413 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
8414 Order[Idx] = SecondaryOrder[Idx];
8415 }
8416}
8417
/// Heuristically decides whether running the reordering phases over the
/// current tree is likely to pay off. Small trees are always reordered; for
/// trees rooted at stores/phis (or tiny ptrtoint/icmp roots) a set of shape
/// checks rejects trees where reordering tends to be wasted work.
bool BoUpSLP::isProfitableToReorder() const {
  if (DisableTreeReorder)
    return false;

  // Thresholds used by the heuristics below.
  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only single store and single (unordered) load node,
    // other nodes are phis or geps/binops, combined with phis, and/or single
    // gather load node
    // Wide phi roots with too many operands are too expensive to reorder.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    // Single node, which require reorder - skip.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      // Count split nodes whose reorder could be absorbed by a commutative
      // vectorized user.
      const unsigned ReorderedSplitsCnt =
          count_if(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->State == TreeEntry::SplitVectorize &&
                   !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                   TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
                   ::isCommutative(I: TE->UserTreeIndex.UserTE->getMainOp());
          });
      // If (almost) all nodes carry no reorderable order, reordering cannot
      // help this store-rooted tree.
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() || none_of(Range&: TE->Scalars, P: [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    // Scan the remaining nodes for shapes that still justify reordering.
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        if (all_of(Range&: TE->Scalars, P: IsaPred<Constant, PHINode>) ||
            all_of(Range&: TE->Scalars, P: IsaPred<BinaryOperator, PHINode>))
          continue;
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(Range&: TE->Scalars, P: IsaPred<PHINode, GEPOperator>))
          continue;
        return true;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = false;
          continue;
        }
        // More than one gathered load (or a gathered load next to a vectorized
        // one) - reordering may help.
        if (HasLoad)
          return true;
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return true;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(Opcode: TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           static_cast<unsigned>(count_if(Range&: TE->Scalars, P: IsaPred<PHINode>)) <
               TE->Scalars.size() / 2))
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    // A tree made (almost) entirely of phis is not worth reordering.
    return !HasPhis;
  }
  return true;
}
8521
8522void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8523 ArrayRef<int> MaskOrder) {
8524 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8525 SmallVector<int> NewMask(getVectorFactor());
8526 SmallVector<int> NewMaskOrder(getVectorFactor());
8527 std::iota(first: NewMask.begin(), last: NewMask.end(), value: 0);
8528 std::iota(first: NewMaskOrder.begin(), last: NewMaskOrder.end(), value: 0);
8529 if (Idx == 0) {
8530 copy(Range&: Mask, Out: NewMask.begin());
8531 copy(Range&: MaskOrder, Out: NewMaskOrder.begin());
8532 } else {
8533 assert(Idx == 1 && "Expected either 0 or 1 index.");
8534 unsigned Offset = CombinedEntriesWithIndices.back().second;
8535 for (unsigned I : seq<unsigned>(Size: Mask.size())) {
8536 NewMask[I + Offset] = Mask[I] + Offset;
8537 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8538 }
8539 }
8540 reorderScalars(Scalars, Mask: NewMask);
8541 reorderOrder(Order&: ReorderIndices, Mask: NewMaskOrder, /*BottomOrder=*/true);
8542 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(Order: ReorderIndices))
8543 ReorderIndices.clear();
8544}
8545
// Reorder the SLP graph from the root towards the leaves: collect the
// preferred operand orders of all reorderable nodes, pick the most frequently
// requested order per vectorization factor and apply it to every node with
// that VF, minimizing the number of runtime shuffles.
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

  // Phi nodes can have preferred ordering based on their result users
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;

  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction in x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;

  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently there are vectorized stores,loads,extracts + some gathering of
  // extracts.
  for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
                                        const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE: TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
      ExternalUserReorderMap.try_emplace(Key: TE.get(),
                                         Args: std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
    // to take into account their order when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, VF: TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(VL: TE->Scalars, ScalarTy, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
        AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
      }
      // TODO: Check the reverse order too.
    }

    // The root node ordering may be ignored when the graph is rooted at an
    // insertelement/store and there is no external ignore-list.
    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(TE: *TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode vectorization,
      // better to reorder them during bottom-to-top stage. If follow the order
      // here, it causes reordering of the whole graph though actually it is
      // profitable just to reorder the subgraph that starts from the alternate
      // opcode vectorization node. Such nodes already end-up with the shuffle
      // instruction and it is just enough to change this shuffle rather than
      // rotate the scalars for the whole graph.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      // Walk up at most RecursionMaxDepth user links looking for an alternate
      // opcode ancestor; bail out from recording this node's order if found.
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  // Walk candidate VFs downwards from the root's VF: subtract 2 while VF is
  // even and 1 to step off an odd VF, until every recorded VF is visited.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(Val: VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We just are looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // mostly used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // Delete VF entry upon exit.
    llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(I: It); });

    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this nodes, still need to extend and to use shuffle,
      // just need to merge reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count number of orders uses.
      // Pick the order recorded for this node: gathers/reuses, alt-shuffles
      // and PHIs each have a dedicated map, otherwise use ReorderIndices.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(Val: OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(Val: OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(Val: OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(Val: OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use natural order,
        // it is an attempt to reorder node with reused scalars but with
        // external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(Key: OrdersType(), Args: 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(Key: ExtOrder, Args: 0).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Indices: Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(Order: CurrentOrder);
        ++OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Key: Order, Args: 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if filled identity found (non-empty order)
      // with same number of uses, as the new candidate order, we can choose
      // this candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(Order: BestOrder))) {
        combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(Order: BestOrder))
      continue;
    fixupOrderingIndices(Order: BestOrder);
    SmallVector<int> Mask;
    inversePermutation(Indices: BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Need to reorder the reuses masks of the operands with smaller VF to
          // be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(
              (!TE->UserTreeIndex ||
               TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
               TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
               TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
              "All users must be of VF size.");
          if (SLPReVec) {
            assert(SLPReVec && "Only supported by REVEC.");
            // ShuffleVectorInst does not do reorderOperands (and it should not
            // because ShuffleVectorInst supports only a limited set of
            // patterns). Only do reorderNodeWithReuses if the user is not
            // ShuffleVectorInst.
            if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
                isa<ShuffleVectorInst>(Val: TE->UserTreeIndex.UserTE->getMainOp()))
              continue;
          }
          // Update ordering of the operands with the smaller VF than the given
          // one.
          reorderNodeWithReuses(TE&: *TE, Mask);
          // Update orders in user split vectorize nodes.
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                Idx: TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
                InsertElementInst>(Val: TE->getMainOp()) ||
            (SLPReVec && isa<ShuffleVectorInst>(Val: TE->getMainOp()))))) {
        assert(
            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                     TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
            "and CastInst.");
        // Build correct orders for extract{element,value}, loads,
        // stores and alternate (split) nodes.
        reorderOrder(Order&: TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(Scalars&: TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
        addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
        // Update orders in user split vectorize nodes.
        TE->UserTreeIndex.UserTE->reorderSplitNode(Idx: TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
    }
  }
}
8837
8838void BoUpSLP::buildReorderableOperands(
8839 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8840 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8841 SmallVectorImpl<TreeEntry *> &GatherOps) {
8842 for (unsigned I : seq<unsigned>(Size: UserTE->getNumOperands())) {
8843 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8844 return OpData.first == I &&
8845 (OpData.second->State == TreeEntry::Vectorize ||
8846 OpData.second->State == TreeEntry::StridedVectorize ||
8847 OpData.second->State == TreeEntry::CompressVectorize ||
8848 OpData.second->State == TreeEntry::SplitVectorize);
8849 }))
8850 continue;
8851 // Do not request operands, if they do not exist.
8852 if (UserTE->hasState()) {
8853 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8854 UserTE->getOpcode() == Instruction::ExtractValue)
8855 continue;
8856 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8857 continue;
8858 if (UserTE->getOpcode() == Instruction::Store &&
8859 UserTE->State == TreeEntry::Vectorize && I == 1)
8860 continue;
8861 if (UserTE->getOpcode() == Instruction::Load &&
8862 (UserTE->State == TreeEntry::Vectorize ||
8863 UserTE->State == TreeEntry::StridedVectorize ||
8864 UserTE->State == TreeEntry::CompressVectorize))
8865 continue;
8866 }
8867 TreeEntry *TE = getOperandEntry(E: UserTE, Idx: I);
8868 assert(TE && "Expected operand entry.");
8869 if (!TE->isGather()) {
8870 // Add the node to the list of the ordered nodes with the identity
8871 // order.
8872 Edges.emplace_back(Args&: I, Args&: TE);
8873 // Add ScatterVectorize nodes to the list of operands, where just
8874 // reordering of the scalars is required. Similar to the gathers, so
8875 // simply add to the list of gathered ops.
8876 // If there are reused scalars, process this node as a regular vectorize
8877 // node, just reorder reuses mask.
8878 if (TE->State == TreeEntry::ScatterVectorize &&
8879 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8880 GatherOps.push_back(Elt: TE);
8881 continue;
8882 }
8883 if (ReorderableGathers.contains(Ptr: TE))
8884 GatherOps.push_back(Elt: TE);
8885 }
8886}
8887
// Reorder the SLP graph from the leaves towards the root: propagate the
// orders of reordered operand nodes to their user nodes, trying to keep at
// least one operand in natural order and sinking the remaining reordering
// upwards through the graph. \p IgnoreReorder - true if the order of the
// root node may be ignored.
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  struct TreeEntryCompare {
    // Compare by the index of the user node (falling back to the node's own
    // index). With PriorityQueue (a max-heap) the entry with the largest
    // (user) index is popped first.
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently there are vectorized loads,extracts without alternate operands +
  // some gathering of extracts.
  SmallPtrSet<const TreeEntry *, 4> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(Ptr: TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(TE: *TE, /*TopToBottom=*/false, IgnoreReorder)) {
      Queue.push(x: TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(V: TE.get());
    }
  }

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // I.e., if the node has operands, that are reordered, try to make at least
  // one operand order in the natural order and reorder others + reorder the
  // user node itself.
  SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
  while (!Queue.empty()) {
    // 1. Filter out only reordered nodes.
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    Queue.pop();
    // Pop all queued entries that share the same user node - they are
    // processed together as one batch of that user's operands.
    SmallVector<TreeEntry *> OrderedOps(1, TE);
    while (!Queue.empty()) {
      TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      Queue.pop();
      OrderedOps.push_back(Elt: TE);
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(V: TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(Ptr: TE).second)
        continue;
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency directly.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(Args&: TE->UserTreeIndex.EdgeIdx, Args&: TE);
    }
    if (Users.first) {
      auto &Data = Users;
      if (Data.first->State == TreeEntry::SplitVectorize) {
        assert(
            Data.second.size() <= 2 &&
            "Expected not greater than 2 operands for split vectorize node.");
        if (any_of(Range&: Data.second,
                   P: [](const auto &Op) { return !Op.second->UserTreeIndex; }))
          continue;
        // Update orders in user split vectorize nodes.
        assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
               "Expected exactly 2 entries.");
        for (const auto &P : Data.first->CombinedEntriesWithIndices) {
          TreeEntry &OpTE = *VectorizableTree[P.first];
          OrdersType Order = OpTE.ReorderIndices;
          if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
            if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
              continue;
            const auto BestOrder =
                getReorderingData(TE: OpTE, /*TopToBottom=*/false, IgnoreReorder);
            if (!BestOrder || BestOrder->empty() || isIdentityOrder(Order: *BestOrder))
              continue;
            Order = *BestOrder;
          }
          fixupOrderingIndices(Order);
          SmallVector<int> Mask;
          inversePermutation(Indices: Order, Mask);
          const unsigned E = Order.size();
          SmallVector<int> MaskOrder(E, PoisonMaskElem);
          transform(Range&: Order, d_first: MaskOrder.begin(), F: [E](unsigned I) {
            return I < E ? static_cast<int>(I) : PoisonMaskElem;
          });
          Data.first->reorderSplitNode(Idx: P.second ? 1 : 0, Mask, MaskOrder);
          // Clear ordering of the operand.
          if (!OpTE.ReorderIndices.empty()) {
            OpTE.ReorderIndices.clear();
          } else if (!OpTE.ReuseShuffleIndices.empty()) {
            reorderReuses(Reuses&: OpTE.ReuseShuffleIndices, Mask);
          } else {
            assert(OpTE.isGather() && "Expected only gather/buildvector node.");
            reorderScalars(Scalars&: OpTE.Scalars, Mask);
          }
        }
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(x: Data.first);
        }
        continue;
      }
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      buildReorderableOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
                               GatherOps);
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though might be not
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(Ptr: OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(TE: *OpTE, /*TopToBottom=*/false,
                                     IgnoreReorder)
                .value_or(u: OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully non-ordered
        // orders.
        if (Order.size() == 1)
          continue;

        // Check that the reordering does not increase number of shuffles, i.e.
        // same-values-nodes has same parents or their parents has same parents.
        if (!Order.empty() && !isIdentityOrder(Order)) {
          Value *Root = OpTE->hasState()
                            ? OpTE->getMainOp()
                            : *find_if_not(Range&: OpTE->Scalars, P: isConstant);
          // Collect user nodes of other tree entries that contain the same
          // scalars (same VF/size) as OpTE.
          auto GetSameNodesUsers = [&](Value *Root) {
            SmallSetVector<TreeEntry *, 4> Res;
            for (const TreeEntry *TE : ValueToGatherNodes.lookup(Val: Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
                Res.insert(X: TE->UserTreeIndex.UserTE);
            }
            for (const TreeEntry *TE : getTreeEntries(V: Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
                Res.insert(X: TE->UserTreeIndex.UserTE);
            }
            return Res.takeVector();
          };
          // For calls, count only the actual call arguments as operands.
          auto GetNumOperands = [](const TreeEntry *TE) {
            if (TE->State == TreeEntry::SplitVectorize)
              return TE->getNumOperands();
            if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
              return CI->arg_size();
            return TE->getNumOperands();
          };
          // True if every (non-scalar-intrinsic-arg) operand of TE already
          // carries a reorder/reuse mask, so TE should be reordered with them.
          auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                       const TreeEntry *TE) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(TE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(E: TE, Idx);
              if (Op->isGather() && Op->hasState()) {
                const TreeEntry *VecOp =
                    getSameValuesTreeEntry(V: Op->getMainOp(), VL: Op->Scalars);
                if (VecOp)
                  Op = VecOp;
              }
              if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
                return false;
            }
            return true;
          };
          SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
          if (!Users.empty() && !all_of(Range&: Users, P: [&](TreeEntry *UTE) {
                if (!RevisitedOps.insert(Ptr: UTE).second)
                  return false;
                return UTE == Data.first || !UTE->ReorderIndices.empty() ||
                       !UTE->ReuseShuffleIndices.empty() ||
                       (UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE == Data.first) ||
                       (Data.first->UserTreeIndex &&
                        Data.first->UserTreeIndex.UserTE == UTE) ||
                       (IgnoreReorder && UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE->Idx == 0) ||
                       NodeShouldBeReorderedWithOperands(UTE);
              }))
            continue;
          // Re-enqueue the operands of the same-values users so they are
          // reprocessed together with this reordering decision.
          for (TreeEntry *UTE : Users) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(Val: UTE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(UTE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(E: UTE, Idx);
              Visited.erase(Ptr: Op);
              Queue.push(x: const_cast<TreeEntry *>(Op));
            }
          }
        }
        unsigned NumOps = count_if(
            Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Indices: Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(Order: CurrentOrder);
          OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second += NumOps;
        } else {
          OrdersUses.try_emplace(Key: Order, Args: 0).first->second += NumOps;
        }
        auto Res = OrdersUses.try_emplace(Key: OrdersType(), Args: 0);
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(V: TE))
              return !getReorderingData(TE: *TE, /*TopToBottom=*/false,
                                        IgnoreReorder)
                          .value_or(u: OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        if (OpTE->UserTreeIndex) {
          TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
          if (!VisitedUsers.insert(Ptr: UserTE).second)
            continue;
          // May reorder user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid increase of
          // the compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering rather than those with natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
          if (static_cast<unsigned>(count_if(
                  Range&: Ops, P: [UserTE, &AllowsReordering](
                           const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           Op.second->UserTreeIndex.UserTE == UserTE;
                  })) <= Ops.size() / 2)
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order. But, if filled identity found (non-empty
        // order) with same number of uses, as the new candidate order, we can
        // choose this candidate order.
        if (Cnt < Pair.second) {
          combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(Order: BestOrder)) {
        Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
        continue;
      }
      fixupOrderingIndices(Order: BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(Indices: BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        if (!VisitedOps.insert(Ptr: TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(TE&: *TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            TE->State != TreeEntry::CompressVectorize &&
            TE->State != TreeEntry::SplitVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(Order&: TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Scalars&: Gather->Scalars, Mask);
        Visited.insert(Ptr: Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
        return TE.isAltShuffle() &&
               (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
                TE.ReorderIndices.empty());
      };
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Val: Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first))
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first) ||
          Data.first->State == TreeEntry::StridedVectorize ||
          Data.first->State == TreeEntry::CompressVectorize) {
        reorderScalars(Scalars&: Data.first->Scalars, Mask);
        reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !IsNotProfitableAltCodeNode(*Data.first)) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(x: Data.first);
        }
      } else {
        reorderOrder(Order&: Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
9295
9296Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9297 if (Entry.hasState() &&
9298 (Entry.getOpcode() == Instruction::Store ||
9299 Entry.getOpcode() == Instruction::Load) &&
9300 Entry.State == TreeEntry::StridedVectorize &&
9301 !Entry.ReorderIndices.empty() && isReverseOrder(Order: Entry.ReorderIndices))
9302 return dyn_cast<Instruction>(Val: Entry.Scalars[Entry.ReorderIndices.front()]);
9303 return dyn_cast<Instruction>(Val: Entry.Scalars.front());
9304}
9305
/// Scans all vectorized tree entries and records, in ExternalUses, every
/// vectorized scalar that also has users outside of the tree (or appears in
/// \p ExternallyUsedValues), so that an extract can be generated for it after
/// vectorization. An ExternalUses entry with a null User means "extract for
/// any external user of this scalar".
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // Conservative bound: a scalar with at least this many uses must have some
  // user that is not one of the ScalarToTreeEntries.size() vectorized
  // scalars, i.e. an external user.
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Maps an already-recorded scalar to its index in ExternalUses.
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
        DeletedNodes.contains(Ptr: Entry) ||
        TransformedToGatherNodes.contains(Val: Entry))
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // Non-instruction scalars and copyable elements are never extracted.
      if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
        continue;

      // All uses must be replaced already? No need to do it again.
      // (A previously recorded entry with a null User already covers every
      // user of this scalar.)
      auto It = ScalarToExtUses.find(Val: Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Too many uses to inspect one by one - record a single user-less
      // entry requesting an extract usable by any external user.
      if (Scalar->hasNUsesOrMore(N: NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
        ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
        ExternalUsesWithNonUsers.insert(Ptr: Scalar);
        continue;
      }

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(V: Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
        ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
        continue;
      }
      // Otherwise inspect each user to decide whether an extract is needed.
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(Val: U);
        if (!UserInst || isDeleted(I: UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
          continue;

        // Skip in-tree scalars that become vectors
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(V: U);
            any_of(Range&: UseEntries, P: [this](const TreeEntry *UseEntry) {
              return !DeletedNodes.contains(Ptr: UseEntry) &&
                     !TransformedToGatherNodes.contains(Val: UseEntry);
            })) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(Val: UserInst)) ||
                isa<CallInst>(Val: UserInst)) ||
              all_of(Range&: UseEntries, P: [&](TreeEntry *UseEntry) {
                if (DeletedNodes.contains(Ptr: UseEntry) ||
                    TransformedToGatherNodes.contains(Val: UseEntry))
                  return true;
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, UserInst: getRootEntryInstruction(Entry: *UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Bad state");
            continue;
          }
          // The in-tree user still needs the scalar form: downgrade to a
          // user-less entry that serves all users of this scalar.
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        // With too many users do not track each one individually.
        if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
        ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: *Entry, Args&: FoundLane);
        ExternalUsesWithNonUsers.insert(Ptr: Scalar);
        // A null User covers all users - no need to scan the remaining ones.
        if (!U)
          break;
      }
    }
  }
}
9414
/// Collects the stores that use the scalars of \p TE, grouped by
/// (parent block, stored value type, underlying pointer object). At most one
/// store is kept per group per lane, and only stores whose pointer distance
/// to the group's first store is computable are retained.
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(Val: V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(N: UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(Val: U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(Ty: SI->getValueOperand()->getType()))
        continue;
      // Skip entry if the store is already vectorized.
      if (isVectorized(V: U))
        continue;

      Value *Ptr =
          getUnderlyingObject(V: SI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        // Keep only stores whose offset from the group's first store is
        // computable, so later consecutiveness checks can rely on it.
        std::optional<int64_t> Diff = getPointersDiff(
            ElemTyA: SI->getValueOperand()->getType(), PtrA: SI->getPointerOperand(),
            ElemTyB: SI->getValueOperand()->getType(),
            PtrB: StoresVec.front()->getPointerOperand(), DL: *DL, SE&: *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(Elt: SI);
    }
  }
  // Flatten the map buckets into the result vector of store groups.
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(RHS&: P.second);
    ++I;
  }
  return Res;
}
9470
9471bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9472 OrdersType &ReorderIndices) const {
9473 // We check whether the stores in StoreVec can form a vector by sorting them
9474 // and checking whether they are consecutive.
9475
9476 // To avoid calling getPointersDiff() while sorting we create a vector of
9477 // pairs {store, offset from first} and sort this instead.
9478 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9479 StoreInst *S0 = StoresVec[0];
9480 StoreOffsetVec.emplace_back(Args: 0, Args: 0);
9481 Type *S0Ty = S0->getValueOperand()->getType();
9482 Value *S0Ptr = S0->getPointerOperand();
9483 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
9484 StoreInst *SI = StoresVec[Idx];
9485 std::optional<int64_t> Diff =
9486 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
9487 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
9488 /*StrictCheck=*/true);
9489 StoreOffsetVec.emplace_back(Args&: *Diff, Args&: Idx);
9490 }
9491
9492 // Check if the stores are consecutive by checking if their difference is 1.
9493 if (StoreOffsetVec.size() != StoresVec.size())
9494 return false;
9495 sort(C&: StoreOffsetVec, Comp: llvm::less_first());
9496 unsigned Idx = 0;
9497 int64_t PrevDist = 0;
9498 for (const auto &P : StoreOffsetVec) {
9499 if (Idx > 0 && P.first != PrevDist + 1)
9500 return false;
9501 PrevDist = P.first;
9502 ++Idx;
9503 }
9504
9505 // Calculate the shuffle indices according to their offset against the sorted
9506 // StoreOffsetVec.
9507 ReorderIndices.assign(NumElts: StoresVec.size(), Elt: 0);
9508 bool IsIdentity = true;
9509 for (auto [I, P] : enumerate(First&: StoreOffsetVec)) {
9510 ReorderIndices[P.second] = I;
9511 IsIdentity &= P.second == I;
9512 }
9513 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9514 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9515 // same convention here.
9516 if (IsIdentity)
9517 ReorderIndices.clear();
9518
9519 return true;
9520}
9521
9522#ifndef NDEBUG
9523LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9524 for (unsigned Idx : Order)
9525 dbgs() << Idx << ", ";
9526 dbgs() << "\n";
9527}
9528#endif
9529
9530SmallVector<BoUpSLP::OrdersType, 1>
9531BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9532 unsigned NumLanes = TE->Scalars.size();
9533
9534 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9535
9536 // Holds the reorder indices for each candidate store vector that is a user of
9537 // the current TreeEntry.
9538 SmallVector<OrdersType, 1> ExternalReorderIndices;
9539
9540 // Now inspect the stores collected per pointer and look for vectorization
9541 // candidates. For each candidate calculate the reorder index vector and push
9542 // it into `ExternalReorderIndices`
9543 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9544 // If we have fewer than NumLanes stores, then we can't form a vector.
9545 if (StoresVec.size() != NumLanes)
9546 continue;
9547
9548 // If the stores are not consecutive then abandon this StoresVec.
9549 OrdersType ReorderIndices;
9550 if (!canFormVector(StoresVec, ReorderIndices))
9551 continue;
9552
9553 // We now know that the scalars in StoresVec can form a vector instruction,
9554 // so set the reorder indices.
9555 ExternalReorderIndices.push_back(Elt: ReorderIndices);
9556 }
9557 return ExternalReorderIndices;
9558}
9559
/// Builds the vectorizable tree rooted at \p Roots. Users of values in
/// \p UserIgnoreLst are not treated as external uses (see buildExternalUses).
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  // Start from a clean state; any previously built tree is discarded.
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  UserIgnoreList = &UserIgnoreLst;
  // All roots must have the same type to be bundled together.
  if (!allSameType(VL: Roots))
    return;
  buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
}
9570
/// Builds the vectorizable tree rooted at \p Roots with no user-ignore set.
void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  // Start from a clean state; any previously built tree is discarded.
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  // All roots must have the same type to be bundled together.
  if (!allSameType(VL: Roots))
    return;
  buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
}
9579
9580/// Tries to find subvector of loads and builds new vector of only loads if can
9581/// be profitable.
9582static void gatherPossiblyVectorizableLoads(
9583 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9584 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9585 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9586 bool AddNew = true) {
9587 if (VL.empty())
9588 return;
9589 Type *ScalarTy = getValueType(V: VL.front());
9590 if (!isValidElementType(Ty: ScalarTy))
9591 return;
9592 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9593 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9594 for (Value *V : VL) {
9595 auto *LI = dyn_cast<LoadInst>(Val: V);
9596 if (!LI)
9597 continue;
9598 if (R.isDeleted(I: LI) || R.isVectorized(V: LI) || !LI->isSimple())
9599 continue;
9600 bool IsFound = false;
9601 for (auto [Map, Data] : zip(t&: ClusteredDistToLoad, u&: ClusteredLoads)) {
9602 assert(LI->getParent() == Data.front().first->getParent() &&
9603 LI->getType() == Data.front().first->getType() &&
9604 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9605 getUnderlyingObject(Data.front().first->getPointerOperand(),
9606 RecursionMaxDepth) &&
9607 "Expected loads with the same type, same parent and same "
9608 "underlying pointer.");
9609 std::optional<int64_t> Dist = getPointersDiff(
9610 ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), ElemTyB: Data.front().first->getType(),
9611 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9612 /*StrictCheck=*/true);
9613 if (!Dist)
9614 continue;
9615 auto It = Map.find(Val: *Dist);
9616 if (It != Map.end() && It->second != LI)
9617 continue;
9618 if (It == Map.end()) {
9619 Data.emplace_back(Args&: LI, Args&: *Dist);
9620 Map.try_emplace(Key: *Dist, Args&: LI);
9621 }
9622 IsFound = true;
9623 break;
9624 }
9625 if (!IsFound) {
9626 ClusteredLoads.emplace_back().emplace_back(Args&: LI, Args: 0);
9627 ClusteredDistToLoad.emplace_back().try_emplace(Key: 0, Args&: LI);
9628 }
9629 }
9630 auto FindMatchingLoads =
9631 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9632 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9633 &GatheredLoads,
9634 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9635 int64_t &Offset, unsigned &Start) {
9636 if (Loads.empty())
9637 return GatheredLoads.end();
9638 LoadInst *LI = Loads.front().first;
9639 for (auto [Idx, Data] : enumerate(First&: GatheredLoads)) {
9640 if (Idx < Start)
9641 continue;
9642 ToAdd.clear();
9643 if (LI->getParent() != Data.front().first->getParent() ||
9644 LI->getType() != Data.front().first->getType())
9645 continue;
9646 std::optional<int64_t> Dist =
9647 getPointersDiff(ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(),
9648 ElemTyB: Data.front().first->getType(),
9649 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9650 /*StrictCheck=*/true);
9651 if (!Dist)
9652 continue;
9653 SmallSet<int64_t, 4> DataDists;
9654 SmallPtrSet<LoadInst *, 4> DataLoads;
9655 for (std::pair<LoadInst *, int64_t> P : Data) {
9656 DataDists.insert(V: P.second);
9657 DataLoads.insert(Ptr: P.first);
9658 }
9659 // Found matching gathered loads - check if all loads are unique or
9660 // can be effectively vectorized.
9661 unsigned NumUniques = 0;
9662 for (auto [Cnt, Pair] : enumerate(First&: Loads)) {
9663 bool Used = DataLoads.contains(Ptr: Pair.first);
9664 if (!Used && !DataDists.contains(V: *Dist + Pair.second)) {
9665 ++NumUniques;
9666 ToAdd.insert(X: Cnt);
9667 } else if (Used) {
9668 Repeated.insert(X: Cnt);
9669 }
9670 }
9671 if (NumUniques > 0 &&
9672 (Loads.size() == NumUniques ||
9673 (Loads.size() - NumUniques >= 2 &&
9674 Loads.size() - NumUniques >= Loads.size() / 2 &&
9675 (has_single_bit(Value: Data.size() + NumUniques) ||
9676 bit_ceil(Value: Data.size()) <
9677 bit_ceil(Value: Data.size() + NumUniques))))) {
9678 Offset = *Dist;
9679 Start = Idx + 1;
9680 return std::next(x: GatheredLoads.begin(), n: Idx);
9681 }
9682 }
9683 ToAdd.clear();
9684 return GatheredLoads.end();
9685 };
9686 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9687 unsigned Start = 0;
9688 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9689 int64_t Offset = 0;
9690 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9691 Offset, Start);
9692 while (It != GatheredLoads.end()) {
9693 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9694 for (unsigned Idx : LocalToAdd)
9695 It->emplace_back(Args: Data[Idx].first, Args: Data[Idx].second + Offset);
9696 ToAdd.insert_range(R&: LocalToAdd);
9697 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9698 Start);
9699 }
9700 if (any_of(Range: seq<unsigned>(Size: Data.size()), P: [&](unsigned Idx) {
9701 return !ToAdd.contains(key: Idx) && !Repeated.contains(key: Idx);
9702 })) {
9703 auto AddNewLoads =
9704 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9705 for (unsigned Idx : seq<unsigned>(Size: Data.size())) {
9706 if (ToAdd.contains(key: Idx) || Repeated.contains(key: Idx))
9707 continue;
9708 Loads.push_back(Elt: Data[Idx]);
9709 }
9710 };
9711 if (!AddNew) {
9712 LoadInst *LI = Data.front().first;
9713 It = find_if(
9714 Range&: GatheredLoads, P: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9715 return PD.front().first->getParent() == LI->getParent() &&
9716 PD.front().first->getType() == LI->getType();
9717 });
9718 while (It != GatheredLoads.end()) {
9719 AddNewLoads(*It);
9720 It = std::find_if(
9721 first: std::next(x: It), last: GatheredLoads.end(),
9722 pred: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9723 return PD.front().first->getParent() == LI->getParent() &&
9724 PD.front().first->getType() == LI->getType();
9725 });
9726 }
9727 }
9728 GatheredLoads.emplace_back().append(in_start: Data.begin(), in_end: Data.end());
9729 AddNewLoads(GatheredLoads.emplace_back());
9730 }
9731 }
9732}
9733
9734void BoUpSLP::tryToVectorizeGatheredLoads(
9735 const SmallMapVector<
9736 std::tuple<BasicBlock *, Value *, Type *>,
9737 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9738 &GatheredLoads) {
9739 GatheredLoadsEntriesFirst = VectorizableTree.size();
9740
9741 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9742 LoadEntriesToVectorize.size());
9743 for (auto [Idx, Set] : zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize))
9744 Set.insert_range(R&: VectorizableTree[Idx]->Scalars);
9745
9746 // Sort loads by distance.
9747 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9748 const std::pair<LoadInst *, int64_t> &L2) {
9749 return L1.second > L2.second;
9750 };
9751
9752 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9753 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9754 Loads.size());
9755 Align Alignment = computeCommonAlignment<LoadInst>(VL: Values);
9756 auto *Ty = getWidenedType(ScalarTy: Loads.front()->getType(), VF: Loads.size());
9757 return TTI->isLegalMaskedGather(DataType: Ty, Alignment) &&
9758 !TTI->forceScalarizeMaskedGather(Type: Ty, Alignment);
9759 };
9760
9761 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9762 BoUpSLP::ValueSet &VectorizedLoads,
9763 SmallVectorImpl<LoadInst *> &NonVectorized,
9764 bool Final, unsigned MaxVF) {
9765 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9766 unsigned StartIdx = 0;
9767 SmallVector<int> CandidateVFs;
9768 if (VectorizeNonPowerOf2 && has_single_bit(Value: MaxVF + 1))
9769 CandidateVFs.push_back(Elt: MaxVF);
9770 for (int NumElts = getFloorFullVectorNumberOfElements(
9771 TTI: *TTI, Ty: Loads.front()->getType(), Sz: MaxVF);
9772 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9773 TTI: *TTI, Ty: Loads.front()->getType(), Sz: NumElts - 1)) {
9774 CandidateVFs.push_back(Elt: NumElts);
9775 if (VectorizeNonPowerOf2 && NumElts > 2)
9776 CandidateVFs.push_back(Elt: NumElts - 1);
9777 }
9778
9779 if (Final && CandidateVFs.empty())
9780 return Results;
9781
9782 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9783 for (unsigned NumElts : CandidateVFs) {
9784 if (Final && NumElts > BestVF)
9785 continue;
9786 SmallVector<unsigned> MaskedGatherVectorized;
9787 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9788 ++Cnt) {
9789 ArrayRef<LoadInst *> Slice =
9790 ArrayRef(Loads).slice(N: Cnt, M: std::min(a: NumElts, b: E - Cnt));
9791 if (VectorizedLoads.count(Ptr: Slice.front()) ||
9792 VectorizedLoads.count(Ptr: Slice.back()) ||
9793 areKnownNonVectorizableLoads(VL: Slice))
9794 continue;
9795 // Check if it is profitable to try vectorizing gathered loads. It is
9796 // profitable if we have more than 3 consecutive loads or if we have
9797 // less but all users are vectorized or deleted.
9798 bool AllowToVectorize = false;
9799 // Check if it is profitable to vectorize 2-elements loads.
9800 if (NumElts == 2) {
9801 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9802 ElementTy: Slice.front()->getType(), NumElements: ElementCount::getFixed(MinVal: NumElts));
9803 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9804 for (LoadInst *LI : Slice) {
9805 // If single use/user - allow to vectorize.
9806 if (LI->hasOneUse())
9807 continue;
9808 // 1. Check if number of uses equals number of users.
9809 // 2. All users are deleted.
9810 // 3. The load broadcasts are not allowed or the load is not
9811 // broadcasted.
9812 if (static_cast<unsigned int>(std::distance(
9813 first: LI->user_begin(), last: LI->user_end())) != LI->getNumUses())
9814 return false;
9815 if (!IsLegalBroadcastLoad)
9816 continue;
9817 if (LI->hasNUsesOrMore(N: UsesLimit))
9818 return false;
9819 for (User *U : LI->users()) {
9820 if (auto *UI = dyn_cast<Instruction>(Val: U); UI && isDeleted(I: UI))
9821 continue;
9822 for (const TreeEntry *UTE : getTreeEntries(V: U)) {
9823 for (int I : seq<int>(Size: UTE->getNumOperands())) {
9824 if (all_of(Range: UTE->getOperand(OpIdx: I), P: [LI](Value *V) {
9825 return V == LI || isa<PoisonValue>(Val: V);
9826 }))
9827 // Found legal broadcast - do not vectorize.
9828 return false;
9829 }
9830 }
9831 }
9832 }
9833 return true;
9834 };
9835 AllowToVectorize = CheckIfAllowed(Slice);
9836 } else {
9837 AllowToVectorize =
9838 (NumElts >= 3 ||
9839 any_of(Range&: ValueToGatherNodes.at(Val: Slice.front()),
9840 P: [=](const TreeEntry *TE) {
9841 return TE->Scalars.size() == 2 &&
9842 ((TE->Scalars.front() == Slice.front() &&
9843 TE->Scalars.back() == Slice.back()) ||
9844 (TE->Scalars.front() == Slice.back() &&
9845 TE->Scalars.back() == Slice.front()));
9846 })) &&
9847 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(),
9848 Sz: Slice.size());
9849 }
9850 if (AllowToVectorize) {
9851 SmallVector<Value *> PointerOps;
9852 OrdersType CurrentOrder;
9853 // Try to build vector load.
9854 ArrayRef<Value *> Values(
9855 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9856 StridedPtrInfo SPtrInfo;
9857 LoadsState LS = canVectorizeLoads(VL: Values, VL0: Slice.front(), Order&: CurrentOrder,
9858 PointerOps, SPtrInfo, BestVF: &BestVF);
9859 if (LS != LoadsState::Gather ||
9860 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9861 if (LS == LoadsState::ScatterVectorize) {
9862 if (MaskedGatherVectorized.empty() ||
9863 Cnt >= MaskedGatherVectorized.back() + NumElts)
9864 MaskedGatherVectorized.push_back(Elt: Cnt);
9865 continue;
9866 }
9867 if (LS != LoadsState::Gather) {
9868 Results.emplace_back(Args&: Values, Args&: LS);
9869 VectorizedLoads.insert_range(R&: Slice);
9870 // If we vectorized initial block, no need to try to vectorize it
9871 // again.
9872 if (Cnt == StartIdx)
9873 StartIdx += NumElts;
9874 }
9875 // Check if the whole array was vectorized already - exit.
9876 if (StartIdx >= Loads.size())
9877 break;
9878 // Erase last masked gather candidate, if another candidate within
9879 // the range is found to be better.
9880 if (!MaskedGatherVectorized.empty() &&
9881 Cnt < MaskedGatherVectorized.back() + NumElts)
9882 MaskedGatherVectorized.pop_back();
9883 Cnt += NumElts - 1;
9884 continue;
9885 }
9886 }
9887 if (!AllowToVectorize || BestVF == 0)
9888 registerNonVectorizableLoads(VL: Slice);
9889 }
9890 // Mark masked gathers candidates as vectorized, if any.
9891 for (unsigned Cnt : MaskedGatherVectorized) {
9892 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9893 N: Cnt, M: std::min<unsigned>(a: NumElts, b: Loads.size() - Cnt));
9894 ArrayRef<Value *> Values(
9895 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9896 Results.emplace_back(Args&: Values, Args: LoadsState::ScatterVectorize);
9897 VectorizedLoads.insert_range(R&: Slice);
9898 // If we vectorized initial block, no need to try to vectorize it again.
9899 if (Cnt == StartIdx)
9900 StartIdx += NumElts;
9901 }
9902 }
9903 for (LoadInst *LI : Loads) {
9904 if (!VectorizedLoads.contains(Ptr: LI))
9905 NonVectorized.push_back(Elt: LI);
9906 }
9907 return Results;
9908 };
9909 auto ProcessGatheredLoads =
9910 [&, &TTI = *TTI](
9911 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9912 bool Final = false) {
9913 SmallVector<LoadInst *> NonVectorized;
9914 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9915 GatheredLoads) {
9916 if (LoadsDists.size() <= 1) {
9917 NonVectorized.push_back(Elt: LoadsDists.back().first);
9918 continue;
9919 }
9920 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9921 LoadsDists);
9922 SmallVector<LoadInst *> OriginalLoads(make_first_range(c&: LoadsDists));
9923 stable_sort(Range&: LocalLoadsDists, C: LoadSorter);
9924 SmallVector<LoadInst *> Loads;
9925 unsigned MaxConsecutiveDistance = 0;
9926 unsigned CurrentConsecutiveDist = 1;
9927 int64_t LastDist = LocalLoadsDists.front().second;
9928 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9929 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9930 if (isVectorized(V: L.first))
9931 continue;
9932 assert(LastDist >= L.second &&
9933 "Expected first distance always not less than second");
9934 if (static_cast<uint64_t>(LastDist - L.second) ==
9935 CurrentConsecutiveDist) {
9936 ++CurrentConsecutiveDist;
9937 MaxConsecutiveDistance =
9938 std::max(a: MaxConsecutiveDistance, b: CurrentConsecutiveDist);
9939 Loads.push_back(Elt: L.first);
9940 continue;
9941 }
9942 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9943 !Loads.empty())
9944 Loads.pop_back();
9945 CurrentConsecutiveDist = 1;
9946 LastDist = L.second;
9947 Loads.push_back(Elt: L.first);
9948 }
9949 if (Loads.size() <= 1)
9950 continue;
9951 if (AllowMaskedGather)
9952 MaxConsecutiveDistance = Loads.size();
9953 else if (MaxConsecutiveDistance < 2)
9954 continue;
9955 BoUpSLP::ValueSet VectorizedLoads;
9956 SmallVector<LoadInst *> SortedNonVectorized;
9957 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9958 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9959 Final, MaxConsecutiveDistance);
9960 if (!Results.empty() && !SortedNonVectorized.empty() &&
9961 OriginalLoads.size() == Loads.size() &&
9962 MaxConsecutiveDistance == Loads.size() &&
9963 all_of(Range&: Results,
9964 P: [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9965 return P.second == LoadsState::ScatterVectorize;
9966 })) {
9967 VectorizedLoads.clear();
9968 SmallVector<LoadInst *> UnsortedNonVectorized;
9969 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9970 UnsortedResults =
9971 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9972 UnsortedNonVectorized, Final,
9973 OriginalLoads.size());
9974 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9975 SortedNonVectorized.swap(RHS&: UnsortedNonVectorized);
9976 Results.swap(RHS&: UnsortedResults);
9977 }
9978 }
9979 for (auto [Slice, _] : Results) {
9980 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9981 << Slice.size() << ")\n");
9982 if (any_of(Range&: Slice, P: [&](Value *V) { return isVectorized(V); })) {
9983 for (Value *L : Slice)
9984 if (!isVectorized(V: L))
9985 SortedNonVectorized.push_back(Elt: cast<LoadInst>(Val: L));
9986 continue;
9987 }
9988
9989 // Select maximum VF as a maximum of user gathered nodes and
9990 // distance between scalar loads in these nodes.
9991 unsigned MaxVF = Slice.size();
9992 unsigned UserMaxVF = 0;
9993 unsigned InterleaveFactor = 0;
9994 if (MaxVF == 2) {
9995 UserMaxVF = MaxVF;
9996 } else {
9997 // Found distance between segments of the interleaved loads.
9998 std::optional<unsigned> InterleavedLoadsDistance = 0;
9999 unsigned Order = 0;
10000 std::optional<unsigned> CommonVF = 0;
10001 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
10002 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
10003 for (auto [Idx, V] : enumerate(First&: Slice)) {
10004 for (const TreeEntry *E : ValueToGatherNodes.at(Val: V)) {
10005 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: E->Scalars.size());
10006 unsigned Pos =
10007 EntryToPosition.try_emplace(Key: E, Args&: Idx).first->second;
10008 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: Idx - Pos + 1);
10009 if (CommonVF) {
10010 if (*CommonVF == 0) {
10011 CommonVF = E->Scalars.size();
10012 continue;
10013 }
10014 if (*CommonVF != E->Scalars.size())
10015 CommonVF.reset();
10016 }
10017 // Check if the load is the part of the interleaved load.
10018 if (Pos != Idx && InterleavedLoadsDistance) {
10019 if (!DeinterleavedNodes.contains(Ptr: E) &&
10020 any_of(Range: E->Scalars, P: [&, Slice = Slice](Value *V) {
10021 if (isa<Constant>(Val: V))
10022 return false;
10023 if (isVectorized(V))
10024 return true;
10025 const auto &Nodes = ValueToGatherNodes.at(Val: V);
10026 return (Nodes.size() != 1 || !Nodes.contains(key: E)) &&
10027 !is_contained(Range: Slice, Element: V);
10028 })) {
10029 InterleavedLoadsDistance.reset();
10030 continue;
10031 }
10032 DeinterleavedNodes.insert(Ptr: E);
10033 if (*InterleavedLoadsDistance == 0) {
10034 InterleavedLoadsDistance = Idx - Pos;
10035 continue;
10036 }
10037 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
10038 (Idx - Pos) / *InterleavedLoadsDistance < Order)
10039 InterleavedLoadsDistance.reset();
10040 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(u: 1);
10041 }
10042 }
10043 }
10044 DeinterleavedNodes.clear();
10045 // Check if the large load represents interleaved load operation.
10046 if (InterleavedLoadsDistance.value_or(u: 0) > 1 &&
10047 CommonVF.value_or(u: 0) != 0) {
10048 InterleaveFactor = bit_ceil(Value: *InterleavedLoadsDistance);
10049 unsigned VF = *CommonVF;
10050 OrdersType Order;
10051 SmallVector<Value *> PointerOps;
10052 StridedPtrInfo SPtrInfo;
10053 // Segmented load detected - vectorize at maximum vector factor.
10054 if (InterleaveFactor <= Slice.size() &&
10055 TTI.isLegalInterleavedAccessType(
10056 VTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF),
10057 Factor: InterleaveFactor,
10058 Alignment: cast<LoadInst>(Val: Slice.front())->getAlign(),
10059 AddrSpace: cast<LoadInst>(Val: Slice.front())
10060 ->getPointerAddressSpace()) &&
10061 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
10062 SPtrInfo) == LoadsState::Vectorize) {
10063 UserMaxVF = InterleaveFactor * VF;
10064 } else {
10065 InterleaveFactor = 0;
10066 }
10067 }
10068 // Cannot represent the loads as consecutive vectorizable nodes -
10069 // just exit.
10070 unsigned ConsecutiveNodesSize = 0;
10071 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10072 any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
10073 P: [&, Slice = Slice](const auto &P) {
10074 const auto *It = find_if(Slice, [&](Value *V) {
10075 return std::get<1>(P).contains(V);
10076 });
10077 if (It == Slice.end())
10078 return false;
10079 const TreeEntry &TE =
10080 *VectorizableTree[std::get<0>(P)];
10081 ArrayRef<Value *> VL = TE.Scalars;
10082 OrdersType Order;
10083 SmallVector<Value *> PointerOps;
10084 StridedPtrInfo SPtrInfo;
10085 LoadsState State = canVectorizeLoads(
10086 VL, VL0: VL.front(), Order, PointerOps, SPtrInfo);
10087 if (State == LoadsState::ScatterVectorize ||
10088 State == LoadsState::CompressVectorize)
10089 return false;
10090 ConsecutiveNodesSize += VL.size();
10091 size_t Start = std::distance(Slice.begin(), It);
10092 size_t Sz = Slice.size() - Start;
10093 return Sz < VL.size() ||
10094 Slice.slice(N: Start, M: VL.size()) != VL;
10095 }))
10096 continue;
10097 // Try to build long masked gather loads.
10098 UserMaxVF = bit_ceil(Value: UserMaxVF);
10099 if (InterleaveFactor == 0 &&
10100 any_of(Range: seq<unsigned>(Size: Slice.size() / UserMaxVF),
10101 P: [&, Slice = Slice](unsigned Idx) {
10102 OrdersType Order;
10103 SmallVector<Value *> PointerOps;
10104 StridedPtrInfo SPtrInfo;
10105 return canVectorizeLoads(
10106 VL: Slice.slice(N: Idx * UserMaxVF, M: UserMaxVF),
10107 VL0: Slice[Idx * UserMaxVF], Order, PointerOps,
10108 SPtrInfo) == LoadsState::ScatterVectorize;
10109 }))
10110 UserMaxVF = MaxVF;
10111 if (Slice.size() != ConsecutiveNodesSize)
10112 MaxVF = std::min<unsigned>(a: MaxVF, b: UserMaxVF);
10113 }
10114 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10115 bool IsVectorized = true;
10116 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10117 ArrayRef<Value *> SubSlice =
10118 Slice.slice(N: I, M: std::min(a: VF, b: E - I));
10119 if (isVectorized(V: SubSlice.front()))
10120 continue;
10121 // Check if the subslice is to be-vectorized entry, which is not
10122 // equal to entry.
10123 if (any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
10124 P: [&](const auto &P) {
10125 return !SubSlice.equals(
10126 RHS: VectorizableTree[std::get<0>(P)]
10127 ->Scalars) &&
10128 set_is_subset(SubSlice, std::get<1>(P));
10129 }))
10130 continue;
10131 unsigned Sz = VectorizableTree.size();
10132 buildTreeRec(Roots: SubSlice, Depth: 0, EI: EdgeInfo(), InterleaveFactor);
10133 if (Sz == VectorizableTree.size()) {
10134 IsVectorized = false;
10135 // Try non-interleaved vectorization with smaller vector
10136 // factor.
10137 if (InterleaveFactor > 0) {
10138 VF = 2 * (MaxVF / InterleaveFactor);
10139 InterleaveFactor = 0;
10140 }
10141 continue;
10142 }
10143 }
10144 if (IsVectorized)
10145 break;
10146 }
10147 }
10148 NonVectorized.append(RHS: SortedNonVectorized);
10149 }
10150 return NonVectorized;
10151 };
10152 for (const auto &GLs : GatheredLoads) {
10153 const auto &Ref = GLs.second;
10154 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10155 if (!Ref.empty() && !NonVectorized.empty() &&
10156 std::accumulate(
10157 first: Ref.begin(), last: Ref.end(), init: 0u,
10158 binary_op: [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10159 -> unsigned { return S + LoadsDists.size(); }) !=
10160 NonVectorized.size() &&
10161 IsMaskedGatherSupported(NonVectorized)) {
10162 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
10163 FinalGatheredLoads;
10164 for (LoadInst *LI : NonVectorized) {
10165 // Reinsert non-vectorized loads to other list of loads with the same
10166 // base pointers.
10167 gatherPossiblyVectorizableLoads(R: *this, VL: LI, DL: *DL, SE&: *SE, TTI: *TTI,
10168 GatheredLoads&: FinalGatheredLoads,
10169 /*AddNew=*/false);
10170 }
10171 // Final attempt to vectorize non-vectorized loads.
10172 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10173 }
10174 }
10175 // Try to vectorize postponed load entries, previously marked as gathered.
10176 for (unsigned Idx : LoadEntriesToVectorize) {
10177 const TreeEntry &E = *VectorizableTree[Idx];
10178 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10179 // Avoid reordering, if possible.
10180 if (!E.ReorderIndices.empty()) {
10181 // Build a mask out of the reorder indices and reorder scalars per this
10182 // mask.
10183 SmallVector<int> ReorderMask;
10184 inversePermutation(Indices: E.ReorderIndices, Mask&: ReorderMask);
10185 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
10186 }
10187 buildTreeRec(Roots: GatheredScalars, Depth: 0, EI: EdgeInfo());
10188 }
10189 // If no new entries created, consider it as no gathered loads entries must be
10190 // handled.
10191 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10192 VectorizableTree.size())
10193 GatheredLoadsEntriesFirst.reset();
10194}
10195
/// Generates key/subkey pair for the given value to provide effective sorting
/// of the values and better detection of the vectorizable values sequences. The
/// keys/subkeys can be used for better sorting of the values themselves (keys)
/// and in values subgroups (subkeys).
///
/// \param LoadsSubkeyGenerator callback producing a subkey for a simple load
/// (per the caller's grouping policy, e.g. pointer distance).
/// \param AllowAlternate if true, binary operators and casts are keyed only by
/// their broad kind so alternate-opcode sequences can fall into one group.
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  // +2 keeps the key clear of the 0/1 values used for the alternate-opcode
  // grouping below.
  hash_code Key = hash_value(value: V->getValueID() + 2);
  hash_code SubKey = hash_value(value: 0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
    Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
    if (LI->isSimple())
      SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
    else
      // Non-simple (volatile/atomic) loads hash to themselves so they never
      // share a group with other loads.
      Key = SubKey = hash_value(ptr: LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(Val: V))
      Key = hash_value(value: Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
      // Only extracts with a meaningful source vector and a defined index get
      // grouped by their source vector.
      if (!isUndefVector(V: EI->getVectorOperand()).all() &&
          !isa<UndefValue>(Val: EI->getIndexOperand()))
        SubKey = hash_value(ptr: EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(Val: I)) &&
        isValidForAlternation(Opcode: I->getOpcode())) {
      if (AllowAlternate)
        Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
      else
        Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
      SubKey = hash_combine(
          args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
          args: hash_value(ptr: isa<BinaryOperator>(Val: I)
                         ? I->getType()
                         : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(Val: I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(args: OpVals.first, args: Key);
        SubKey = hash_combine(args: OpVals.first, args: SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
      CmpInst::Predicate Pred = CI->getPredicate();
      // Canonicalize commutative compares so a predicate and its inverse hash
      // the same way; also mix in the swapped predicate for operand order.
      if (CI->isCommutative())
        Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
      SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
                            args: hash_value(value: SwapPred),
                            args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
      // Calls group by intrinsic ID, vectorizable library callee, or (if
      // neither) by the call itself so it is never merged.
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
      } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
                              args: hash_value(ptr: Call->getCalledFunction()));
      } else {
        Key = hash_combine(args: hash_value(ptr: Call), args: Key);
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
      }
      // Operand bundles must match for calls to be interchangeable.
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
                              args: hash_value(ptr: Op.Tag), args: SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
      // Simple GEPs (pointer + constant index) group by their base pointer.
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
        SubKey = hash_value(ptr: Gep->getPointerOperand());
      else
        SubKey = hash_value(ptr: Gep);
    } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
               !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(ptr: I);
    } else {
      SubKey = hash_value(value: I->getOpcode());
    }
    Key = hash_combine(args: hash_value(value: I->getParent()->getNumber()), args: Key);
  }
  return std::make_pair(x&: Key, y&: SubKey);
}
10282
/// Checks if the specified instruction \p I is a main operation for the given
/// \p MainOp and \p AltOp instructions.
10285static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10286 Instruction *AltOp, const TargetLibraryInfo &TLI);
10287
10288/// Builds the arguments types vector for the given call instruction with the
10289/// given \p ID for the specified vector factor.
10290static SmallVector<Type *>
10291buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10292 const unsigned VF, unsigned MinBW,
10293 const TargetTransformInfo *TTI) {
10294 SmallVector<Type *> ArgTys;
10295 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
10296 if (ID != Intrinsic::not_intrinsic) {
10297 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) {
10298 ArgTys.push_back(Elt: Arg->getType());
10299 continue;
10300 }
10301 if (MinBW > 0) {
10302 ArgTys.push_back(
10303 Elt: getWidenedType(ScalarTy: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), VF));
10304 continue;
10305 }
10306 }
10307 ArgTys.push_back(Elt: getWidenedType(ScalarTy: Arg->getType(), VF));
10308 }
10309 return ArgTys;
10310}
10311
10312/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10313/// function (if possible) calls. Returns invalid cost for the corresponding
10314/// calls, if they cannot be vectorized/will be scalarized.
10315static std::pair<InstructionCost, InstructionCost>
10316getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10317 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10318 ArrayRef<Type *> ArgTys) {
10319 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
10320 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
10321 HasGlobalPred: false /*HasGlobalPred*/);
10322 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10323 auto LibCost = InstructionCost::getInvalid();
10324 if (!CI->isNoBuiltin() && VecFunc) {
10325 // Calculate the cost of the vector library call.
10326 // If the corresponding vector call is cheaper, return its cost.
10327 LibCost =
10328 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
10329 }
10330 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10331
10332 // Calculate the cost of the vector intrinsic call.
10333 FastMathFlags FMF;
10334 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
10335 FMF = FPCI->getFastMathFlags();
10336 const InstructionCost ScalarLimit = 10000;
10337 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10338 LibCost.isValid() ? LibCost : ScalarLimit);
10339 auto IntrinsicCost =
10340 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
10341 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10342 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10343 IntrinsicCost = InstructionCost::getInvalid();
10344
10345 return {IntrinsicCost, LibCost};
10346}
10347
10348/// Find the innermost loop starting from \p L, for which at least a single
10349/// value in \p VL is not invariant.
10350static const Loop *findInnermostNonInvariantLoop(const Loop *L,
10351 ArrayRef<Value *> VL) {
10352 assert(L && "Expected valid loop");
10353 auto IsLoopInvariant = [&](const Loop *L, ArrayRef<Value *> VL) {
10354 return all_of(Range&: VL, P: [&](Value *V) {
10355 return isa<Constant>(Val: V) || !isa<Instruction>(Val: V) || L->isLoopInvariant(V);
10356 });
10357 };
10358 while (L && IsLoopInvariant(L, VL))
10359 L = L->getParentLoop();
10360 return L;
10361}
10362
10363/// Get the loop nest for the given loop.
10364ArrayRef<const Loop *> BoUpSLP::getLoopNest(const Loop *L) {
10365 assert(L && "Expected valid loop");
10366 if (LoopAwareTripCount == 0)
10367 return {};
10368 SmallVector<const Loop *> &Res =
10369 LoopToLoopNest.try_emplace(Key: L).first->getSecond();
10370 if (!Res.empty())
10371 return Res;
10372 SmallVector<const Loop *> LoopNest;
10373 while (L) {
10374 LoopNest.push_back(Elt: L);
10375 L = L->getParentLoop();
10376 }
10377 Res.assign(in_start: LoopNest.rbegin(), in_end: LoopNest.rend());
10378 return Res;
10379}
10380
/// Decides how the bundle \p VL (with common instruction state \p S) can be
/// represented in the SLP graph: fully vectorized, one of the special load
/// vectorization modes, or gathered. For loads/stores also fills the side
/// outputs \p CurrentOrder, \p PointerOps and \p SPtrInfo.
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  // Alternate-opcode bundles are modeled as a ShuffleVector node.
  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(Val: V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    // Gather if any extract (or non-extract value) pulls from an
    // already-vectorized source vector.
    if (any_of(Range&: VL, P: [&](Value *V) {
          auto *EI = dyn_cast<ExtractElementInst>(Val: V);
          if (!EI)
            return true;
          return isVectorized(V: EI->getOperand(i_nocapture: 0));
        }))
      return TreeEntry::NeedToGather;
    [[fallthrough]];
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
    // non-full registers).
    if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: VL0->getType(), Sz: VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    // At most one value may come from outside the insert chain (the initial
    // source vector); two or more means a real shuffle of distinct vectors.
    if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
          return !SourceVectors.contains(Ptr: V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }

    if (any_of(Range&: VL, P: [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(Ptr: V) && !V->hasOneUse();
        })) {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    // True if all scalars already belong to gathered-loads tree entries; such
    // bundles are kept as gathers instead of re-vectorizing.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return all_of(Range&: VL, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        return any_of(Range: getTreeEntries(V), P: [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      // For the slower load kinds, postpone vectorization (record the entry
      // index in LoadEntriesToVectorize) once the tree already has nodes, so
      // they can be retried later with more context.
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // All casts in the bundle must convert from the same (valid) source type.
    Type *SrcTy = VL0->getOperand(i: 0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    // A swapped predicate is also acceptable (operands get commuted later).
    CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
    Type *ComparedTy = VL0->getOperand(i: 0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      auto *Cmp = cast<CmpInst>(Val: V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
    if (SLPReVec) {
      // Under REVEC, all selects must agree on the condition type (scalar vs
      // vector condition cannot be mixed in one wide select).
      SmallPtrSet<Type *, 4> CondTypes;
      for (Value *V : VL) {
        Value *Cond;
        if (!match(V, P: m_Select(C: m_Value(V&: Cond), L: m_Value(), R: m_Value())) &&
            !match(V, P: m_ZExt(Op: m_Value(V&: Cond))))
          continue;
        CondTypes.insert(Ptr: Cond->getType());
      }
      if (CondTypes.size() > 1) {
        LLVM_DEBUG(
            dbgs()
            << "SLP: Gathering select with different condition types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    [[fallthrough]];
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    // On targets where FP vectorization may change semantics, only fast-math
    // binary ops are allowed.
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
          auto *I = dyn_cast<Instruction>(Val: V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(Val: V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(Val: V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    // Under a scatter-vectorize user, non-constant indexes are tolerated as
    // long as their width fits the target's pointer index width.
    Type *Ty1 = VL0->getOperand(i: 1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(Val: V);
      if (!I)
        continue;
      auto *Op = I->getOperand(i_nocapture: 1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    AS: V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
        DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(Val: V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(Elt: SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
      // Check that the sorted pointer operands are consecutive.
      // NOTE(review): *Dist is dereferenced unchecked; presumably a
      // successful sortPtrAccesses guarantees the diff is computable -
      // confirm.
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    // Same FP-safety restriction as for binary operators above.
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
          auto *I = dyn_cast<Instruction>(Val: V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(Val: VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        FTy: CI->getFunctionType(),
        EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
        HasGlobalPred: false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    // Remember the scalar operands of the first call; every other call must
    // pass identical values at those positions.
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI))
        ScalarArgs[J] = CI->getArgOperand(i: J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(Val: V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order for
      // them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) {
          Value *A1J = CI2->getArgOperand(i: J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    // Gather if neither the intrinsic nor a library variant has a valid
    // vector cost.
    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VF: VL.size(), MinBW: 0, TTI);
    auto *VecTy = getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
10798
10799namespace {
10800/// Allows to correctly handle operands of the phi nodes based on the \p Main
10801/// PHINode order of incoming basic blocks/values.
10802class PHIHandler {
10803 DominatorTree &DT;
10804 PHINode *Main = nullptr;
10805 SmallVector<Value *> Phis;
10806 SmallVector<SmallVector<Value *>> Operands;
10807
10808public:
  PHIHandler() = delete;
  /// \p Main is the reference PHI whose incoming-block order defines the
  /// operand layout; \p Phis are the bundled PHI values (or poison).
  /// Operands is pre-sized to one slot per (incoming block x bundled value).
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  /// Fills Operands so that Operands[I][Idx] is the value Phis[Idx] receives
  /// from the I-th incoming block of Main. Unreachable incoming blocks are
  /// filled with poison.
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    // Fast path: few incoming blocks - resolve every (block, phi) pair
    // directly.
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(i: I);
        if (!DT.isReachableFromEntry(A: InBB)) {
          Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(First&: Phis)) {
          auto *P = dyn_cast<PHINode>(Val: V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          // Prefer positional match; fall back to block lookup when this
          // phi's incoming blocks are ordered differently than Main's.
          if (P->getIncomingBlock(i: I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(i: I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
        }
      }
      return;
    }
    // Slow path: group Main's incoming indices by block (a block may appear
    // more than once), then resolve each phi against that grouping.
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
        Blocks;
    for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(i: I);
      if (!DT.isReachableFromEntry(A: InBB)) {
        Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
        continue;
      }
      Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
    }
    for (auto [Idx, V] : enumerate(First&: Phis)) {
      if (isa<PoisonValue>(Val: V)) {
        for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(Val: V);
      for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(i: I);
        if (InBB == Main->getIncomingBlock(i: I)) {
          // Positional match; keep the poison filler for unreachable blocks.
          if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
            continue;
          Operands[I][Idx] = P->getIncomingValue(i: I);
          continue;
        }
        // Different order - record the value at the first index Main uses
        // for this block (duplicates are replicated below).
        auto *It = Blocks.find(Key: InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
      }
    }
    // Blocks appearing multiple times: copy the operand list built for the
    // first index to all remaining indices of the same block.
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.consume_front();
      for (unsigned I : IncomingValues) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
10887 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10888};
10889} // namespace
10890
10891/// Returns main/alternate instructions for the given \p VL. Unlike
10892/// getSameOpcode supports non-compatible instructions for better SplitVectorize
10893/// node support.
10894/// \returns first main/alt instructions, if only poisons and instruction with
10895/// only 2 opcodes exists. Returns pair of nullptr otherwise.
10896static std::pair<Instruction *, Instruction *>
10897getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10898 Instruction *MainOp = nullptr;
10899 Instruction *AltOp = nullptr;
10900 for (Value *V : VL) {
10901 if (isa<PoisonValue>(Val: V))
10902 continue;
10903 auto *I = dyn_cast<Instruction>(Val: V);
10904 if (!I)
10905 return {};
10906 if (!MainOp) {
10907 MainOp = I;
10908 continue;
10909 }
10910 if (MainOp->getOpcode() == I->getOpcode()) {
10911 if (I->getParent() != MainOp->getParent())
10912 return {};
10913 continue;
10914 }
10915 if (!AltOp) {
10916 AltOp = I;
10917 continue;
10918 }
10919 if (AltOp->getOpcode() == I->getOpcode()) {
10920 if (I->getParent() != AltOp->getParent())
10921 return {};
10922 continue;
10923 }
10924 return {};
10925 }
10926 if (!AltOp)
10927 return {};
10928 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10929 "Expected different main and alt instructions.");
10930 return std::make_pair(x&: MainOp, y&: AltOp);
10931}
10932
/// Checks that every instruction appears once in the list and if not, packs
/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
/// unique scalars is extended by poison values to the whole register size.
///
/// \returns false if \p VL could not be uniquified, in which case \p VL is
/// unchanged and \p ReuseShuffleIndices is empty.
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      // Constants are always considered distinct, even if the same constant
      // appears multiple times in VL.
      ReuseShuffleIndices.emplace_back(
          Args: isa<PoisonValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(Args&: V);
      continue;
    }
    // Non-constants: a repeated value maps to the position of its first
    // occurrence in UniqueValues via the reuse mask.
    auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(Args&: V);
  }

  // Easy case: VL has unique values and a "natural" size
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, Ty: getValueType(V: UniqueValues.front()), Sz: NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    // No duplicates - no reuse mask needed.
    ReuseShuffleIndices.clear();
    return true;
  }

  // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
  if ((UserTreeIdx.UserTE &&
       UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      !hasFullVectorsOrPowerOf2(TTI, Ty: getValueType(V: VL.front()), Sz: VL.size())) {
    LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                         "for nodes with padding.\n");
    ReuseShuffleIndices.clear();
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
  // Hard cases: a single unique scalar, a non-"nice" unique count, or a
  // splat of one non-constant value.
  if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      (UniquePositions.size() == 1 && all_of(Range&: UniqueValues, P: [](Value *V) {
         return isa<UndefValue>(Val: V) || !isConstant(V);
       }))) {
    // Optionally try to pad the unique scalars with poison up to a full
    // vector size instead of giving up.
    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
        S.getMainOp()->isSafeToRemove() &&
        (S.areInstructionsWithCopyableElements() ||
         all_of(Range&: UniqueValues, P: IsaPred<Instruction, PoisonValue>))) {
      // Find the number of elements, which forms full vectors.
      unsigned PWSz = getFullVectorNumberOfElements(
          TTI, Ty: UniqueValues.front()->getType(), Sz: UniqueValues.size());
      PWSz = std::min<unsigned>(a: PWSz, b: VL.size());
      if (PWSz == VL.size()) {
        // We ended up with the same size after removing duplicates and
        // upgrading the resulting vector size to a "nice size". Just keep
        // the initial VL then.
        ReuseShuffleIndices.clear();
      } else {
        // Pad unique values with poison to grow the vector to a "nice" size
        SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                UniqueValues.end());
        PaddedUniqueValues.append(
            NumInputs: PWSz - UniqueValues.size(),
            Elt: PoisonValue::get(T: UniqueValues.front()->getType()));
        // Check that extended with poisons/copyable operations are still valid
        // for vectorization (div/rem are not allowed).
        if ((!S.areInstructionsWithCopyableElements() &&
             !getSameOpcode(VL: PaddedUniqueValues, TLI).valid()) ||
            (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
             (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
              isa<CallInst>(Val: S.getMainOp())))) {
          LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
          ReuseShuffleIndices.clear();
          return false;
        }
        VL = std::move(PaddedUniqueValues);
      }
      return true;
    }
    LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
    ReuseShuffleIndices.clear();
    return false;
  }
  // Regular case: operate on the de-duplicated scalars plus the reuse mask.
  VL = std::move(UniqueValues);
  return true;
}
11031
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  // Splitting tiny bundles is never profitable; some targets also prefer
  // keeping alternate opcodes in a single node (e.g. to form addsub).
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  // Check if this is a duplicate of another split entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  for (TreeEntry *E : getSplitTreeEntries(V: LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
    SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
    if (all_of(Range&: VL, P: [&](Value *V) {
          return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
      return false;
    }
  }

  // Partition VL: main-opcode values (and non-instructions) go to Op1,
  // alternate-opcode values to Op2. Op1Indices remembers which lanes went
  // to Op1 so the reorder mask can be built afterwards.
  ReorderIndices.assign(NumElts: VL.size(), Elt: VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(First&: VL)) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I) {
      Op1.push_back(Elt: V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         isMainInstruction(I, MainOp: LocalState.getMainOp(), AltOp: LocalState.getAltOp(),
                           TLI: *TLI)) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, MainOp: LocalState.getMainOp(),
                                 AltOp: LocalState.getAltOp(), TLI: *TLI))) {
      Op1.push_back(Elt: V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(Elt: V);
  }
  Type *ScalarTy = getValueType(V: VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node, only if all nodes do not form legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
  SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
  // Each side must have at least 2 unique values and a power-of-2/full
  // register count, otherwise the split halves cannot be vectorized.
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op1.front()->getType(), Sz: Op1.size()) ||
      !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op2.front()->getType(), Sz: Op2.size()))
    return false;
  // Enable split node, only if all nodes are power-of-2/full registers.
  // Build the order mapping positions [0, Op1.size()) to Op1 lanes and the
  // rest to Op2 lanes.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(Order: ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(Indices: ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
  VectorType *Op1VecTy = getWidenedType(ScalarTy, VF: Op1.size());
  VectorType *Op2VecTy = getWidenedType(ScalarTy, VF: Op2.size());
  // Check non-profitable single register ops, which better to be represented
  // as alternate ops.
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  // Cost of concatenating the two split halves back into a single vector.
  InstructionCost InsertCost = ::getShuffleCost(
      TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind: Kind, Index: Op1.size(), SubTp: Op2VecTy);
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, VF: std::max(a: Op1.size(), b: Op2.size()));
  InstructionCost NewShuffleCost =
      ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: SubVecTy, Mask, CostKind: Kind);
  // For single-register non-compare bundles, splitting is not profitable if
  // the reassembly shuffle costs at least as much as the blend it replaces.
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  // For binary/cast/unary alternates compare the full cost of the original
  // alternate node (two wide ops + blend) against the split form (two narrow
  // ops + insert [+ shuffle when the root is a store]).
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: VecTy, CostKind: Kind) +
        TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: VecTy, CostKind: Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
      if (isa<PoisonValue>(Val: VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
                                              Tp: VecTy, Mask: OriginalMask, CostKind: Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: Op1VecTy, CostKind: Kind) +
        TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: Op2VecTy, CostKind: Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
11162
11163namespace {
11164/// Class accepts incoming list of values, checks if it is able to model
11165/// "copyable" values as compatible operations, and generates the list of values
11166/// for scheduling and list of operands doe the new nodes.
11167class InstructionsCompatibilityAnalysis {
11168 DominatorTree &DT;
11169 const DataLayout &DL;
11170 const TargetTransformInfo &TTI;
11171 const TargetLibraryInfo &TLI;
11172 unsigned MainOpcode = 0;
11173 Instruction *MainOp = nullptr;
11174
11175 /// Checks if the opcode is supported as the main opcode for copyable
11176 /// elements.
11177 static bool isSupportedOpcode(const unsigned Opcode) {
11178 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11179 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11180 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11181 Opcode == Instruction::And || Opcode == Instruction::Or ||
11182 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11183 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11184 Opcode == Instruction::FDiv;
11185 }
11186
  /// Identifies the best candidate value, which represents main opcode
  /// operation.
  /// Currently the best candidate is the Add instruction with the parent
  /// block with the highest DFS incoming number (block, that dominates other).
  void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
    BasicBlock *Parent = nullptr;
    // Checks if the instruction has supported opcode.
    auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
      // NOTE(review): I is dereferenced here before the `I &&` check below;
      // all call sites pass non-null instructions, so the null-check is
      // redundant - confirm before relying on null tolerance.
      if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
        return false;
      return I && isSupportedOpcode(Opcode: I->getOpcode()) &&
             (!doesNotNeedToBeScheduled(V: I) || !R.isVectorized(V: I));
    };
    // Exclude operands instructions immediately to improve compile time, it
    // will be unable to schedule anyway.
    SmallDenseSet<Value *, 8> Operands;
    // Candidate instructions from the currently-deepest block, grouped by
    // opcode.
    SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
    bool AnyUndef = false;
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(Val: V);
      if (!I) {
        AnyUndef |= isa<UndefValue>(Val: V);
        continue;
      }
      if (!DT.isReachableFromEntry(A: I->getParent()))
        continue;
      if (Candidates.empty()) {
        Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
        Parent = I->getParent();
        Operands.insert(I: I->op_begin(), E: I->op_end());
        continue;
      }
      if (Parent == I->getParent()) {
        Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
        Operands.insert(I: I->op_begin(), E: I->op_end());
        continue;
      }
      auto *NodeA = DT.getNode(BB: Parent);
      auto *NodeB = DT.getNode(BB: I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      // Keep only candidates from the block with the highest DFS-in number
      // (the dominated block): restart collection when a deeper block is
      // found, drop instructions from shallower blocks otherwise.
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
        Candidates.clear();
        Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
        Parent = I->getParent();
        Operands.clear();
        Operands.insert(I: I->op_begin(), E: I->op_end());
      }
    }
    // Pick the best opcode group: prefer groups fully used outside the block,
    // then larger groups; within a group, the first supported instruction
    // becomes the main op.
    unsigned BestOpcodeNum = 0;
    MainOp = nullptr;
    bool UsedOutside = false;
    for (const auto &P : Candidates) {
      bool PUsedOutside = all_of(Range: P.second, P: isUsedOutsideBlock);
      if (UsedOutside && !PUsedOutside)
        continue;
      if (!UsedOutside && PUsedOutside)
        BestOpcodeNum = 0;
      if (P.second.size() < BestOpcodeNum)
        continue;
      // If have inner dependencies - skip.
      if (!PUsedOutside && any_of(Range: P.second, P: [&](Instruction *I) {
            return Operands.contains(V: I);
          }))
        continue;
      UsedOutside = PUsedOutside;
      for (Instruction *I : P.second) {
        if (IsSupportedInstruction(I, AnyUndef)) {
          MainOp = I;
          BestOpcodeNum = P.second.size();
          break;
        }
      }
    }
    if (MainOp) {
      // Do not match, if any copyable is a terminator from the same block as
      // the main operation.
      if (any_of(Range&: VL, P: [&](Value *V) {
            auto *I = dyn_cast<Instruction>(Val: V);
            return I && I->getParent() == MainOp->getParent() &&
                   I->isTerminator();
          })) {
        MainOp = nullptr;
        return;
      }
      MainOpcode = MainOp->getOpcode();
    }
  }
11278
11279 /// Returns the idempotent value for the \p MainOp with the detected \p
11280 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11281 /// the operand itself, since V or V == V.
11282 Value *selectBestIdempotentValue() const {
11283 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11284 return ConstantExpr::getBinOpIdentity(Opcode: MainOpcode, Ty: MainOp->getType(),
11285 AllowRHSConstant: !MainOp->isCommutative());
11286 }
11287
11288 /// Returns the value and operands for the \p V, considering if it is original
11289 /// instruction and its actual operands should be returned, or it is a
11290 /// copyable element and its should be represented as idempotent instruction.
11291 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11292 if (isa<PoisonValue>(Val: V))
11293 return {V, V};
11294 if (!S.isCopyableElement(V))
11295 return convertTo(I: cast<Instruction>(Val: V), S).second;
11296 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11297 return {V, selectBestIdempotentValue()};
11298 }
11299
  /// Builds operands for the original instructions.
  /// Fills \p Operands so that Operands[OpIdx][Lane] is the OpIdx-th operand
  /// of VL[Lane]; non-instruction lanes receive poison of the matching type.
  void
  buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
                        SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {

    unsigned ShuffleOrOp =
        S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
    Instruction *VL0 = S.getMainOp();

    switch (ShuffleOrOp) {
    case Instruction::PHI: {
      auto *PH = cast<PHINode>(Val: VL0);

      // Keeps the reordered operands to avoid code duplication.
      PHIHandler Handler(DT, PH, VL);
      Handler.buildOperands();
      Operands.assign(NumElts: PH->getNumOperands(), Elt: {});
      for (unsigned I : seq<unsigned>(Size: PH->getNumOperands()))
        Operands[I].assign(in_start: Handler.getOperands(I).begin(),
                           in_end: Handler.getOperands(I).end());
      return;
    }
    case Instruction::ExtractValue:
    case Instruction::ExtractElement:
      // This is a special case, as it does not gather, but at the same time
      // we are not extending buildTree_rec() towards the operands.
      Operands.assign(NumElts: 1, Elt: {VL.size(), VL0->getOperand(i: 0)});
      return;
    case Instruction::InsertElement:
      // Collect the base vector and inserted value per lane.
      Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(First&: VL)) {
        auto *IE = cast<InsertElementInst>(Val: V);
        for (auto [OpIdx, Ops] : enumerate(First&: Operands))
          Ops[Idx] = IE->getOperand(i_nocapture: OpIdx);
      }
      return;
    case Instruction::Load:
      // Single operand: pointer per lane; non-load lanes stay poison.
      Operands.assign(
          NumElts: 1, Elt: {VL.size(),
              PoisonValue::get(T: cast<LoadInst>(Val: VL0)->getPointerOperandType())});
      for (auto [V, Op] : zip(t&: VL, u&: Operands.back())) {
        auto *LI = dyn_cast<LoadInst>(Val: V);
        if (!LI)
          continue;
        Op = LI->getPointerOperand();
      }
      return;
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast:
    case Instruction::ICmp:
    case Instruction::FCmp:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze:
    case Instruction::Store:
    case Instruction::ShuffleVector:
      // Generic case for casts, compares, unary/binary ops, stores: take the
      // (possibly converted) operands of each lane's instruction.
      Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(First&: VL)) {
        auto *I = dyn_cast<Instruction>(Val: V);
        if (!I) {
          for (auto [OpIdx, Ops] : enumerate(First&: Operands))
            Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
          continue;
        }
        auto [Op, ConvertedOps] = convertTo(I, S);
        for (auto [OpIdx, Ops] : enumerate(First&: Operands))
          Ops[Idx] = ConvertedOps[OpIdx];
      }
      return;
    case Instruction::Select:
      Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
      for (auto [Idx, V] : enumerate(First&: VL)) {
        auto *I = dyn_cast<Instruction>(Val: V);
        if (!I) {
          for (auto [OpIdx, Ops] : enumerate(First&: Operands))
            Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
          continue;
        }
        if (isa<ZExtInst>(Val: I)) {
          // Special case for select + zext i1 to avoid explosion of different
          // types. We want to keep the condition as i1 to be able to match
          // different selects together and reuse the vectorized condition
          // rather than trying to gather it.
          Operands[0][Idx] = I->getOperand(i: 0);
          Operands[1][Idx] = ConstantInt::get(Ty: I->getType(), V: 1);
          Operands[2][Idx] = ConstantInt::getNullValue(Ty: I->getType());
          continue;
        }
        auto [Op, ConvertedOps] = convertTo(I, S);
        for (auto [OpIdx, Ops] : enumerate(First&: Operands))
          Ops[Idx] = ConvertedOps[OpIdx];
      }
      return;
    case Instruction::GetElementPtr: {
      Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
      // Need to cast all indices to the same type before vectorization to
      // avoid crash.
      // Required to be able to find correct matches between different gather
      // nodes and reuse the vectorized values rather than trying to gather them
      // again.
      const unsigned IndexIdx = 1;
      Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
      // Use VL0's index type when all GEPs agree; otherwise fall back to the
      // pointer's canonical index type.
      Type *Ty =
          all_of(Range&: VL,
                 P: [&](Value *V) {
                   auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
                   return !GEP || VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
                 })
              ? VL0Ty
              : DL.getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
                                    ->getPointerOperandType()
                                    ->getScalarType());
      for (auto [Idx, V] : enumerate(First&: VL)) {
        auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
        if (!GEP) {
          // Non-GEP lanes act as "pointer + 0".
          Operands[0][Idx] = V;
          Operands[1][Idx] = ConstantInt::getNullValue(Ty);
          continue;
        }
        Operands[0][Idx] = GEP->getPointerOperand();
        auto *Op = GEP->getOperand(i_nocapture: IndexIdx);
        auto *CI = dyn_cast<ConstantInt>(Val: Op);
        // Constant indices are folded to the common index type up front.
        Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
                                    C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL)
                              : Op;
      }
      return;
    }
    case Instruction::Call: {
      auto *CI = cast<CallInst>(Val: VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
      // Gather per-lane call arguments, skipping arguments that must stay
      // scalar for the vector intrinsic.
      for (unsigned Idx : seq<unsigned>(Size: CI->arg_size())) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI: &TTI))
          continue;
        auto &Ops = Operands.emplace_back();
        for (Value *V : VL) {
          auto *I = dyn_cast<Instruction>(Val: V);
          Ops.push_back(Elt: I ? I->getOperand(i: Idx)
                           : PoisonValue::get(T: VL0->getOperand(i: Idx)->getType()));
        }
      }
      return;
    }
    default:
      break;
    }
    llvm_unreachable("Unexpected vectorization of the instructions.");
  }
11475
11476 /// Check if the specified \p VL list of values is better to represent as
11477 /// uniform with copyables, as modeled via \p CopyableS, or as alternate (or
11478 /// uniform with compatible ops), modeled via \p S.
11479 /// Performs the analysis of the operands, choosing the preferred main
11480 /// instruction and checking the matching of the operands for the main
11481 /// instruction and copyable elements.
11482 bool isCopyablePreferable(ArrayRef<Value *> VL, const BoUpSLP &R,
11483 const InstructionsState &S,
11484 const InstructionsState &CopyableS) {
11485 // If all elements are vectorized already - keep as is.
11486 if (all_of(Range&: VL, P: [&](Value *V) {
11487 return isa<PoisonValue>(Val: V) || R.isVectorized(V);
11488 }))
11489 return false;
11490 Instruction *SMain = S.getMainOp();
11491 Instruction *SAlt = S.isAltShuffle() ? S.getAltOp() : nullptr;
11492 const bool IsCommutative = ::isCommutative(I: SMain);
11493 const bool IsAltCommutative =
11494 S.isAltShuffle() ? ::isCommutative(I: SAlt) : false;
11495 const bool IsMainCommutative = ::isCommutative(I: MainOp);
11496 SmallVector<BoUpSLP::ValueList> Ops;
11497 buildOriginalOperands(S, VL: SMain, Operands&: Ops);
11498 // Support only binary operations for now.
11499 if (Ops.size() != 2)
11500 return false;
11501 // Try to find better candidate for S main instruction, which operands have
11502 // better matching.
11503 auto CheckOperands = [](Value *Op, Value *SMainOp) {
11504 auto *OpI = dyn_cast<BinaryOperator>(Val: Op);
11505 if (!OpI)
11506 return false;
11507 auto *SMainOpI = dyn_cast<BinaryOperator>(Val: SMainOp);
11508 if (!SMainOpI)
11509 return true;
11510 return any_of(Range: OpI->operands(), P: [&](Value *V) {
11511 auto *I = dyn_cast<Instruction>(Val: V);
11512 return I && I->getOpcode() == SMainOpI->getOpcode();
11513 });
11514 };
11515 SmallPtrSet<Value *, 8> Operands;
11516 for (Value *V : VL) {
11517 auto *I = dyn_cast<Instruction>(Val: V);
11518 if (!I || I == SMain)
11519 continue;
11520 Instruction *MatchingOp = S.getMatchingMainOpOrAltOp(I);
11521 if (MatchingOp != SMain)
11522 continue;
11523 SmallVector<BoUpSLP::ValueList> VOps;
11524 buildOriginalOperands(S, VL: I, Operands&: VOps);
11525 Operands.insert(I: I->op_begin(), E: I->op_end());
11526 assert(VOps.size() == 2 && Ops.size() == 2 &&
11527 "Expected binary operations only.");
11528 if (CheckOperands(VOps[0][0], Ops[0][0]) ||
11529 CheckOperands(VOps[1][0], Ops[1][0]) ||
11530 (IsCommutative && (CheckOperands(VOps[0][0], Ops[1][0]) ||
11531 CheckOperands(VOps[1][0], Ops[0][0])))) {
11532 SMain = I;
11533 Ops.swap(RHS&: VOps);
11534 break;
11535 }
11536 }
11537 SmallVector<BoUpSLP::ValueList> MainOps;
11538 buildOriginalOperands(S, VL: MainOp, Operands&: MainOps);
11539
11540 auto BuildFirstOperandCandidates =
11541 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11542 ArrayRef<BoUpSLP::ValueList> Ops, Value *Op0, Value *Op1,
11543 bool IsCommutative) {
11544 Candidates.emplace_back(Args: Ops[0][0], Args&: Op0);
11545 if (IsCommutative)
11546 Candidates.emplace_back(Args: Ops[0][0], Args&: Op1);
11547 };
11548
11549 auto BuildSecondOperandCandidates =
11550 [&](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates,
11551 ArrayRef<BoUpSLP::ValueList> Ops, int PrevBestIdx, Value *Op0,
11552 Value *Op1, bool IsCommutative) {
11553 if (PrevBestIdx != 1)
11554 Candidates.emplace_back(Args: Ops[1][0], Args&: Op1);
11555 if (PrevBestIdx != 0 && IsCommutative)
11556 Candidates.emplace_back(Args: Ops[1][0], Args&: Op0);
11557 };
11558
11559 auto FindBestCandidate =
11560 [&](ArrayRef<std::pair<Value *, Value *>> Candidates, bool &IsConst,
11561 int &Score) {
11562 auto Res = R.findBestRootPair(Candidates);
11563 Score = Res.second;
11564 IsConst =
11565 Res.second == BoUpSLP::LookAheadHeuristics::ScoreConstants &&
11566 isConstant(V: Candidates[Res.first.value_or(u: 0)].first) &&
11567 isConstant(V: Candidates[Res.first.value_or(u: 0)].second);
11568 if (IsConst) {
11569 // Check if there are splat candidates and consider them better
11570 // option.
11571 for (const auto [Idx, P] : enumerate(First&: Candidates)) {
11572 if (!isConstant(V: P.first) && !isConstant(V: P.second) &&
11573 P.second == P.first) {
11574 Res.first = Idx;
11575 IsConst = false;
11576 Score = isa<LoadInst>(Val: Candidates[Res.first.value_or(u: 0)].first)
11577 ? BoUpSLP::LookAheadHeuristics::ScoreSplatLoads
11578 : BoUpSLP::LookAheadHeuristics::ScoreSplat;
11579 break;
11580 }
11581 }
11582 }
11583 return Res.first;
11584 };
11585
11586 for (Value *V : VL) {
11587 auto *I = dyn_cast<Instruction>(Val: V);
11588 if (!I || (I == MainOp && (!S.isAltShuffle() || I == SMain)) ||
11589 (!S.isAltShuffle() && I == SMain))
11590 continue;
11591 SmallVector<BoUpSLP::ValueList> VOps;
11592 buildOriginalOperands(S, VL: I == SMain ? MainOp : I, Operands&: VOps);
11593 SmallVector<Value *> CopyableOps =
11594 getOperands(S: CopyableS, V: I == MainOp ? SMain : I);
11595 if (CopyableOps.size() == VOps.size() &&
11596 all_of(Range: zip(t&: CopyableOps, u&: VOps), P: [&](const auto &P) {
11597 return std::get<0>(P) == std::get<1>(P)[0];
11598 }))
11599 continue;
11600 SmallVector<std::pair<Value *, Value *>> Candidates;
11601 BuildFirstOperandCandidates(Candidates, MainOps, CopyableOps[0],
11602 CopyableOps[1], IsMainCommutative);
11603 const unsigned OpSize = Candidates.size();
11604 Instruction *MatchingOp =
11605 S.getMatchingMainOpOrAltOp(I) == S.getMainOp() ? SMain : SAlt;
11606 const bool IsCommutativeInst =
11607 (MatchingOp == SMain ? IsCommutative : IsAltCommutative) ||
11608 ::isCommutative(I, ValWithUses: MatchingOp);
11609 if (S.isAltShuffle() && MatchingOp == SAlt &&
11610 any_of(Range&: VOps, P: [&](const BoUpSLP::ValueList &Ops) {
11611 auto *I = dyn_cast<BinaryOperator>(Val: Ops[0]);
11612 return I && Operands.contains(Ptr: I);
11613 }))
11614 return false;
11615 if (S.isAltShuffle() && MatchingOp == SMain)
11616 Operands.insert(I: I->op_begin(), E: I->op_end());
11617 BuildFirstOperandCandidates(Candidates, Ops, VOps[0][0], VOps[1][0],
11618 IsCommutativeInst);
11619 bool IsBestConst;
11620 int Score;
11621 std::optional<int> BestOp =
11622 FindBestCandidate(Candidates, IsBestConst, Score);
11623 const bool IsOriginalBetter =
11624 static_cast<unsigned>(BestOp.value_or(u: OpSize)) >= OpSize;
11625 Candidates.clear();
11626 BuildSecondOperandCandidates(
11627 Candidates, MainOps, IsOriginalBetter ? -1 : *BestOp, CopyableOps[0],
11628 CopyableOps[1], IsMainCommutative);
11629 const unsigned SecondOpSize = Candidates.size();
11630 BuildSecondOperandCandidates(
11631 Candidates, Ops,
11632 IsOriginalBetter ? BestOp.value_or(u: OpSize - 1) - OpSize : -1,
11633 VOps[0][0], VOps[1][0], IsCommutativeInst);
11634 bool IsSecondBestConst;
11635 int SecondScore;
11636 std::optional<int> SecondBestOp =
11637 FindBestCandidate(Candidates, IsSecondBestConst, SecondScore);
11638 // No best candidates.
11639 if (!BestOp && !SecondBestOp)
11640 return false;
11641 // Original better in both ops combinations.
11642 const bool IsSecondOriginalBetter =
11643 static_cast<unsigned>(SecondBestOp.value_or(u: SecondOpSize)) >=
11644 SecondOpSize;
11645 if (IsOriginalBetter && IsSecondOriginalBetter)
11646 return false;
11647 // Original is better in second combination, but in the first combination
11648 // no best candidates.
11649 if (!BestOp && IsSecondOriginalBetter)
11650 return false;
11651 // Original is better in first combination, but in the second combination
11652 // no best candidates.
11653 if (!SecondBestOp && IsOriginalBetter)
11654 return false;
11655 // Copyable is best in the first combination, but it is constant, but
11656 // original is better in second non-constant combination.
11657 if (!IsOriginalBetter && IsBestConst && IsSecondOriginalBetter &&
11658 !IsSecondBestConst)
11659 return false;
11660 // Copyable is best in the second combination, but it is constant, but
11661 // original is better in the first non-constant combination.
11662 if (BestOp && IsOriginalBetter && !IsBestConst &&
11663 !IsSecondOriginalBetter && IsSecondBestConst)
11664 return false;
11665 // Original combination score is better.
11666 if (((Score > SecondScore ||
11667 (Score <= BoUpSLP::LookAheadHeuristics::ScoreAltOpcodes &&
11668 Score == SecondScore)) &&
11669 IsOriginalBetter) ||
11670 (IsSecondOriginalBetter &&
11671 (SecondScore > Score ||
11672 (Score <= BoUpSLP::LookAheadHeuristics::ScoreAltOpcodes &&
11673 Score == SecondScore))))
11674 return false;
11675 }
11676 return true;
11677 }
11678
11679public:
  /// Captures the analyses used by the compatibility checks. No ownership is
  /// taken; the supplied analyses must stay valid while this object is used.
  InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
                                    const TargetTransformInfo &TTI,
                                    const TargetLibraryInfo &TLI)
      : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11684
  /// Builds the InstructionsState (main/alternate opcode info) for \p VL.
  ///
  /// If plain same-opcode matching fails and copyable-elements vectorization
  /// is enabled, tries to model the mismatched scalars as "copyable" elements
  /// of a binary-operator node; sets the MainOp/MainOpcode members as a side
  /// effect of that path (via findAndSetMainInstruction/buildOperands).
  /// \param WithProfitabilityCheck when true, runs an extra rough
  /// profitability analysis before accepting the copyable state and may
  /// return the original (non-copyable) state instead.
  /// \param SkipSameCodeCheck when true, skip the initial same-opcode query
  /// and treat the starting state as invalid.
  InstructionsState buildInstructionsState(ArrayRef<Value *> VL,
                                           const BoUpSLP &R,
                                           bool WithProfitabilityCheck = false,
                                           bool SkipSameCodeCheck = false) {
    InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
                              ? InstructionsState::invalid()
                              : getSameOpcode(VL, TLI);
    // Check if series of selects + zext i1 %x to in can be combined into
    // selects + select %x, i32 1, i32 0.
    Instruction *SelectOp = nullptr;
    if (!S && allSameBlock(VL) && all_of(Range&: VL, P: [&](Value *V) {
          if (match(V, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()))) {
            if (!SelectOp)
              SelectOp = cast<Instruction>(Val: V);
            return true;
          }
          auto *ZExt = dyn_cast<ZExtInst>(Val: V);
          return (ZExt && ZExt->getSrcTy()->isIntegerTy(Bitwidth: 1)) ||
                 isa<PoisonValue>(Val: V);
        })) {
      if (SelectOp)
        return InstructionsState(SelectOp, SelectOp);
    }
    if (S && S.isAltShuffle()) {
      Type *ScalarTy = S.getMainOp()->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size());
      unsigned Opcode0 = S.getOpcode();
      unsigned Opcode1 = S.getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTI.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask))
        return S;
    } else if (S && (!VectorizeCopyableElements ||
                     !isa<BinaryOperator>(Val: S.getMainOp()) ||
                     all_of(Range&: VL, P: [&](Value *V) {
                       auto *I = dyn_cast<Instruction>(Val: V);
                       return !I || I->getOpcode() == S.getOpcode();
                     }))) {
      // Uniform-opcode state, or copyable modeling not applicable - use as is.
      return S;
    }
    if (!VectorizeCopyableElements)
      return S;
    // Try to find a main instruction that the remaining scalars can "copy".
    findAndSetMainInstruction(VL, R);
    if (!MainOp)
      return S;
    InstructionsState OrigS = S;
    S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
    if (OrigS && !isCopyablePreferable(VL, R, S: OrigS, CopyableS: S))
      return OrigS;
    if (!WithProfitabilityCheck)
      return S;
    // Check if it is profitable to vectorize the instruction.
    SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
    // Records (V1, V2) as a root-pair candidate, rejecting pairs that are
    // unlikely to vectorize (PHIs, same-opcode instructions from different
    // blocks).
    auto BuildCandidates =
        [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
           Value *V2) {
          if (V1 != V2 && isa<PHINode>(Val: V1))
            return;
          auto *I1 = dyn_cast<Instruction>(Val: V1);
          auto *I2 = dyn_cast<Instruction>(Val: V2);
          if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
              I1->getParent() != I2->getParent())
            return;
          Candidates.emplace_back(Args&: V1, Args&: (I1 || I2) ? V2 : V1);
        };
    if (VL.size() == 2) {
      // Check if the operands allow better vectorization.
      SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
      BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
      BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
      bool Res = !Candidates1.empty() && !Candidates2.empty() &&
                 R.findBestRootPair(Candidates: Candidates1).first &&
                 R.findBestRootPair(Candidates: Candidates2).first;
      if (!Res && isCommutative(I: MainOp)) {
        // Commutative main op - retry with the operands cross-swapped.
        Candidates1.clear();
        Candidates2.clear();
        BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
        BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
        Res = !Candidates1.empty() && !Candidates2.empty() &&
              R.findBestRootPair(Candidates: Candidates1).first &&
              R.findBestRootPair(Candidates: Candidates2).first;
      }
      if (!Res)
        return OrigS;
      // Operands match - accept the copyable state only if the vector form
      // of the main opcode is not more expensive than the scalar one.
      constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
      InstructionCost ScalarCost = TTI.getInstructionCost(U: S.getMainOp(), CostKind: Kind);
      InstructionCost VectorCost;
      FixedVectorType *VecTy =
          getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
      switch (MainOpcode) {
      case Instruction::Add:
      case Instruction::Sub:
      case Instruction::LShr:
      case Instruction::Shl:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::And:
      case Instruction::Or:
      case Instruction::Xor:
      case Instruction::FAdd:
      case Instruction::FMul:
      case Instruction::FSub:
      case Instruction::FDiv:
        VectorCost = TTI.getArithmeticInstrCost(Opcode: MainOpcode, Ty: VecTy, CostKind: Kind);
        break;
      default:
        llvm_unreachable("Unexpected instruction.");
      }
      if (VectorCost > ScalarCost)
        return OrigS;
      return S;
    }
    assert(Operands.size() == 2 && "Unexpected number of operands!");
    unsigned CopyableNum =
        count_if(Range&: VL, P: [&](Value *V) { return S.isCopyableElement(V); });
    if (CopyableNum < VL.size() / 2)
      return S;
    // Too many phi copyables - exit.
    const unsigned Limit = VL.size() / 24;
    if ((CopyableNum >= VL.size() - Limit ||
         (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
         CopyableNum >= MaxPHINumOperands) &&
        all_of(Range&: VL, P: [&](Value *V) {
          return isa<PHINode>(Val: V) || !S.isCopyableElement(V);
        }))
      return OrigS;
    // Check profitability if number of copyables > VL.size() / 2.
    // 1. Reorder operands for better matching.
    if (isCommutative(I: MainOp)) {
      Value *BestFrontOp = nullptr;
      for (auto [OpL, OpR] : zip(t&: Operands.front(), u&: Operands.back())) {
        // Make instructions the first operands.
        if (!isa<Instruction>(Val: OpL) && isa<Instruction>(Val: OpR)) {
          BestFrontOp = OpR;
          std::swap(a&: OpL, b&: OpR);
          continue;
        }
        // Make constants the second operands.
        if ((isa<Constant>(Val: OpL) && !match(V: OpR, P: m_Zero())) ||
            match(V: OpL, P: m_Zero())) {
          if (isa<Instruction>(Val: OpR))
            BestFrontOp = OpR;
          std::swap(a&: OpL, b&: OpR);
          continue;
        }
        if (isa<Instruction>(Val: OpL))
          BestFrontOp = OpL;
      }
      // If some of the RHS operands better match most of LHS - swap such
      // operands to increase matching rate.
      if (auto *BestLHS = dyn_cast_if_present<Instruction>(Val: BestFrontOp)) {
        const unsigned BestOpcode = BestLHS->getOpcode();
        for (auto [OpL, OpR] : zip(t&: Operands.front(), u&: Operands.back())) {
          auto *OpRI = dyn_cast<Instruction>(Val: OpR);
          if (!OpRI)
            continue;
          if (OpRI->getOpcode() == BestOpcode)
            std::swap(a&: OpL, b&: OpR);
        }
      }
    }
    // 2. Check, if operands can be vectorized.
    if (count_if(Range&: Operands.back(), P: IsaPred<Instruction>) > 1)
      return OrigS;
    // Returns true if the operand lane \p Ops looks vectorizable: constant,
    // splat, "almost" splat, or itself forms a reasonable instructions state.
    auto CheckOperand = [&](ArrayRef<Value *> Ops) {
      if (allConstant(VL: Ops) || isSplat(VL: Ops))
        return true;
      // Check if it is "almost" splat, i.e. has >= 4 elements and only single
      // one is different.
      constexpr unsigned Limit = 4;
      if (Operands.front().size() >= Limit) {
        SmallDenseMap<const Value *, unsigned> Counters;
        for (Value *V : Ops) {
          if (isa<UndefValue>(Val: V))
            continue;
          ++Counters[V];
        }
        if (Counters.size() == 2 &&
            any_of(Range&: Counters, P: [&](const std::pair<const Value *, unsigned> &C) {
              return C.second == 1;
            }))
          return true;
      }
      // First operand not a constant or splat? Last attempt - check for
      // potential vectorization.
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(VL: Ops, R);
      if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(VL: Ops)))
        return false;
      unsigned CopyableNum =
          count_if(Range&: Ops, P: [&](Value *V) { return OpS.isCopyableElement(V); });
      return CopyableNum <= VL.size() / 2;
    };
    if (!CheckOperand(Operands.front()))
      return OrigS;

    return S;
  }
11884
11885 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11886 ArrayRef<Value *> VL) {
11887 assert(S && "Invalid state!");
11888 SmallVector<BoUpSLP::ValueList> Operands;
11889 if (S.areInstructionsWithCopyableElements()) {
11890 MainOp = S.getMainOp();
11891 MainOpcode = S.getOpcode();
11892 Operands.assign(NumElts: MainOp->getNumOperands(),
11893 Elt: BoUpSLP::ValueList(VL.size(), nullptr));
11894 for (auto [Idx, V] : enumerate(First&: VL)) {
11895 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11896 for (auto [OperandIdx, Operand] : enumerate(First&: OperandsForValue))
11897 Operands[OperandIdx][Idx] = Operand;
11898 }
11899 } else {
11900 buildOriginalOperands(S, VL, Operands);
11901 }
11902 return Operands;
11903 }
11904};
11905} // namespace
11906
/// Decides whether the bundle \p VL is legal to vectorize as a new tree entry
/// at recursion depth \p Depth under the user edge \p UserTreeIdx. Returns
/// the computed InstructionsState together with flags telling the caller
/// whether to gather, to look for duplicates, or to try a split node.
BoUpSLP::ScalarsVectorizationLegality
BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
                                         const EdgeInfo &UserTreeIdx) const {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL, R: *this, /*WithProfitabilityCheck=*/true);

  bool AreScatterAllGEPSameBlock = false;
  if (!S) {
    // No common state. Last chance: under a ScatterVectorize user, a bundle
    // of two-operand GEPs from a single block with sortable pointers can
    // still be treated as a GEP node.
    SmallVector<unsigned> SortedIndices;
    BasicBlock *BB = nullptr;
    bool IsScatterVectorizeUserTE =
        UserTreeIdx.UserTE &&
        UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
    AreScatterAllGEPSameBlock =
        (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
         VL.size() > 2 &&
         all_of(Range&: VL,
                P: [&BB](Value *V) {
                  auto *I = dyn_cast<GetElementPtrInst>(Val: V);
                  if (!I)
                    return doesNotNeedToBeScheduled(V);
                  if (!BB)
                    BB = I->getParent();
                  return BB == I->getParent() && I->getNumOperands() == 2;
                }) &&
         BB &&
         sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL,
                         SE&: *SE, SortedIndices));
    if (!AreScatterAllGEPSameBlock) {
      LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
                           "C,S,B,O, small shuffle. \n";
                 dbgs() << "[";
                 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
                 dbgs() << "]\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                          /*TryToFindDuplicates=*/true,
                                          /*TrySplitVectorize=*/true);
    }
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(VL: *It, TLI: *TLI);
  }
  assert(S && "Must be valid.");

  // Don't handle vectors.
  if (!SLPReVec && getValueType(V: VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    // Do not try to pack to avoid extra instructions here.
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  BasicBlock *BB = S.getMainOp()->getParent();

  if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()) ||
      !DT->isReachableFromEntry(A: BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    // Do not vectorize EH and non-returning blocks, not profitable in most
    // cases.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    // Do not try to pack to avoid extra instructions here.
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                        /*TryToFindDuplicates=*/false);
  }

  // Don't handle scalable vectors
  if (S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          Val: cast<ExtractElementInst>(Val: S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
  // a load), in which case peek through to include it in the tree, without
  // ballooning over-budget.
  if (Depth >= RecursionMaxDepth &&
      (S.isAltShuffle() || VL.size() < 4 ||
       !(match(V: S.getMainOp(), P: m_Load(Op: m_Value())) ||
         all_of(Range&: VL, P: [&S](const Value *I) {
           return match(V: I,
                        P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
                  cast<Instruction>(Val: I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Check if this is a duplicate of another entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
  for (TreeEntry *E : getTreeEntries(V: S.getMainOp())) {
    if (E->isSame(VL)) {
      // Exact duplicate of an existing entry - gather so it can be reused.
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                        << ".\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
    SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
    if (all_of(Range&: VL, P: [&](Value *V) {
          return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V) ||
                 (S.getOpcode() == Instruction::PHI && isa<PHINode>(Val: V) &&
                  LI->getLoopFor(BB: S.getMainOp()->getParent()) &&
                  isVectorized(V));
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  // NOTE(review): AreAllSameInsts reduces to (!X || X), i.e. always true, so
  // the !AreAllSameInsts disjunct below is dead - confirm whether a different
  // predicate was intended for AreAllSameBlock.
  bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || isSplat(VL) ||
      (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           Val: S.getMainOp()) &&
       !all_of(Range&: VL, P: isVectorLikeInstWithConstOps))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O conditions. \n";
               dbgs() << "[";
               interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
               dbgs() << "]\n");
    return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
  }

  // Don't vectorize ephemeral values.
  if (!EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(Ptr: V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        // Do not try to pack to avoid extra instructions here.
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
                                            /*TryToFindDuplicates=*/false);
      }
    }
  }

  // We now know that this is a vector of instructions of the same type from
  // the same block.

  // Check that none of the instructions in the bundle are already in the tree
  // and the node may be not profitable for the vectorization as the small
  // alternate node.
  if (S.isAltShuffle()) {
    // Vectorized: bit cleared for scalars already vectorized elsewhere.
    // Extracted: bit set for scalars whose external users would need extracts.
    auto GetNumVectorizedExtracted = [&]() {
      APInt Extracted = APInt::getZero(numBits: VL.size());
      APInt Vectorized = APInt::getAllOnes(numBits: VL.size());
      for (auto [Idx, V] : enumerate(First&: VL)) {
        auto *I = dyn_cast<Instruction>(Val: V);
        if (!I || doesNotNeedToBeScheduled(V: I) ||
            all_of(Range: I->operands(), P: [&](const Use &U) {
              return isa<ExtractElementInst>(Val: U.get());
            }))
          continue;
        if (isVectorized(V: I))
          Vectorized.clearBit(BitPosition: Idx);
        else if (!I->hasOneUser() && !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList))
          Extracted.setBit(Idx);
      }
      return std::make_pair(x&: Vectorized, y&: Extracted);
    };
    auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
    constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
    bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
    if (!Vectorized.isAllOnes() && !PreferScalarize) {
      // Rough cost estimation, if the vector code (+ potential extracts) is
      // more profitable than the scalar + buildvector.
      Type *ScalarTy = VL.front()->getType();
      auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
      InstructionCost VectorizeCostEstimate =
          ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: {}, CostKind: Kind) +
          ::getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Extracted,
                                     /*Insert=*/false, /*Extract=*/true, CostKind: Kind);
      InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
          TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Vectorized,
          /*Insert=*/true, /*Extract=*/false, CostKind: Kind, /*ForPoisonSrc=*/false);
      PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
    }
    if (PreferScalarize) {
      LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
                           "node is not profitable.\n");
      return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
      }
    }
  }

  return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
}
12116
12117void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
12118 const EdgeInfo &UserTreeIdx,
12119 unsigned InterleaveFactor) {
12120 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
12121
12122 SmallVector<int> ReuseShuffleIndices;
12123 SmallVector<Value *> VL(VLRef);
12124
12125 // Tries to build split node.
12126 auto TrySplitNode = [&](const InstructionsState &LocalState) {
12127 SmallVector<Value *> Op1, Op2;
12128 OrdersType ReorderIndices;
12129 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
12130 return false;
12131
12132 auto Invalid = ScheduleBundle::invalid();
12133 auto *TE = newTreeEntry(VL, EntryState: TreeEntry::SplitVectorize, Bundle&: Invalid, S: LocalState,
12134 UserTreeIdx, ReuseShuffleIndices: {}, ReorderIndices);
12135 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
12136 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
12137 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
12138 if (S && (isa<LoadInst>(Val: S.getMainOp()) ||
12139 getSameValuesTreeEntry(V: S.getMainOp(), VL: Op, /*SameVF=*/true))) {
12140 // Build gather node for loads, they will be gathered later.
12141 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
12142 Args: Idx == 0 ? 0 : Op1.size());
12143 (void)newTreeEntry(VL: Op, EntryState: TreeEntry::NeedToGather, Bundle&: Invalid, S, UserTreeIdx: {TE, Idx});
12144 } else {
12145 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
12146 Args: Idx == 0 ? 0 : Op1.size());
12147 buildTreeRec(VLRef: Op, Depth, UserTreeIdx: {TE, Idx});
12148 }
12149 };
12150 AddNode(Op1, 0);
12151 AddNode(Op2, 1);
12152 return true;
12153 };
12154
12155 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
12156 bool AreConsts = false;
12157 for (Value *V : VL) {
12158 if (isa<PoisonValue>(Val: V))
12159 continue;
12160 if (isa<Constant>(Val: V)) {
12161 AreConsts = true;
12162 continue;
12163 }
12164 if (!isa<PHINode>(Val: V))
12165 return false;
12166 }
12167 return AreConsts;
12168 };
12169 if (AreOnlyConstsWithPHIs(VL)) {
12170 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
12171 newGatherTreeEntry(VL, S: InstructionsState::invalid(), UserTreeIdx);
12172 return;
12173 }
12174
12175 ScalarsVectorizationLegality Legality =
12176 getScalarsVectorizationLegality(VL, Depth, UserTreeIdx);
12177 InstructionsState S = Legality.getInstructionsState();
12178 if (!Legality.isLegal()) {
12179 if (Legality.trySplitVectorize()) {
12180 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
12181 // Last chance to try to vectorize alternate node.
12182 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
12183 return;
12184 }
12185 if (Legality.tryToFindDuplicates())
12186 tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx);
12187
12188 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12189 return;
12190 }
12191
12192 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
12193 if (S.isAltShuffle() && TrySplitNode(S))
12194 return;
12195
12196 // Check that every instruction appears once in this bundle.
12197 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx,
12198 /*TryPad=*/true)) {
12199 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12200 return;
12201 }
12202
12203 // Perform specific checks for each particular instruction kind.
12204 bool IsScatterVectorizeUserTE =
12205 UserTreeIdx.UserTE &&
12206 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
12207 OrdersType CurrentOrder;
12208 SmallVector<Value *> PointerOps;
12209 StridedPtrInfo SPtrInfo;
12210 TreeEntry::EntryState State = getScalarsVectorizationState(
12211 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
12212 if (State == TreeEntry::NeedToGather) {
12213 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12214 return;
12215 }
12216
12217 // Check the loop nest. We need to be sure we handle a single loop nest at a
12218 // time to avoid incorrect cost estimation because of the loop aware cost
12219 // model.
12220 if (VectorizableTree.empty()) {
12221 assert(CurrentLoopNest.empty() && "Expected empty loop nest");
12222 // Process the first node? Initial fill of the loop nest.
12223 BasicBlock *Parent = S.getMainOp()->getParent();
12224 if (const Loop *L = LI->getLoopFor(BB: Parent)) {
12225 L = findInnermostNonInvariantLoop(L, VL);
12226 if (L)
12227 CurrentLoopNest.assign(AR: getLoopNest(L));
12228 }
12229 } else if (!UserTreeIdx ||
12230 UserTreeIdx.UserTE->State == TreeEntry::SplitVectorize ||
12231 UserTreeIdx.UserTE->isGather() ||
12232 UserTreeIdx.UserTE->getMainOp()->getParent() !=
12233 S.getMainOp()->getParent()) {
12234 BasicBlock *Parent = S.getMainOp()->getParent();
12235 if (const Loop *L = LI->getLoopFor(BB: Parent)) {
12236 // Check that the new loop nest is not involved.
12237 // Otherwise, mark it as a gather node.
12238 L = findInnermostNonInvariantLoop(L, VL);
12239 if (L) {
12240 SmallVector<const Loop *> NewLoopNest(getLoopNest(L));
12241 for (const auto [L1, L2] : zip(t&: CurrentLoopNest, u&: NewLoopNest)) {
12242 if (L1 != L2) {
12243 LLVM_DEBUG(dbgs() << "SLP: Different loop nest.\n");
12244 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12245 return;
12246 }
12247 }
12248 if (NewLoopNest.size() > CurrentLoopNest.size())
12249 CurrentLoopNest.append(in_start: std::next(x: NewLoopNest.begin(), n: CurrentLoopNest.size()),
12250 in_end: NewLoopNest.end());
12251 }
12252 }
12253 }
12254
12255 Instruction *VL0 = S.getMainOp();
12256 BasicBlock *BB = VL0->getParent();
12257 auto &BSRef = BlocksSchedules[BB];
12258 if (!BSRef)
12259 BSRef = std::make_unique<BlockScheduling>(args&: BB);
12260
12261 BlockScheduling &BS = *BSRef;
12262
12263 SetVector<Value *> UniqueValues(llvm::from_range, VL);
12264 std::optional<ScheduleBundle *> BundlePtr =
12265 BS.tryScheduleBundle(VL: UniqueValues.getArrayRef(), SLP: this, S, EI: UserTreeIdx);
12266#ifdef EXPENSIVE_CHECKS
12267 // Make sure we didn't break any internal invariants
12268 BS.verify();
12269#endif
12270 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
12271 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
12272 // Last chance to try to vectorize alternate node.
12273 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
12274 return;
12275 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12276 NonScheduledFirst.insert(Ptr: VL.front());
12277 if (S.getOpcode() == Instruction::Load &&
12278 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
12279 registerNonVectorizableLoads(VL: ArrayRef(VL));
12280 return;
12281 }
12282 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
12283 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
12284 ScheduleBundle Empty;
12285 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
12286 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
12287
12288 unsigned ShuffleOrOp =
12289 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
12290 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
12291 // Postpone PHI nodes creation
12292 SmallVector<unsigned> PHIOps;
12293 for (unsigned I : seq<unsigned>(Operands.size())) {
12294 ArrayRef<Value *> Op = Operands[I];
12295 if (Op.empty())
12296 continue;
12297 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
12298 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12299 buildTreeRec(VLRef: Op, Depth: Depth + 1, UserTreeIdx: {TE, I});
12300 else
12301 PHIOps.push_back(Elt: I);
12302 }
12303 for (unsigned I : PHIOps)
12304 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
12305 };
12306 switch (ShuffleOrOp) {
12307 case Instruction::PHI: {
12308 TreeEntry *TE =
12309 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12310 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
12311 TE->dump());
12312
12313 TE->setOperands(Operands);
12314 CreateOperandNodes(TE, Operands);
12315 return;
12316 }
12317 case Instruction::ExtractValue:
12318 case Instruction::ExtractElement: {
12319 if (CurrentOrder.empty()) {
12320 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
12321 } else {
12322 LLVM_DEBUG({
12323 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
12324 "with order";
12325 for (unsigned Idx : CurrentOrder)
12326 dbgs() << " " << Idx;
12327 dbgs() << "\n";
12328 });
12329 fixupOrderingIndices(Order: CurrentOrder);
12330 }
12331 // Insert new order with initial value 0, if it does not exist,
12332 // otherwise return the iterator to the existing one.
12333 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12334 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12335 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
12336 "(ExtractValueInst/ExtractElementInst).\n";
12337 TE->dump());
12338 // This is a special case, as it does not gather, but at the same time
12339 // we are not extending buildTreeRec() towards the operands.
12340 TE->setOperands(Operands);
12341 return;
12342 }
12343 case Instruction::InsertElement: {
12344 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
12345
12346 auto OrdCompare = [](const std::pair<int, int> &P1,
12347 const std::pair<int, int> &P2) {
12348 return P1.first > P2.first;
12349 };
12350 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
12351 decltype(OrdCompare)>
12352 Indices(OrdCompare);
12353 for (int I = 0, E = VL.size(); I < E; ++I) {
12354 unsigned Idx = *getElementIndex(Inst: VL[I]);
12355 Indices.emplace(args&: Idx, args&: I);
12356 }
12357 OrdersType CurrentOrder(VL.size(), VL.size());
12358 bool IsIdentity = true;
12359 for (int I = 0, E = VL.size(); I < E; ++I) {
12360 CurrentOrder[Indices.top().second] = I;
12361 IsIdentity &= Indices.top().second == I;
12362 Indices.pop();
12363 }
12364 if (IsIdentity)
12365 CurrentOrder.clear();
12366 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12367 ReuseShuffleIndices: {}, ReorderIndices: CurrentOrder);
12368 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
12369 TE->dump());
12370
12371 TE->setOperands(Operands);
12372 buildTreeRec(VLRef: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
12373 return;
12374 }
12375 case Instruction::Load: {
12376 // Check that a vectorized load would load the same memory as a scalar
12377 // load. For example, we don't want to vectorize loads that are smaller
12378 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
12379 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
12380 // from such a struct, we read/write packed bits disagreeing with the
12381 // unvectorized version.
12382 TreeEntry *TE = nullptr;
12383 fixupOrderingIndices(Order: CurrentOrder);
12384 switch (State) {
12385 case TreeEntry::Vectorize:
12386 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12387 ReuseShuffleIndices, ReorderIndices: CurrentOrder, InterleaveFactor);
12388 if (CurrentOrder.empty())
12389 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
12390 TE->dump());
12391 else
12392 LLVM_DEBUG(dbgs()
12393 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
12394 TE->dump());
12395 break;
12396 case TreeEntry::CompressVectorize:
12397 // Vectorizing non-consecutive loads with (masked)load + compress.
12398 TE = newTreeEntry(VL, EntryState: TreeEntry::CompressVectorize, Bundle, S,
12399 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12400 LLVM_DEBUG(
12401 dbgs()
12402 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12403 TE->dump());
12404 break;
12405 case TreeEntry::StridedVectorize:
12406 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12407 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
12408 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12409 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
12410 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
12411 TE->dump());
12412 break;
12413 case TreeEntry::ScatterVectorize:
12414 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12415 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
12416 UserTreeIdx, ReuseShuffleIndices);
12417 LLVM_DEBUG(
12418 dbgs()
12419 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12420 TE->dump());
12421 break;
12422 case TreeEntry::CombinedVectorize:
12423 case TreeEntry::SplitVectorize:
12424 case TreeEntry::NeedToGather:
12425 llvm_unreachable("Unexpected loads state.");
12426 }
12427 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12428 assert(Operands.size() == 1 && "Expected a single operand only");
12429 SmallVector<int> Mask;
12430 inversePermutation(Indices: CurrentOrder, Mask);
12431 reorderScalars(Scalars&: Operands.front(), Mask);
12432 }
12433 TE->setOperands(Operands);
12434 if (State == TreeEntry::ScatterVectorize)
12435 buildTreeRec(VLRef: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
12436 return;
12437 }
12438 case Instruction::ZExt:
12439 case Instruction::SExt:
12440 case Instruction::FPToUI:
12441 case Instruction::FPToSI:
12442 case Instruction::FPExt:
12443 case Instruction::PtrToInt:
12444 case Instruction::IntToPtr:
12445 case Instruction::SIToFP:
12446 case Instruction::UIToFP:
12447 case Instruction::Trunc:
12448 case Instruction::FPTrunc:
12449 case Instruction::BitCast: {
12450 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12451 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
12452 y: std::numeric_limits<unsigned>::max()));
12453 if (ShuffleOrOp == Instruction::ZExt ||
12454 ShuffleOrOp == Instruction::SExt) {
12455 CastMaxMinBWSizes = std::make_pair(
12456 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
12457 b: PrevMaxBW),
12458 y: std::min<unsigned>(
12459 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
12460 b: PrevMinBW));
12461 } else if (ShuffleOrOp == Instruction::Trunc) {
12462 CastMaxMinBWSizes = std::make_pair(
12463 x: std::max<unsigned>(
12464 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
12465 b: PrevMaxBW),
12466 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
12467 b: PrevMinBW));
12468 }
12469 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12470 ReuseShuffleIndices);
12471 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
12472 TE->dump());
12473
12474 TE->setOperands(Operands);
12475 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
12476 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth, UserTreeIdx: {TE, I});
12477 if (ShuffleOrOp == Instruction::Trunc) {
12478 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
12479 } else if (ShuffleOrOp == Instruction::SIToFP ||
12480 ShuffleOrOp == Instruction::UIToFP) {
12481 unsigned NumSignBits =
12482 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
12483 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
12484 APInt Mask = DB->getDemandedBits(I: OpI);
12485 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
12486 }
12487 if (NumSignBits * 2 >=
12488 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
12489 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
12490 }
12491 return;
12492 }
12493 case Instruction::ICmp:
12494 case Instruction::FCmp: {
12495 // Check that all of the compares have the same predicate.
12496 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
12497 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12498 ReuseShuffleIndices);
12499 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
12500 TE->dump());
12501
12502 VLOperands Ops(VL, Operands, S, *this);
12503 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
12504 // Commutative predicate - collect + sort operands of the instructions
12505 // so that each side is more likely to have the same opcode.
12506 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
12507 "Commutative Predicate mismatch");
12508 Ops.reorder();
12509 Operands.front() = Ops.getVL(OpIdx: 0);
12510 Operands.back() = Ops.getVL(OpIdx: 1);
12511 } else {
12512 // Collect operands - commute if it uses the swapped predicate.
12513 for (auto [Idx, V] : enumerate(First&: VL)) {
12514 if (isa<PoisonValue>(Val: V))
12515 continue;
12516 auto *Cmp = cast<CmpInst>(Val: V);
12517 if (Cmp->getPredicate() != P0)
12518 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
12519 }
12520 }
12521 TE->setOperands(Operands);
12522 buildTreeRec(VLRef: Operands.front(), Depth, UserTreeIdx: {TE, 0});
12523 buildTreeRec(VLRef: Operands.back(), Depth, UserTreeIdx: {TE, 1});
12524 if (ShuffleOrOp == Instruction::ICmp) {
12525 unsigned NumSignBits0 =
12526 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
12527 if (NumSignBits0 * 2 >=
12528 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
12529 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
12530 unsigned NumSignBits1 =
12531 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
12532 if (NumSignBits1 * 2 >=
12533 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
12534 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
12535 }
12536 return;
12537 }
12538 case Instruction::Select:
12539 case Instruction::FNeg:
12540 case Instruction::Add:
12541 case Instruction::FAdd:
12542 case Instruction::Sub:
12543 case Instruction::FSub:
12544 case Instruction::Mul:
12545 case Instruction::FMul:
12546 case Instruction::UDiv:
12547 case Instruction::SDiv:
12548 case Instruction::FDiv:
12549 case Instruction::URem:
12550 case Instruction::SRem:
12551 case Instruction::FRem:
12552 case Instruction::Shl:
12553 case Instruction::LShr:
12554 case Instruction::AShr:
12555 case Instruction::And:
12556 case Instruction::Or:
12557 case Instruction::Xor:
12558 case Instruction::Freeze: {
12559 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12560 ReuseShuffleIndices);
12561 LLVM_DEBUG(
12562 dbgs() << "SLP: added a new TreeEntry "
12563 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12564 TE->dump());
12565
12566 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
12567 VLOperands Ops(VL, Operands, S, *this);
12568 Ops.reorder();
12569 Operands[0] = Ops.getVL(OpIdx: 0);
12570 Operands[1] = Ops.getVL(OpIdx: 1);
12571 }
12572 TE->setOperands(Operands);
12573 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
12574 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
12575 return;
12576 }
12577 case Instruction::GetElementPtr: {
12578 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12579 ReuseShuffleIndices);
12580 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12581 TE->dump());
12582 TE->setOperands(Operands);
12583
12584 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12585 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
12586 return;
12587 }
12588 case Instruction::Store: {
12589 bool Consecutive = CurrentOrder.empty();
12590 if (!Consecutive)
12591 fixupOrderingIndices(Order: CurrentOrder);
12592 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12593 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12594 if (Consecutive)
12595 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12596 TE->dump());
12597 else
12598 LLVM_DEBUG(
12599 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12600 TE->dump());
12601 TE->setOperands(Operands);
12602 buildTreeRec(VLRef: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
12603 return;
12604 }
12605 case Instruction::Call: {
12606 // Check if the calls are all to the same vectorizable intrinsic or
12607 // library function.
12608 CallInst *CI = cast<CallInst>(Val: VL0);
12609 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12610
12611 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12612 ReuseShuffleIndices);
12613 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12614 TE->dump());
12615 if (isCommutative(I: VL0)) {
12616 VLOperands Ops(VL, Operands, S, *this);
12617 Ops.reorder();
12618 Operands[0] = Ops.getVL(OpIdx: 0);
12619 Operands[1] = Ops.getVL(OpIdx: 1);
12620 }
12621 TE->setOperands(Operands);
12622 for (unsigned I : seq<unsigned>(Size: CI->arg_size())) {
12623 // For scalar operands no need to create an entry since no need to
12624 // vectorize it.
12625 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI))
12626 continue;
12627 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
12628 }
12629 return;
12630 }
12631 case Instruction::ShuffleVector: {
12632 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12633 ReuseShuffleIndices);
12634 if (S.isAltShuffle()) {
12635 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12636 TE->dump());
12637 } else {
12638 assert(SLPReVec && "Only supported by REVEC.");
12639 LLVM_DEBUG(
12640 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12641 TE->dump());
12642 }
12643
12644 // Reorder operands if reordering would enable vectorization.
12645 auto *CI = dyn_cast<CmpInst>(Val: VL0);
12646 if (CI && any_of(Range&: VL, P: [](Value *V) {
12647 return !isa<PoisonValue>(Val: V) && !cast<CmpInst>(Val: V)->isCommutative();
12648 })) {
12649 auto *MainCI = cast<CmpInst>(Val: S.getMainOp());
12650 auto *AltCI = cast<CmpInst>(Val: S.getAltOp());
12651 CmpInst::Predicate MainP = MainCI->getPredicate();
12652 CmpInst::Predicate AltP = AltCI->getPredicate();
12653 assert(MainP != AltP &&
12654 "Expected different main/alternate predicates.");
12655 // Collect operands - commute if it uses the swapped predicate or
12656 // alternate operation.
12657 for (auto [Idx, V] : enumerate(First&: VL)) {
12658 if (isa<PoisonValue>(Val: V))
12659 continue;
12660 auto *Cmp = cast<CmpInst>(Val: V);
12661
12662 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
12663 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
12664 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
12665 } else {
12666 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
12667 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
12668 }
12669 }
12670 TE->setOperands(Operands);
12671 buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
12672 buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
12673 return;
12674 }
12675
12676 if (isa<BinaryOperator>(Val: VL0) || CI) {
12677 VLOperands Ops(VL, Operands, S, *this);
12678 Ops.reorder();
12679 Operands[0] = Ops.getVL(OpIdx: 0);
12680 Operands[1] = Ops.getVL(OpIdx: 1);
12681 }
12682 TE->setOperands(Operands);
12683 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
12684 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
12685 return;
12686 }
12687 default:
12688 break;
12689 }
12690 llvm_unreachable("Unexpected vectorization of the instructions.");
12691}
12692
/// Computes the total number of scalar elements that \p T would occupy if it
/// were flattened into a single vector, or 0 if \p T cannot be mapped to a
/// vector that fits a target vector register.
unsigned BoUpSLP::canMapToVector(Type *T) const {
  // N accumulates the product of element counts across all nesting levels.
  unsigned N = 1;
  Type *EltTy = T;

  // Peel aggregate/vector levels until a scalar element type remains.
  while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
    if (EltTy->isEmptyTy())
      return 0;
    if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(Val: EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(Ty: EltTy))
    return 0;
  // The widened vector must fit into a vector register, and its store size
  // must match the store size of the original aggregate exactly (so there is
  // no disagreement caused by padding).
  size_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(Ty: T))
    return 0;
  return N;
}
12725
/// Checks whether the extract instructions in \p VL all read from the same
/// source vector/aggregate and cover a contiguous index range, so the
/// extracts can be replaced by (a shuffle of) the source vector itself.
/// On success \p CurrentOrder either stays empty (extract order already
/// matches lane order) or receives the permutation mapping extract indices
/// to positions in \p VL. Returns true iff the original order can be kept.
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(Val: *It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(i: 0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(T: Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // Collect the extract index for each lane; track the min/max indices so we
  // can later verify the accessed range is narrow enough (fits in E lanes).
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(First&: VL)) {
    auto *Inst = dyn_cast<Instruction>(Val: V);
    if (!Inst)
      continue;
    // All extracts must read from the very same source vector.
    if (Inst->getOperand(i: 0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
      if (isa<UndefValue>(Val: EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(E: Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  // If the whole used range already fits in the first E lanes, do not rebase
  // the indices.
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(NumElts: E, Elt: E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    // A duplicate extract index makes the permutation invalid.
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
12810
/// Returns true if every user of \p I is already part of the vectorized code
/// (or is otherwise accounted for), i.e. the scalar value of \p I does not
/// need to be kept alive for external users.
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  // Fast path: a single-use instruction whose value is recorded as
  // vectorized (or when no explicit set is provided) needs no user scan.
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
         all_of(Range: I->users(), P: [this](User *U) {
           // Each user must either be vectorized itself, behave like a
           // vector instruction with constant operands, or be an extract
           // that is already scheduled to be gathered.
           return isVectorized(V: U) || isVectorLikeInstWithConstOps(V: U) ||
                  (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
         });
}
12819
/// Builds the shuffle mask that blends the "main" and "alternate" vectorized
/// operations of this tree entry: lane I of \p Mask selects element Idx from
/// the main vector (value Idx) or from the alternate vector (value Sz + Idx),
/// as classified by \p IsAltOp. Optionally collects the scalars belonging to
/// each side into \p OpScalars / \p AltScalars.
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    // Account for this entry's reordering when mapping mask lanes to scalars.
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // Poison scalars stay as poison lanes in the mask.
    if (isa<PoisonValue>(Val: Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      // Alternate operation: select from the second (alt) vector.
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(Elt: OpInst);
    } else {
      // Main operation: select from the first vector.
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(Elt: OpInst);
    }
  }
  // If this entry reuses shuffled scalars, remap the mask through the reuse
  // indices so it applies to the reused vector layout.
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(RHS&: NewMask);
  }
}
12854
12855static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12856 Instruction *AltOp,
12857 const TargetLibraryInfo &TLI) {
12858 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12859}
12860
/// Returns true if \p I should be treated as the alternate operation of the
/// (\p MainOp, \p AltOp) pair. Compares are special-cased: a compare whose
/// predicate matches (directly or via operand swap) the alternate — but not
/// the main — predicate is classified as alternate.
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
                                   Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
    auto *AltCI = cast<CmpInst>(Val: AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(Val: I);
    // Matching the main compare (possibly with swapped operands) means this
    // is NOT the alternate operation; the main match takes priority.
    if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    // Anything not matching the main predicate (directly or swapped) is the
    // alternate operation.
    return MainP != P && MainP != SwappedP;
  }
  // Non-compare case: defer to the generic main/alt opcode matching.
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}
12884
12885TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
12886 assert(!Ops.empty());
12887 const auto *Op0 = Ops.front();
12888
12889 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
12890 // TODO: We should allow undef elements here
12891 return isConstant(V) && !isa<UndefValue>(Val: V);
12892 });
12893 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
12894 // TODO: We should allow undef elements here
12895 return V == Op0;
12896 });
12897 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12898 // TODO: We should allow undef elements here
12899 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12900 return CI->getValue().isPowerOf2();
12901 return false;
12902 });
12903 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12904 // TODO: We should allow undef elements here
12905 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12906 return CI->getValue().isNegatedPowerOf2();
12907 return false;
12908 });
12909
12910 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12911 if (IsConstant && IsUniform)
12912 VK = TTI::OK_UniformConstantValue;
12913 else if (IsConstant)
12914 VK = TTI::OK_NonUniformConstantValue;
12915 else if (IsUniform)
12916 VK = TTI::OK_UniformValue;
12917
12918 TTI::OperandValueProperties VP = TTI::OP_None;
12919 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12920 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12921
12922 return {.Kind: VK, .Properties: VP};
12923}
12924
12925namespace {
12926/// The base class for shuffle instruction emission and shuffle cost estimation.
12927class BaseShuffleAnalysis {
12928protected:
12929 Type *ScalarTy = nullptr;
12930
12931 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12932
12933 /// V is expected to be a vectorized value.
12934 /// When REVEC is disabled, there is no difference between VF and
12935 /// VNumElements.
12936 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12937 /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
12938 /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    // Under REVEC, ScalarTy may itself be a fixed vector; its element count
    // must evenly divide the element count of V.
    unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(Val: V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    // VF measured in units of ScalarTy, not in raw vector elements.
    return VNumElements / ScalarTyNumElements;
  }
12953
12954 /// Checks if the mask is an identity mask.
12955 /// \param IsStrict if is true the function returns false if mask size does
12956 /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    // Exact identity: mask size matches the vector and every (non-poison)
    // lane maps to itself.
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
      if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
            return all_of(Range&: Slice, P: equal_to(Arg: PoisonMaskElem)) ||
                   ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
          }))
        return true;
    }
    return false;
  }
12980
12981 /// Tries to combine 2 different masks into single one.
12982 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12983 /// change the size of the vector, \p LocalVF is the original size of the
12984 /// shuffled vector.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      // Poison lanes in the outer mask stay poison in the result.
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      // Compose: apply Mask first, then ExtMask on top of it; indices are
      // taken modulo the corresponding vector length to stay in bounds.
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    // Replace Mask with the combined permutation.
    Mask.swap(RHS&: NewMask);
  }
12998
12999 /// Looks through shuffles trying to reduce final number of shuffles in the
13000 /// code. The function looks through the previously emitted shuffle
13001 /// instructions and properly mark indices in mask as undef.
13002 /// For example, given the code
13003 /// \code
13004 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
13005 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
13006 /// \endcode
13007 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
13008 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
13009 /// <0, 1, 2, 3> for the shuffle.
13010 /// If 2 operands are of different size, the smallest one will be resized and
13011 /// the mask recalculated properly.
13012 /// For example, given the code
13013 /// \code
13014 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
13015 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
13016 /// \endcode
13017 /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
13018 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
13019 /// <0, 1, 2, 3> for the shuffle.
13020 /// So, it tries to transform permutations to simple vector merge, if
13021 /// possible.
13022 /// \param V The input vector which must be shuffled using the given \p Mask.
13023 /// If the better candidate is found, \p V is set to this best candidate
13024 /// vector.
13025 /// \param Mask The input mask for the shuffle. If the best candidate is found
13026 /// during looking-through-shuffles attempt, it is updated accordingly.
13027 /// \param SinglePermute true if the shuffle operation is originally a
13028 /// single-value-permutation. In this case the look-through-shuffles procedure
13029 /// may look for resizing shuffles as the best candidates.
13030 /// \return true if the shuffle results in the non-resizing identity shuffle
13031 /// (and thus can be ignored), false - otherwise.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    // Walk down the chain of shufflevector instructions, folding each
    // shuffle's mask into Mask, for as long as only one operand of the
    // shuffle is actually used.
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
                                                    NumSrcElts: IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so later we did not lost
          // this info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(RHS: Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this Op
      // and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
      // expensive, the analysis founds out, that the source vector is just a
      // broadcast, this original mask can be transformed to identity mask <0,
      // 1, 2, 3>.
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // \endcode
      // may be transformed to
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      // \endcode
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(RHS: Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // Translate Mask through this shuffle to see which source lanes each
      // requested element really comes from.
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(First&: Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(Elt: I);
      }
      bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
                            V: SV->getOperand(i_nocapture: 0),
                            UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
                            V: SV->getOperand(i_nocapture: 1),
                            UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
                            .all();
      // If both operands are live, we cannot peek further through this
      // shuffle; just poison out lanes the shuffle itself poisons and stop.
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      // Fold this shuffle's mask into Mask and descend into the single live
      // operand.
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
      Mask.swap(RHS&: ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(i_nocapture: 0);
      else
        Op = SV->getOperand(i_nocapture: 1);
    }
    // If the final Op is not usable as a plain (identity) source, fall back
    // to the best identity/broadcast candidate recorded during the walk.
    if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
        !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(First&: Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(RHS&: IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
        return SinglePermute &&
               (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size()) &&
                 all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
13146
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  /// \param V1 First source vector, must be non-null.
  /// \param V2 Optional second source vector; may be null or all-poison, in
  ///        which case the single-source path is taken.
  /// \param Mask Shuffle mask expressed in \p ScalarTy elements; rescaled
  ///        below if \p ScalarTy is itself a fixed vector (revectorization).
  /// \param Builder Abstraction that materializes the result (identity,
  ///        poison or an actual shufflevector) - may emit IR or just cost it.
  /// \param ScalarTy Scalar element type of the shuffled values.
  /// \returns Whatever the builder produces (instruction, value or cost).
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
    SmallVector<int> NewMask(Mask);
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      // Vector-of-vectors mode: widen each scalar mask index into
      // ScalarTyNumElements consecutive element indices.
      transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask&: NewMask);
      Mask = NewMask;
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
      VF = FTy->getNumElements();
    // Two-source path: taken only when V2 contributes non-poison lanes
    // according to the mask.
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                  V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg))
                  .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
      // Split the combined mask into one per-operand mask each (indices are
      // rebased so both masks address their own source from element 0).
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      // Iterate until looking through the shuffle chains of both operands
      // makes no further progress.
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
                    ->getNumElements(),
                Mask: ExtMask1, MaskArg: UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
                    ->getNumElements(),
                Mask: ExtMask2, MaskArg: UseMask::SecondArg);
            // Both shuffles must resize from the same source type and must not
            // actually use their second operand; only then is it safe to hop
            // to operand 0 and fold the shuffle masks into the combined ones.
            if (SV1->getOperand(i_nocapture: 0)->getType() ==
                    SV2->getOperand(i_nocapture: 0)->getType() &&
                SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
                isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
                isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
              Op1 = SV1->getOperand(i_nocapture: 0);
              Op2 = SV2->getOperand(i_nocapture: 0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
              CombinedMask1.swap(RHS&: ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
              CombinedMask2.swap(RHS&: ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(a: cast<VectorType>(Val: Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    b: cast<VectorType>(Val: Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      // Fold both per-operand masks back into a single two-source mask; if the
      // operands collapsed to the same value, no VF offset is needed.
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      // Same source with an identity mask (or a zero-splat mask matching the
      // existing shuffle) needs no new shuffle at all.
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
            isa<ShuffleVectorInst>(Val: Op1) &&
            cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
          CombinedMask1);
    }
    // Single-source path.
    if (isa<PoisonValue>(Val: V1))
      return Builder.createPoison(
          cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
    bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
13270
13271 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
13272 /// shuffle emission.
13273 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
13274 ArrayRef<int> Mask) {
13275 for (unsigned I : seq<unsigned>(Size: CommonMask.size()))
13276 if (Mask[I] != PoisonMaskElem)
13277 CommonMask[I] = I;
13278 }
13279};
13280} // namespace
13281
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
/// \param Ptrs Pointer operands of the scalar loads/stores being vectorized.
/// \param BasePtr Pointer chosen as the base of the vector access.
/// \param Opcode Load/Store selects the wide unit-stride access model; any
///        other opcode selects the scattered (masked-gather style) model.
/// \param ScalarTy Access type of a single scalar memory operation.
/// \param VecTy Access type of the resulting vector memory operation.
/// \returns A (scalar cost, vector cost) pair for the pointer computations.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plane wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
        CostKind);

    // Collect the pointers that survive vectorization (the base itself, plus
    // any pointer with uses outside the scalar access being replaced).
    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(Elt: V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(Elt: V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
                                       Info: TTI::PointersChainInfo::getKnownStride(),
                                       AccessTy: VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    // Stride is unknown only if every pointer is a GEP with at least one
    // non-constant index; otherwise the relationship is known.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Range&: Ptrs,
               P: [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
    // The vector code still needs one address computation; if the base is not
    // a GEP, cost a representative GEP from the list instead.
    auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(Val: *It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
                               Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
                               CostKind);
    }
  }

  return std::make_pair(x&: ScalarCost, y&: VecCost);
}
13363
// Tries to reorder the scalars of a gather node so that equal or related
// values become adjacent, enabling parts of the gather to be emitted as
// inserted subvectors. The new order is kept only if its estimated cost is
// cheaper than the plain buildvector cost; otherwise the permutation is
// applied directly to the scalars and the explicit order is dropped.
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it small (just 2 elements), all-constant or all
  // instructions have same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(Range&: TE.Scalars, P: isConstant))
    return;

  // Bail out if an earlier tree entry already covers the same scalars.
  if (any_of(Range: seq<unsigned>(Size: TE.Idx), P: [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(VL: TE.Scalars);
      }))
    return;

  // Hash subkey generator for loads: loads provably adjacent (or at least
  // pointer-compatible) to an already-seen load reuse that load's pointer
  // hash, so related loads land in the same cluster.
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(args: hash_value(value: LI->getParent()->getNumber()), args: Key);
    Value *Ptr =
        getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
    if (LoadKeyUsed.contains(V: Key)) {
      auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
      if (LIt != LoadsMap.end()) {
        // Prefer a load with a known constant pointer difference.
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
                              ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL: *DL, SE&: *SE,
                              /*StrictCheck=*/true))
            return hash_value(ptr: RLI->getPointerOperand());
        }
        // Otherwise settle for pointer compatibility.
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
                                    Ptr2: LI->getPointerOperand(), TLI: *TLI)) {
            hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
            return SubKey;
          }
        }
        // Large bucket - just merge with the last seen load's key.
        if (LIt->second.size() > 2) {
          hash_code SubKey =
              hash_value(ptr: LIt->second.back()->getPointerOperand());
          return SubKey;
        }
      }
    }
    LoadKeyUsed.insert(V: Key);
    LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)).first->second.push_back(Elt: LI);
    return hash_value(ptr: LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  // IsOrdered stays true while clustering does not actually change the
  // relative order of the interesting scalars.
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra vectorized
  // nodes.
  for (auto [I, V] : enumerate(First&: TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(Val: V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(Val: V) &&
        !isDeleted(I: Inst) && !isVectorized(V)) {
      std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
                                            /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(Val: V) &&
        !(isa<Constant, ExtractElementInst>(Val: V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Key: Idx) &&
          KeyToIndex.at(Val: Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Key: Idx) &&
          KeyToIndex.at(Val: Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(Elt: V);
    KTI.push_back(Elt: I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
  // Build the reorder indices from the clustering and record runs that can be
  // emitted as inserted subvectors (clearing their demanded bits).
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(N: TE.Scalars.size(), NV: TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(Val: V);
          for (auto [K, Idx] : enumerate(First&: Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(Val: P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              TTI: *TTI, Ty: TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Args: Cnt - Sz, Args: SubVF);
          for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt - Sz + SubVF))
            DemandedElts.clearBit(BitPosition: I);
        } else if (!P.second.empty() && isConstant(V: P.second.front())) {
          for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt))
            DemandedElts.clearBit(BitPosition: I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider it as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, VF: TE.Scalars.size());
  // Cost of the clustered form: subvector inserts + remaining scalar inserts
  // + the final permutation back into the requested order.
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind,
                             Index: Idx, SubTp: getWidenedType(ScalarTy, VF: Sz));
  }
  Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
                                   /*Insert=*/true,
                                   /*Extract=*/false, CostKind);
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *V = TE.getOrdered(Idx: I);
    if (isa<PoisonValue>(Val: V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(TTI: *TTI,
                           Kind: any_of(Range&: ReorderMask, P: [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           Tp: VecTy, Mask: ReorderMask);
  // Cost of the plain buildvector alternative: insert every non-constant
  // element, plus a two-source shuffle if constants must be blended in.
  DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
  ReorderMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *V = TE.getOrdered(Idx: I);
    if (isConstant(V)) {
      DemandedElts.clearBit(BitPosition: I);
      if (!isa<PoisonValue>(Val: V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost =
      getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
                               /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: ReorderMask);
  // Not profitable: bake the permutation into the scalars and forget the
  // explicit order.
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(Scalars&: TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
13525
/// Check if we can convert fadd/fsub sequence to FMAD.
/// \param VL The bundle of scalar fadd/fsub values being considered.
/// \param S Instruction state of the bundle (main/alt opcode, copyables).
/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       DominatorTree &DT, const DataLayout &DL,
                                       TargetTransformInfo &TTI,
                                       const TargetLibraryInfo &TLI) {
  assert(all_of(VL,
                [](Value *V) {
                  return V->getType()->getScalarType()->isFloatingPointTy();
                }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  // All instructions matching the bundle's main/alt opcode must carry the
  // 'contract' fast-math flag; copyables and non-matching values are ignored.
  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(Val: V);
      if (!I)
        continue;
      if (S.isCopyableElement(V: I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // fmul also should be contractable
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);

  // The first operand bundle must be a uniform (non-alternate) fmul.
  InstructionsState OpS = getSameOpcode(VL: Operands.front(), TLI);
  if (!OpS.valid())
    return InstructionCost::getInvalid();

  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the costs.
  InstructionCost FMulPlusFAddCost = 0;
  InstructionCost FMACost = 0;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  FastMathFlags FMF;
  FMF.set();
  // Accumulate the cost of the scalar fadd/fsub instructions and intersect
  // their fast-math flags for the fmuladd intrinsic query below.
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      continue;
    if (!S.isCopyableElement(V: I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
  }
  // Count the fmul operands that can actually be folded into fmuladd: they
  // must be single-use, non-copyable instructions. Non-foldable lanes keep
  // their standalone instruction costs on the FMA side too.
  unsigned NumOps = 0;
  for (auto [V, Op] : zip(t&: VL, u&: Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(Val: Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(V: I)) {
      if (auto *OpI = dyn_cast<Instruction>(Val: V))
        FMACost += TTI.getInstructionCost(U: OpI, CostKind);
      if (I)
        FMACost += TTI.getInstructionCost(U: I, CostKind);
      continue;
    }
    ++NumOps;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
  }
  Type *Ty = VL.front()->getType();
  IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
  FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
  return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
}
13608
// Checks whether a vectorized Shl node composes one wide integer out of
// zext'ed narrow elements - (zext x0) | (zext x1 << S) | (zext x2 << 2*S) ...
// (combined via an 'or' reduction) - which can be lowered as a plain bitcast
// of the narrow source vector, optionally preceded by a shuffle, a bswap, or
// replaced by a single wide scalar load.
// \param Order [out] Permutation required to bring the shift amounts into
//        ascending order; left empty if the node is already ordered.
// \param IsBSwap [out] Set if the bitcast+bswap lowering was chosen.
// \param ForLoads [out] Set if the wide-scalar-load lowering was chosen.
// \returns true if the bitcast-based lowering is cheaper than vectorizing.
bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
                             bool &IsBSwap, bool &ForLoads) const {
  assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
         "Expected Shl node.");
  IsBSwap = false;
  ForLoads = false;
  if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
      !TE.ReuseShuffleIndices.empty() || MinBWs.contains(Val: &TE) ||
      any_of(Range: TE.Scalars, P: [](Value *V) { return !V->hasOneUse(); }))
    return false;
  Type *ScalarTy = TE.getMainOp()->getType();
  // TODO: Check if same can be done for the vector types.
  if (!ScalarTy->isIntegerTy())
    return false;
  if (ScalarTy->isVectorTy())
    return false;
  const unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
  const TreeEntry *LhsTE = getOperandEntry(E: &TE, /*Idx=*/0);
  const TreeEntry *RhsTE = getOperandEntry(E: &TE, /*Idx=*/1);
  // Lhs should be zext i<stride> to I<sz>.
  if (!(LhsTE->State == TreeEntry::Vectorize &&
        LhsTE->getOpcode() == Instruction::ZExt &&
        LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
        !MinBWs.contains(Val: LhsTE) &&
        all_of(Range: LhsTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })))
    return false;
  Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
  unsigned Stride = DL->getTypeSizeInBits(Ty: SrcScalarTy);
  // The narrow elements must tile the wide integer exactly.
  if (!isPowerOf2_64(Value: Stride) || Stride >= Sz || Sz % Stride != 0 ||
      !isPowerOf2_64(Value: LhsTE->getVectorFactor()))
    return false;
  if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
        RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(Val: RhsTE)))
    return false;
  Order.clear();
  unsigned CurrentValue = 0;
  // Rhs should be (0, Stride, 2 * Stride, ..., N-Stride), where N <= Sz.
  if (all_of(Range: RhsTE->Scalars,
             P: [&](Value *V) {
               CurrentValue += Stride;
               if (isa<UndefValue>(Val: V))
                 return true;
               auto *C = dyn_cast<Constant>(Val: V);
               if (!C)
                 return false;
               return C->getUniqueInteger() == CurrentValue - Stride;
             }) &&
      CurrentValue <= Sz) {
    Order.clear();
  } else {
    const unsigned VF = RhsTE->getVectorFactor();
    Order.assign(NumElts: VF, Elt: VF);
    // Track which logical positions we've seen; reject duplicate shift amounts.
    SmallBitVector SeenPositions(VF);
    // Check if need to reorder Rhs to make it in form (0, Stride, 2 * Stride,
    // ..., N-Stride), where N <= Sz.
    if (VF * Stride > Sz)
      return false;
    for (const auto [Idx, V] : enumerate(First: RhsTE->Scalars)) {
      if (isa<UndefValue>(Val: V))
        continue;
      auto *C = dyn_cast<Constant>(Val: V);
      if (!C)
        return false;
      const APInt &Val = C->getUniqueInteger();
      // Every shift amount must be a non-negative in-range multiple of Stride.
      if (Val.isNegative() || Val.uge(RHS: Sz) || Val.getZExtValue() % Stride != 0)
        return false;
      unsigned Pos = Val.getZExtValue() / Stride;
      // TODO: Support Pos >= VF, in this case need to shift the final value.
      if (Order[Idx] != VF || Pos >= VF)
        return false;
      if (SeenPositions.test(Idx: Pos))
        return false;
      SeenPositions.set(Pos);
      Order[Idx] = Pos;
    }
    // One of the indices not set - exit.
    if (is_contained(Range&: Order, Element: VF))
      return false;
  }
  // Compare the cost of the vectorized shl + zext + or-reduction against the
  // bitcast-based lowerings below.
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  auto *SrcType = IntegerType::getIntNTy(C&: ScalarTy->getContext(),
                                         N: Stride * LhsTE->getVectorFactor());
  FastMathFlags FMF;
  SmallPtrSet<Value *, 4> CheckedExtracts;
  auto *VecTy = getWidenedType(ScalarTy, VF: TE.getVectorFactor());
  auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
  TTI::CastContextHint CastCtx =
      getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
  InstructionCost VecCost =
      TTI->getArithmeticReductionCost(Opcode: Instruction::Or, Ty: VecTy, FMF, CostKind) +
      TTI->getArithmeticInstrCost(Opcode: Instruction::Shl, Ty: VecTy, CostKind,
                                  Opd1Info: getOperandInfo(Ops: LhsTE->Scalars)) +
      TTI->getCastInstrCost(
          Opcode: Instruction::ZExt, Dst: VecTy,
          Src: getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor()), CCH: CastCtx,
          CostKind);
  InstructionCost BitcastCost = TTI->getCastInstrCost(
      Opcode: Instruction::BitCast, Dst: SrcType, Src: SrcVecTy, CCH: CastCtx, CostKind);
  if (!Order.empty()) {
    fixupOrderingIndices(Order);
    SmallVector<int> Mask;
    inversePermutation(Indices: Order, Mask);
    BitcastCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SrcVecTy,
                                    Mask, CostKind);
  }
  // Check if the combination can be modeled as a bitcast+byteswap operation.
  constexpr unsigned ByteSize = 8;
  if (!Order.empty() && isReverseOrder(Order) &&
      DL->getTypeSizeInBits(Ty: SrcScalarTy) == ByteSize) {
    IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
    InstructionCost BSwapCost =
        TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: SrcType, Src: SrcVecTy, CCH: CastCtx,
                              CostKind) +
        TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
    if (BSwapCost <= BitcastCost) {
      BitcastCost = BSwapCost;
      IsBSwap = true;
      Order.clear();
      // Check for loads in the ZExt node.
      const TreeEntry *SrcTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
      if (SrcTE->State == TreeEntry::Vectorize &&
          SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
          SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
          all_of(Range: SrcTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })) {
        auto *LI = cast<LoadInst>(Val: SrcTE->getMainOp());
        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
        // A single wide scalar load + bswap may beat bitcasting a vector load.
        InstructionCost BSwapCost =
            TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI->getAlign(),
                                 AddressSpace: LI->getPointerAddressSpace(), CostKind) +
            TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
        if (BSwapCost <= BitcastCost) {
          VecCost +=
              TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcVecTy, Alignment: LI->getAlign(),
                                   AddressSpace: LI->getPointerAddressSpace(), CostKind);
          BitcastCost = BSwapCost;
          ForLoads = true;
        }
      }
    }
  } else if (Order.empty() && DL->getTypeSizeInBits(Ty: SrcScalarTy) == ByteSize) {
    // Check for loads in the ZExt node.
    const TreeEntry *SrcTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
    if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
        SrcTE->ReuseShuffleIndices.empty() &&
        SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
        all_of(Range: SrcTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })) {
      auto *LI = cast<LoadInst>(Val: SrcTE->getMainOp());
      BitcastCost =
          TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI->getAlign(),
                               AddressSpace: LI->getPointerAddressSpace(), CostKind);
      VecCost +=
          TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcVecTy, Alignment: LI->getAlign(),
                               AddressSpace: LI->getPointerAddressSpace(), CostKind);
      ForLoads = true;
    }
  }
  // Account for widening the combined value when fewer narrow elements than
  // Sz/Stride are present.
  if (SrcType != ScalarTy) {
    BitcastCost += TTI->getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
                                         CCH: TTI::CastContextHint::None, CostKind);
  }
  return BitcastCost < VecCost;
}
13772
// Checks whether a select node whose results are zext'ed and whose compare
// operand mixes a predicate with its exact inverse can still be vectorized
// profitably with a single (non-alternate) predicate: lanes using the
// inverted predicate are recorded in \p InversedCmpsIndices so the caller can
// compensate. Returns true when the single-predicate vector compare is
// estimated cheaper than building the compares as scalars.
bool BoUpSLP::matchesInversedZExtSelect(
    const TreeEntry &SelectTE,
    SmallVectorImpl<unsigned> &InversedCmpsIndices) const {
  assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
         "Expected select node.");
  // Collect the zext'ed select results; without any the pattern cannot apply.
  SmallVector<std::pair<Instruction *, unsigned>> ZExts;
  for (auto [Idx, V] : enumerate(First: SelectTE.Scalars)) {
    auto *Inst = dyn_cast<Instruction>(Val: V);
    if (!Inst || Inst->getOpcode() != Instruction::ZExt)
      continue;
    ZExts.emplace_back(Args&: Inst, Args&: Idx);
  }
  if (ZExts.empty())
    return false;
  const auto *CmpTE = getOperandEntry(E: &SelectTE, Idx: 0);
  const auto *Op1TE = getOperandEntry(E: &SelectTE, Idx: 1);
  const auto *Op2TE = getOperandEntry(E: &SelectTE, Idx: 2);
  // Compares must be alternate vectorized, and other operands must be gathers
  // or copyables.
  // TODO: investigate opportunity for reordered/reused nodes.
  if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
      (CmpTE->getOpcode() != Instruction::ICmp &&
       CmpTE->getOpcode() != Instruction::FCmp) ||
      !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
      !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
      !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
    return false;
  // The operands must be buildvectors/copyables.
  if (!Op1TE->isGather() || !Op2TE->isGather())
    return false;
  // TODO: investigate opportunity for the vector nodes with copyables.
  auto *Cmp = CmpTE->getMainOp();
  CmpPredicate Pred;
  auto MatchCmp = m_Cmp(Pred, L: m_Value(), R: m_Value());
  if (!match(V: Cmp, P: MatchCmp))
    return false;
  CmpPredicate MainPred = Pred;
  CmpPredicate InversedPred(CmpInst::getInversePredicate(pred: Pred),
                            Pred.hasSameSign());
  // Every compare must match either the main predicate or its inverse; the
  // inverted ones must be single-use and are recorded by lane index.
  for (const auto [Idx, V] : enumerate(First: CmpTE->Scalars)) {
    if (!match(V, P: MatchCmp))
      continue;
    if (CmpPredicate::getMatching(A: MainPred, B: Pred))
      continue;
    if (!CmpPredicate::getMatching(A: InversedPred, B: Pred))
      return false;
    if (!V->hasOneUse())
      return false;
    InversedCmpsIndices.push_back(Elt: Idx);
  }

  if (InversedCmpsIndices.empty())
    return false;
  VectorType *VecTy =
      getWidenedType(ScalarTy: Cmp->getOperand(i: 0)->getType(), VF: CmpTE->getVectorFactor());
  Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: VecTy);

  // Compare the single-predicate vector compare against scalarizing the
  // compares and gathering their results.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost VecCost =
      TTI->getCmpSelInstrCost(Opcode: CmpTE->getOpcode(), ValTy: VecTy, CondTy: CmpTy, VecPred: MainPred,
                              CostKind, Op1Info: getOperandInfo(Ops: CmpTE->getOperand(OpIdx: 0)),
                              Op2Info: getOperandInfo(Ops: CmpTE->getOperand(OpIdx: 1)));
  InstructionCost BVCost =
      ::getScalarizationOverhead(TTI: *TTI, ScalarTy: Cmp->getType(), Ty: cast<VectorType>(Val: CmpTy),
                                 DemandedElts: APInt::getAllOnes(numBits: CmpTE->getVectorFactor()),
                                 /*Insert=*/true, /*Extract=*/false, CostKind);
  for (Value *V : CmpTE->Scalars) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      continue;
    BVCost += TTI->getInstructionCost(U: I, CostKind);
  }
  return VecCost < BVCost;
}
13847
// Matches an 'or' reduction of selects of the form (cond_i ? 2^i : 0), which
// is equivalent (on little-endian targets) to bitcasting the vector of i1
// compare results into a single integer whose bit i is cond_i. Returns true
// when the bitcast lowering is estimated no more expensive than the vector
// select + or-reduction.
bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const {
  assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
         "Expected select node.");
  // The i1-vector-to-integer bitcast trick relies on little-endian bit order.
  if (DL->isBigEndian())
    return false;
  if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
    return false;
  // Only applies to the root node of a reduction (UserIgnoreList holds the
  // reduction operations).
  if (!UserIgnoreList || SelectTE.Idx != 0)
    return false;
  if (any_of(Range: SelectTE.Scalars, P: [](Value *V) { return !V->hasOneUse(); }))
    return false;
  // Check that all reduction operands are or instructions.
  if (any_of(Range: *UserIgnoreList,
             P: [](Value *V) { return !match(V, P: m_Or(L: m_Value(), R: m_Value())); }))
    return false;
  const TreeEntry *Op1TE = getOperandEntry(E: &SelectTE, Idx: 1);
  const TreeEntry *Op2TE = getOperandEntry(E: &SelectTE, Idx: 2);
  if (!Op1TE->isGather() || !Op2TE->isGather())
    return false;
  // No need to check for zeroes reordering.
  if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
      !Op2TE->ReuseShuffleIndices.empty())
    return false;
  Type *ScalarTy = Op1TE->Scalars.front()->getType();
  if (!ScalarTy->isIntegerTy())
    return false;
  // Check that second operand is all zeroes.
  if (any_of(Range: Op2TE->Scalars, P: [](Value *V) { return !match(V, P: m_ZeroInt()); }))
    return false;
  // Check that first operand is 1,2,4,...
  if (any_of(Range: enumerate(First: Op1TE->Scalars), P: [](const auto &P) {
        uint64_t V;
        return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(Value: V) &&
                 Log2_64(Value: V) == P.index());
      }))
    return false;
  // Check if bitcast is cheaper than select.
  auto *DstTy = IntegerType::getIntNTy(C&: ScalarTy->getContext(),
                                       N: SelectTE.getVectorFactor());
  VectorType *OpTy = getWidenedType(ScalarTy: DstTy, VF: SelectTE.getVectorFactor());
  Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: OpTy);
  VectorType *VecTy = getWidenedType(ScalarTy, VF: SelectTE.getVectorFactor());
  // Honor any bitwidth minimization already decided for this node.
  auto It = MinBWs.find(Val: &SelectTE);
  if (It != MinBWs.end()) {
    auto *EffectiveScalarTy =
        IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
    VecTy = getWidenedType(ScalarTy: EffectiveScalarTy, VF: SelectTE.getVectorFactor());
  }
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost BitcastCost = TTI->getCastInstrCost(
      Opcode: Instruction::BitCast, Dst: DstTy, Src: CmpTy, CCH: TTI::CastContextHint::None, CostKind);
  // Widen the bitcast result if it is narrower than the reduction type.
  if (DstTy != ScalarTy) {
    BitcastCost += TTI->getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: DstTy,
                                         CCH: TTI::CastContextHint::None, CostKind);
  }
  FastMathFlags FMF;
  InstructionCost SelectCost =
      TTI->getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy: CmpTy,
                              VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind,
                              Op1Info: getOperandInfo(Ops: Op1TE->Scalars),
                              Op2Info: getOperandInfo(Ops: Op2TE->Scalars)) +
      TTI->getArithmeticReductionCost(Opcode: Instruction::Or, Ty: VecTy, FMF, CostKind);
  return BitcastCost <= SelectCost;
}
13912
13913void BoUpSLP::transformNodes() {
13914 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13915 BaseGraphSize = VectorizableTree.size();
13916 // Turn graph transforming mode on and off, when done.
13917 class GraphTransformModeRAAI {
13918 bool &SavedIsGraphTransformMode;
13919
13920 public:
13921 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13922 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13923 IsGraphTransformMode = true;
13924 }
13925 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13926 } TransformContext(IsGraphTransformMode);
13927 // Operands are profitable if they are:
13928 // 1. At least one constant
13929 // or
13930 // 2. Splats
13931 // or
13932 // 3. Results in good vectorization opportunity, i.e. may generate vector
13933 // nodes and reduce cost of the graph.
13934 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13935 const InstructionsState &S) {
13936 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13937 for (unsigned Op : seq<unsigned>(Size: S.getMainOp()->getNumOperands()))
13938 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
13939 Args: I2->getOperand(i: Op));
13940 return all_of(Range&: Candidates, P: [this](
13941 ArrayRef<std::pair<Value *, Value *>> Cand) {
13942 return all_of(Range&: Cand,
13943 P: [](const std::pair<Value *, Value *> &P) {
13944 return isa<Constant>(Val: P.first) ||
13945 isa<Constant>(Val: P.second) || P.first == P.second;
13946 }) ||
13947 findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplatLoads).first;
13948 });
13949 };
13950
13951 // Try to reorder gather nodes for better vectorization opportunities.
13952 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
13953 TreeEntry &E = *VectorizableTree[Idx];
13954 if (E.isGather())
13955 reorderGatherNode(TE&: E);
13956 }
13957
13958 // Better to use full gathered loads analysis, if there are only 2 loads
13959 // gathered nodes each having less than 16 elements.
13960 constexpr unsigned VFLimit = 16;
13961 bool ForceLoadGather =
13962 count_if(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
13963 return TE->isGather() && TE->hasState() &&
13964 TE->getOpcode() == Instruction::Load &&
13965 TE->getVectorFactor() < VFLimit;
13966 }) == 2;
13967
13968 // Checks if the scalars are used in other node.
13969 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13970 function_ref<bool(Value *)> CheckContainer) {
13971 return TE->isSame(VL) || all_of(Range&: VL, P: [&](Value *V) {
13972 if (isa<PoisonValue>(Val: V))
13973 return true;
13974 auto *I = dyn_cast<Instruction>(Val: V);
13975 if (!I)
13976 return false;
13977 return is_contained(Range: TE->Scalars, Element: I) || CheckContainer(I);
13978 });
13979 };
13980 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13981 if (E.hasState()) {
13982 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: E.getMainOp());
13983 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13984 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13985 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13986 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13987 return is_contained(Range&: TEs, Element: TE);
13988 });
13989 });
13990 }))
13991 return true;
13992 ;
13993 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: E.getMainOp());
13994 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13995 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13996 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13997 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13998 return is_contained(Range&: TEs, Element: TE);
13999 });
14000 });
14001 }))
14002 return true;
14003 } else {
14004 // Check if the gather node full copy of split node.
14005 auto *It = find_if(Range: E.Scalars, P: IsaPred<Instruction>);
14006 if (It != E.Scalars.end()) {
14007 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: *It);
14008 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
14009 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
14010 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
14011 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
14012 return is_contained(Range&: TEs, Element: TE);
14013 });
14014 });
14015 }))
14016 return true;
14017 }
14018 }
14019 return false;
14020 };
14021 // The tree may grow here, so iterate over nodes, built before.
14022 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
14023 TreeEntry &E = *VectorizableTree[Idx];
14024 if (E.isGather()) {
14025 ArrayRef<Value *> VL = E.Scalars;
14026 const unsigned Sz = getVectorElementSize(V: VL.front());
14027 unsigned MinVF = getMinVF(Sz: 2 * Sz);
14028 // Do not try partial vectorization for small nodes (<= 2), nodes with the
14029 // same opcode and same parent block or all constants.
14030 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(key: Idx) ||
14031 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
14032 // We use allSameOpcode instead of isAltShuffle because we don't
14033 // want to use interchangeable instruction here.
14034 !allSameOpcode(VL) || !allSameBlock(VL)) ||
14035 allConstant(VL) || isSplat(VL))
14036 continue;
14037 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
14038 continue;
14039 // Check if the node is a copy of other vector nodes.
14040 if (CheckForSameVectorNodes(E))
14041 continue;
14042 // Try to find vectorizable sequences and transform them into a series of
14043 // insertvector instructions.
14044 unsigned StartIdx = 0;
14045 unsigned End = VL.size();
14046 SmallBitVector Processed(End);
14047 for (unsigned VF = getFloorFullVectorNumberOfElements(
14048 TTI: *TTI, Ty: VL.front()->getType(), Sz: VL.size() - 1);
14049 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
14050 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
14051 if (StartIdx + VF > End)
14052 continue;
14053 SmallVector<std::pair<unsigned, unsigned>> Slices;
14054 bool AllStrided = true;
14055 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
14056 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
14057 // If any instruction is vectorized already - do not try again.
14058 // Reuse the existing node, if it fully matches the slice.
14059 if ((Processed.test(Idx: Cnt) || isVectorized(V: Slice.front())) &&
14060 !getSameValuesTreeEntry(V: Slice.front(), VL: Slice, /*SameVF=*/true))
14061 continue;
14062 // Constant already handled effectively - skip.
14063 if (allConstant(VL: Slice))
14064 continue;
14065 // Do not try to vectorize small splats (less than vector register and
14066 // only with the single non-undef element).
14067 bool IsSplat = isSplat(VL: Slice);
14068 bool IsTwoRegisterSplat = true;
14069 if (IsSplat && VF == 2) {
14070 unsigned NumRegs2VF = ::getNumberOfParts(
14071 TTI: *TTI, VecTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF: 2 * VF));
14072 IsTwoRegisterSplat = NumRegs2VF == 2;
14073 }
14074 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
14075 count(Range&: Slice, Element: Slice.front()) ==
14076 static_cast<long>(isa<UndefValue>(Val: Slice.front()) ? VF - 1
14077 : 1)) {
14078 if (IsSplat)
14079 continue;
14080 InstructionsState S = getSameOpcode(VL: Slice, TLI: *TLI);
14081 if (!S || !allSameOpcode(VL: Slice) || !allSameBlock(VL: Slice) ||
14082 (S.getOpcode() == Instruction::Load &&
14083 areKnownNonVectorizableLoads(VL: Slice)) ||
14084 (S.getOpcode() != Instruction::Load &&
14085 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), Sz: VF)))
14086 continue;
14087 if (VF == 2) {
14088 // Try to vectorize reduced values or if all users are vectorized.
14089 // For expensive instructions extra extracts might be profitable.
14090 if ((!UserIgnoreList || E.Idx != 0) &&
14091 TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
14092 TTI::TCC_Expensive &&
14093 !all_of(Range&: Slice, P: [&](Value *V) {
14094 if (isa<PoisonValue>(Val: V))
14095 return true;
14096 return areAllUsersVectorized(I: cast<Instruction>(Val: V),
14097 VectorizedVals: UserIgnoreList);
14098 }))
14099 continue;
14100 if (S.getOpcode() == Instruction::Load) {
14101 OrdersType Order;
14102 SmallVector<Value *> PointerOps;
14103 StridedPtrInfo SPtrInfo;
14104 LoadsState Res = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
14105 PointerOps, SPtrInfo);
14106 AllStrided &= Res == LoadsState::StridedVectorize ||
14107 Res == LoadsState::ScatterVectorize ||
14108 Res == LoadsState::Gather;
14109 // Do not vectorize gathers.
14110 if (Res == LoadsState::ScatterVectorize ||
14111 Res == LoadsState::Gather) {
14112 if (Res == LoadsState::Gather) {
14113 registerNonVectorizableLoads(VL: Slice);
14114 // If reductions and the scalars from the root node are
14115 // analyzed - mark as non-vectorizable reduction.
14116 if (UserIgnoreList && E.Idx == 0)
14117 analyzedReductionVals(VL: Slice);
14118 }
14119 continue;
14120 }
14121 } else if (S.getOpcode() == Instruction::ExtractElement ||
14122 (TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
14123 TTI::TCC_Expensive &&
14124 !CheckOperandsProfitability(
14125 S.getMainOp(),
14126 cast<Instruction>(Val: *find_if(Range: reverse(C&: Slice),
14127 P: IsaPred<Instruction>)),
14128 S))) {
14129 // Do not vectorize extractelements (handled effectively
14130 // alread). Do not vectorize non-profitable instructions (with
14131 // low cost and non-vectorizable operands.)
14132 continue;
14133 }
14134 }
14135 }
14136 Slices.emplace_back(Args&: Cnt, Args: Slice.size());
14137 }
14138 // Do not try to vectorize if all slides are strided or gathered with
14139 // vector factor 2 and there are more than 2 slices. Better to handle
14140 // them in gathered loads analysis, may result in better vectorization.
14141 if (VF == 2 && AllStrided && Slices.size() > 2)
14142 continue;
14143 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
14144 E.CombinedEntriesWithIndices.emplace_back(Args&: Idx, Args&: Cnt);
14145 Processed.set(I: Cnt, E: Cnt + Sz);
14146 if (StartIdx == Cnt)
14147 StartIdx = Cnt + Sz;
14148 if (End == Cnt + Sz)
14149 End = Cnt;
14150 };
14151 for (auto [Cnt, Sz] : Slices) {
14152 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: Sz);
14153 const TreeEntry *SameTE = nullptr;
14154 if (const auto *It = find_if(Range&: Slice, P: IsaPred<Instruction>);
14155 It != Slice.end()) {
14156 // If any instruction is vectorized already - do not try again.
14157 SameTE = getSameValuesTreeEntry(V: *It, VL: Slice);
14158 }
14159 unsigned PrevSize = VectorizableTree.size();
14160 [[maybe_unused]] unsigned PrevEntriesSize =
14161 LoadEntriesToVectorize.size();
14162 buildTreeRec(VLRef: Slice, Depth: 0, UserTreeIdx: EdgeInfo(&E, UINT_MAX));
14163 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
14164 VectorizableTree[PrevSize]->isGather() &&
14165 VectorizableTree[PrevSize]->hasState() &&
14166 VectorizableTree[PrevSize]->getOpcode() !=
14167 Instruction::ExtractElement &&
14168 !isSplat(VL: Slice)) {
14169 if (UserIgnoreList && E.Idx == 0 && VF == 2)
14170 analyzedReductionVals(VL: Slice);
14171 VectorizableTree.pop_back();
14172 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
14173 "LoadEntriesToVectorize expected to remain the same");
14174 continue;
14175 }
14176 AddCombinedNode(PrevSize, Cnt, Sz);
14177 }
14178 }
14179 // Restore ordering, if no extra vectorization happened.
14180 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
14181 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
14182 reorderScalars(Scalars&: E.Scalars, Mask);
14183 E.ReorderIndices.clear();
14184 }
14185 }
14186 if (!E.hasState())
14187 continue;
14188 switch (E.getOpcode()) {
14189 case Instruction::Load: {
14190 // No need to reorder masked gather loads, just reorder the scalar
14191 // operands.
14192 if (E.State != TreeEntry::Vectorize)
14193 break;
14194 Type *ScalarTy = E.getMainOp()->getType();
14195 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
14196 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
14197 // Check if profitable to represent consecutive load + reverse as strided
14198 // load with stride -1.
14199 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
14200 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
14201 SmallVector<int> Mask;
14202 inversePermutation(Indices: E.ReorderIndices, Mask);
14203 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
14204 InstructionCost OriginalVecCost =
14205 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
14206 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
14207 OpdInfo: TTI::OperandValueInfo()) +
14208 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
14209 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
14210 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
14211 VecTy, BaseLI->getPointerOperand(),
14212 /*VariableMask=*/false, CommonAlignment,
14213 BaseLI),
14214 CostKind);
14215 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
14216 // Strided load is more profitable than consecutive load + reverse -
14217 // transform the node to strided load.
14218 Type *StrideTy = DL->getIndexType(PtrTy: cast<LoadInst>(Val: E.Scalars.front())
14219 ->getPointerOperand()
14220 ->getType());
14221 StridedPtrInfo SPtrInfo;
14222 SPtrInfo.StrideVal = ConstantInt::get(Ty: StrideTy, V: 1);
14223 SPtrInfo.Ty = VecTy;
14224 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
14225 E.State = TreeEntry::StridedVectorize;
14226 }
14227 }
14228 break;
14229 }
14230 case Instruction::Store: {
14231 Type *ScalarTy =
14232 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
14233 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
14234 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
14235 // Check if profitable to represent consecutive load + reverse as strided
14236 // load with stride -1.
14237 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
14238 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
14239 SmallVector<int> Mask;
14240 inversePermutation(Indices: E.ReorderIndices, Mask);
14241 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
14242 InstructionCost OriginalVecCost =
14243 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
14244 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
14245 OpdInfo: TTI::OperandValueInfo()) +
14246 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
14247 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
14248 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
14249 VecTy, BaseSI->getPointerOperand(),
14250 /*VariableMask=*/false, CommonAlignment,
14251 BaseSI),
14252 CostKind);
14253 if (StridedCost < OriginalVecCost)
14254 // Strided store is more profitable than reverse + consecutive store -
14255 // transform the node to strided store.
14256 E.State = TreeEntry::StridedVectorize;
14257 } else if (!E.ReorderIndices.empty()) {
14258 // Check for interleaved stores.
14259 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
14260 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.front());
14261 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
14262 if (Mask.size() < 4)
14263 return 0u;
14264 for (unsigned Factor : seq<unsigned>(Begin: 2, End: Mask.size() / 2 + 1)) {
14265 if (ShuffleVectorInst::isInterleaveMask(
14266 Mask, Factor, NumInputElts: VecTy->getElementCount().getFixedValue()) &&
14267 TTI.isLegalInterleavedAccessType(
14268 VTy: VecTy, Factor, Alignment: BaseSI->getAlign(),
14269 AddrSpace: BaseSI->getPointerAddressSpace()))
14270 return Factor;
14271 }
14272
14273 return 0u;
14274 };
14275 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
14276 unsigned InterleaveFactor = IsInterleaveMask(Mask);
14277 if (InterleaveFactor != 0)
14278 E.setInterleave(InterleaveFactor);
14279 }
14280 break;
14281 }
14282 case Instruction::Select: {
14283 if (E.State != TreeEntry::Vectorize)
14284 break;
14285 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: E.Scalars);
14286 if (MinMaxID != Intrinsic::not_intrinsic) {
14287 // This node is a minmax node.
14288 E.CombinedOp = TreeEntry::MinMax;
14289 TreeEntry *CondEntry = getOperandEntry(E: &E, Idx: 0);
14290 if (SelectOnly && CondEntry->UserTreeIndex &&
14291 CondEntry->State == TreeEntry::Vectorize) {
14292 // The condition node is part of the combined minmax node.
14293 CondEntry->State = TreeEntry::CombinedVectorize;
14294 }
14295 break;
14296 }
14297 // Check for zext + selects, which can be reordered.
14298 SmallVector<unsigned> InversedCmpsIndices;
14299 if (matchesInversedZExtSelect(SelectTE: E, InversedCmpsIndices)) {
14300 auto *CmpTE = getOperandEntry(E: &E, Idx: 0);
14301 auto *Op1TE = getOperandEntry(E: &E, Idx: 1);
14302 auto *Op2TE = getOperandEntry(E: &E, Idx: 2);
14303 // State now is uniform, not alternate opcode.
14304 CmpTE->setOperations(
14305 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
14306 // Update mapping between the swapped values and their internal matching
14307 // nodes.
14308 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
14309 Value *V) {
14310 if (isConstant(V))
14311 return;
14312 auto It = ValueToGatherNodes.find(Val: V);
14313 assert(It != ValueToGatherNodes.end() &&
14314 "Expected to find the value in the map.");
14315 auto &C = It->getSecond();
14316 if (!is_contained(Range&: OldTE->Scalars, Element: V))
14317 C.remove(X: OldTE);
14318 C.insert(X: NewTE);
14319 };
14320 ValueList &Op1 = E.getOperand(OpIdx: 1);
14321 ValueList &Op2 = E.getOperand(OpIdx: 2);
14322 for (const unsigned Idx : InversedCmpsIndices) {
14323 Value *V1 = Op1TE->Scalars[Idx];
14324 Value *V2 = Op2TE->Scalars[Idx];
14325 std::swap(a&: Op1TE->Scalars[Idx], b&: Op2TE->Scalars[Idx]);
14326 std::swap(a&: Op1[Idx], b&: Op2[Idx]);
14327 UpdateGatherEntry(Op1TE, Op2TE, V1);
14328 UpdateGatherEntry(Op2TE, Op1TE, V2);
14329 }
14330 OperandsToTreeEntry.emplace_or_assign(Key: std::make_pair(x: &E, y: 1), Args&: Op1TE);
14331 OperandsToTreeEntry.emplace_or_assign(Key: std::make_pair(x: &E, y: 2), Args&: Op2TE);
14332 // NB: Fallback to check if select can be converted to cmp bitcast.
14333 }
14334 if (matchesSelectOfBits(SelectTE: E)) {
14335 // This node is a (reduced or) cmp bitcast node.
14336 const TreeEntry::CombinedOpcode Code = TreeEntry::ReducedCmpBitcast;
14337 E.CombinedOp = Code;
14338 auto *Op1TE = getOperandEntry(E: &E, Idx: 1);
14339 auto *Op2TE = getOperandEntry(E: &E, Idx: 2);
14340 Op1TE->State = TreeEntry::CombinedVectorize;
14341 Op1TE->CombinedOp = Code;
14342 Op2TE->State = TreeEntry::CombinedVectorize;
14343 Op2TE->CombinedOp = Code;
14344 break;
14345 }
14346 break;
14347 }
14348 case Instruction::FSub:
14349 case Instruction::FAdd: {
14350 // Check if possible to convert (a*b)+c to fma.
14351 if (E.State != TreeEntry::Vectorize ||
14352 !E.getOperations().isAddSubLikeOp())
14353 break;
14354 const TreeEntry *LHS = getOperandEntry(E: &E, Idx: 0);
14355 const TreeEntry *RHS = getOperandEntry(E: &E, Idx: 1);
14356 auto IsOneUseVectorFMulOperand = [](const TreeEntry *TE) {
14357 return TE->State == TreeEntry::Vectorize &&
14358 TE->ReorderIndices.empty() && TE->ReuseShuffleIndices.empty() &&
14359 TE->getOpcode() == Instruction::FMul && !TE->isAltShuffle() &&
14360 all_of(Range: TE->Scalars, P: [&](Value *V) {
14361 return (TE->hasCopyableElements() &&
14362 TE->isCopyableElement(V)) ||
14363 V->hasOneUse();
14364 });
14365 };
14366 if (!IsOneUseVectorFMulOperand(LHS) &&
14367 (E.getOpcode() == Instruction::FSub ||
14368 !IsOneUseVectorFMulOperand(RHS)))
14369 break;
14370 if (!canConvertToFMA(VL: E.Scalars, S: E.getOperations(), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
14371 .isValid())
14372 break;
14373 // This node is a fmuladd node.
14374 E.CombinedOp = TreeEntry::FMulAdd;
14375 TreeEntry *FMulEntry = getOperandEntry(E: &E, Idx: 0);
14376 if (FMulEntry->UserTreeIndex &&
14377 FMulEntry->State == TreeEntry::Vectorize) {
14378 // The FMul node is part of the combined fmuladd node.
14379 FMulEntry->State = TreeEntry::CombinedVectorize;
14380 }
14381 break;
14382 }
14383 case Instruction::Shl: {
14384 if (E.Idx != 0 || DL->isBigEndian())
14385 break;
14386 if (!UserIgnoreList)
14387 break;
14388 // Check that all reduction operands are disjoint or instructions.
14389 if (any_of(Range: *UserIgnoreList, P: [](Value *V) {
14390 return !match(V, P: m_DisjointOr(L: m_Value(), R: m_Value()));
14391 }))
14392 break;
14393 OrdersType Order;
14394 bool IsBSwap;
14395 bool ForLoads;
14396 if (!matchesShlZExt(TE: E, Order, IsBSwap, ForLoads))
14397 break;
14398 // This node is a (reduced disjoint or) bitcast node.
14399 TreeEntry::CombinedOpcode Code =
14400 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
14401 : TreeEntry::ReducedBitcastBSwap)
14402 : (ForLoads ? TreeEntry::ReducedBitcastLoads
14403 : TreeEntry::ReducedBitcast);
14404 E.CombinedOp = Code;
14405 E.ReorderIndices = std::move(Order);
14406 TreeEntry *ZExtEntry = getOperandEntry(E: &E, Idx: 0);
14407 assert(ZExtEntry->UserTreeIndex &&
14408 ZExtEntry->State == TreeEntry::Vectorize &&
14409 ZExtEntry->getOpcode() == Instruction::ZExt &&
14410 "Expected ZExt node.");
14411 // The ZExt node is part of the combined node.
14412 ZExtEntry->State = TreeEntry::CombinedVectorize;
14413 ZExtEntry->CombinedOp = Code;
14414 if (ForLoads) {
14415 TreeEntry *LoadsEntry = getOperandEntry(E: ZExtEntry, Idx: 0);
14416 assert(LoadsEntry->UserTreeIndex &&
14417 LoadsEntry->State == TreeEntry::Vectorize &&
14418 LoadsEntry->getOpcode() == Instruction::Load &&
14419 "Expected Load node.");
14420 // The Load node is part of the combined node.
14421 LoadsEntry->State = TreeEntry::CombinedVectorize;
14422 LoadsEntry->CombinedOp = Code;
14423 }
14424 TreeEntry *ConstEntry = getOperandEntry(E: &E, Idx: 1);
14425 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
14426 "Expected ZExt node.");
14427 // The ConstNode node is part of the combined node.
14428 ConstEntry->State = TreeEntry::CombinedVectorize;
14429 ConstEntry->CombinedOp = Code;
14430 break;
14431 }
14432 default:
14433 break;
14434 }
14435 }
14436
14437 if (LoadEntriesToVectorize.empty()) {
14438 // Single load node - exit.
14439 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
14440 VectorizableTree.front()->getOpcode() == Instruction::Load)
14441 return;
14442 // Small graph with small VF - exit.
14443 constexpr unsigned SmallTree = 3;
14444 constexpr unsigned SmallVF = 2;
14445 if ((VectorizableTree.size() <= SmallTree &&
14446 VectorizableTree.front()->Scalars.size() == SmallVF) ||
14447 (VectorizableTree.size() <= 2 && UserIgnoreList))
14448 return;
14449
14450 if (VectorizableTree.front()->isNonPowOf2Vec() &&
14451 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
14452 getCanonicalGraphSize() <= SmallTree &&
14453 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
14454 P: [](const std::unique_ptr<TreeEntry> &TE) {
14455 return TE->isGather() && TE->hasState() &&
14456 TE->getOpcode() == Instruction::Load &&
14457 !allSameBlock(VL: TE->Scalars);
14458 }) == 1)
14459 return;
14460 }
14461
14462 // A list of loads to be gathered during the vectorization process. We can
14463 // try to vectorize them at the end, if profitable.
14464 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
14465 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
14466 GatheredLoads;
14467
14468 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14469 TreeEntry &E = *TE;
14470 if (E.isGather() &&
14471 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
14472 (!E.hasState() && any_of(Range&: E.Scalars,
14473 P: [&](Value *V) {
14474 return isa<LoadInst>(Val: V) &&
14475 !isVectorized(V) &&
14476 !isDeleted(I: cast<Instruction>(Val: V));
14477 }))) &&
14478 !isSplat(VL: E.Scalars)) {
14479 for (Value *V : E.Scalars) {
14480 auto *LI = dyn_cast<LoadInst>(Val: V);
14481 if (!LI)
14482 continue;
14483 if (isDeleted(I: LI) || isVectorized(V: LI) || !LI->isSimple())
14484 continue;
14485 gatherPossiblyVectorizableLoads(
14486 R: *this, VL: V, DL: *DL, SE&: *SE, TTI: *TTI,
14487 GatheredLoads&: GatheredLoads[std::make_tuple(
14488 args: LI->getParent(),
14489 args: getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth),
14490 args: LI->getType())]);
14491 }
14492 }
14493 }
14494 // Try to vectorize gathered loads if this is not just a gather of loads.
14495 if (!GatheredLoads.empty())
14496 tryToVectorizeGatheredLoads(GatheredLoads);
14497}
14498
14499/// Merges shuffle masks and emits final shuffle instruction, if required. It
14500/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
14501/// when the actual shuffle instruction is generated only if this is actually
14502/// required. Otherwise, the shuffle instruction emission is delayed till the
14503/// end of the process, to reduce the number of emitted instructions and further
14504/// analysis/transformations.
14505class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
14506 bool IsFinalized = false;
14507 SmallVector<int> CommonMask;
14508 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
14509 const TargetTransformInfo &TTI;
14510 InstructionCost Cost = 0;
14511 SmallDenseSet<Value *> VectorizedVals;
14512 BoUpSLP &R;
14513 SmallPtrSetImpl<Value *> &CheckedExtracts;
14514 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14515 /// While set, still trying to estimate the cost for the same nodes and we
14516 /// can delay actual cost estimation (virtual shuffle instruction emission).
14517 /// May help better estimate the cost if same nodes must be permuted + allows
14518 /// to move most of the long shuffles cost estimation to TTI.
14519 bool SameNodesEstimated = true;
14520
14521 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
14522 if (Ty->getScalarType()->isPointerTy()) {
14523 Constant *Res = ConstantExpr::getIntToPtr(
14524 C: ConstantInt::getAllOnesValue(
14525 Ty: IntegerType::get(C&: Ty->getContext(),
14526 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
14527 Ty: Ty->getScalarType());
14528 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
14529 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
14530 return Res;
14531 }
14532 return Constant::getAllOnesValue(Ty);
14533 }
14534
  /// Estimates the cost of materializing the scalars in \p VL as a vector
  /// (build-vector). Constants and all-undef sequences are free; splats are
  /// costed as a single insertelement plus (if needed) a broadcast shuffle;
  /// everything else falls back to the generic gather cost.
  /// \p Root, when non-null, marks that the gather has an external source
  /// vector; it disables the all-constant and splat shortcuts.
  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    // Constants (without a root) and fully-undef sequences cost nothing.
    if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(Range&: VL, Element: *It) > 1 &&
          (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
      if (!NeedShuffle) {
        // Single non-undef element: one insert is enough.
        if (isa<FixedVectorType>(Val: ScalarTy)) {
          // Revectorization case: the "scalar" is itself a vector, so the
          // insert is modeled as an insert-subvector shuffle.
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(
              Kind: TTI::SK_InsertSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind,
              Index: std::distance(first: VL.begin(), last: It) * getNumElements(Ty: ScalarTy),
              SubTp: cast<FixedVectorType>(Val: ScalarTy));
        }
        return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
                                      CostKind, Index: std::distance(first: VL.begin(), last: It),
                                      Op0: PoisonValue::get(T: VecTy), Op1: *It);
      }

      // Broadcast: insert the scalar into lane 0, then splat it with a
      // shuffle (poison lanes stay poison in the mask).
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
        return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
                                 Op0: PoisonValue::get(T: VecTy), Op1: *It);
      return InsertCost + ::getShuffleCost(TTI,
                                           Kind: TargetTransformInfo::SK_Broadcast,
                                           Tp: VecTy, Mask: ShuffleMask, CostKind,
                                           /*Index=*/0, /*SubTp=*/nullptr,
                                           /*Args=*/*It);
    }
    // Generic case: full gather of the (possibly root-sourced) scalars.
    return GatherCost +
           (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
                                  ScalarTy));
  };
14582
  /// Compute the cost of creating a vector containing the extracted values from
  /// \p VL.
  /// \param VL scalars being gathered; only ExtractElementInst entries matter.
  /// \param Mask gather mask over the extract source vectors.
  /// \param ShuffleKinds per-part shuffle kind (nullopt parts are skipped).
  /// \param NumParts number of register-sized parts the mask is split into.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // Widest fixed-width source vector among the extractelements in VL;
    // non-extract scalars and non-fixed source types contribute nothing.
    unsigned NumElts =
        std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(Val: V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(a: Sz, b: VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
    // Checks whether \p Mask can be served by at most two hardware registers.
    // On success rewrites \p Mask in place to be register-local, records the
    // offsets (and sizes) of the subvectors to extract, and returns the
    // single-/two-source shuffle kind; returns std::nullopt if more than two
    // registers would be referenced.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices,
                                        SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // Offset of the first referenced register: smallest used mask index,
      // rounded down to a register boundary.
      int OffsetReg0 =
          alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
                                    binary_op: [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(a: S, b: I);
                                    }),
                    Align: EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(NumElts: 1, Elt: OffsetReg0);
      for (auto [Pos, I] : enumerate(First&: Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(V: RegId);
        // Bail out once a third register shows up.
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            // First time the second register is seen: its offset is the
            // smallest remaining index that falls outside the first register,
            // rounded down to a register boundary.
            OffsetReg1 = alignDown(
                Value: std::accumulate(
                    first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
                    binary_op: [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(a: S, b: I);
                    }),
                Align: EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Elt: Index);
            SubVecSizes.push_back(Elt: std::min(a: NumElts - Index, b: EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        // Rebase the element into [0, 2 * EltsPerVector): elements from the
        // first register map to the low half, from the second to the high.
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source vector
    // operand can be re-used directly. If not, add the cost of creating a
    // shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(Range&: MaskSlice, Out: SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // Cannot keep the shuffle register-local: charge a full-width shuffle
        // unless the slice is a plain identity of a single source.
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
                               Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
        continue;
      }
      // Register-local shuffle: only charge it if the rebased mask is not an
      // identity of a single register.
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, Kind: *RegShuffleKind,
                             Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          TTI: *R.TTI, Ty: VL.front()->getType(), Sz: alignTo(Value: NumElts, Align: EltsPerVector));
      // Charge the extraction of each referenced subvector out of the base
      // vector.
      for (const auto [Idx, SubVecSize] : zip(t&: Indices, u&: SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
                                 Tp: getWidenedType(ScalarTy, VF: BaseVF), Mask: {}, CostKind,
                                 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: SubVecSize));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, Elt: PoisonMaskElem);
      copy(Range&: MaskSlice, Out: SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
  /// mask \p Mask, register number \p Part, that includes \p SliceSize
  /// elements.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before, no
      // need to estimate another cost with the sub-Mask, instead include this
      // sub-Mask into the CommonMask to estimate it later and avoid double cost
      // estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(Val&: InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(Val&: InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(Val&: InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        // Merge this part's sub-mask into the (still poisoned) slot of the
        // common mask; the shuffle itself will be costed later.
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
        copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // and transform mask.
      Cost += createShuffle(P1: InVectors.front(),
                            P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
                            Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else if (InVectors.size() == 2) {
      // Two pending inputs: fold them into a single shuffled value before
      // taking the new node(s) into account.
      Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      // Single new node: its lanes are appended after the pending input's
      // lanes, so offset the mask elements by that input's vector factor.
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = dyn_cast<Value *>(Val&: InVectors.front())) {
        VF = std::max(a: VF, b: getVF(V: V1));
      } else {
        const auto *E = cast<const TreeEntry *>(Val&: InVectors.front());
        VF = std::max(a: VF, b: E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else {
      // Shuffle the two new nodes together first (this replaces
      // InVectors.front() with the combined placeholder), then combine that
      // result with the previously pending input P.
      auto P = InVectors.front();
      Cost += createShuffle(P1: &E1, P2: E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = dyn_cast<Value *>(Val&: P)) {
        VF = std::max(a: VF,
                      b: getNumElements(Ty: V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(Val&: P);
        VF = std::max(a: VF, b: E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P1: P, P2: InVectors.front(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
  }
14777
14778 class ShuffleCostBuilder {
14779 const TargetTransformInfo &TTI;
14780
14781 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14782 int Index = -1;
14783 return Mask.empty() ||
14784 (VF == Mask.size() &&
14785 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
14786 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
14787 Index == 0);
14788 }
14789
14790 public:
14791 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14792 ~ShuffleCostBuilder() = default;
14793 InstructionCost createShuffleVector(Value *V1, Value *,
14794 ArrayRef<int> Mask) const {
14795 // Empty mask or identity mask are free.
14796 unsigned VF =
14797 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14798 if (isEmptyOrIdentity(Mask, VF))
14799 return TTI::TCC_Free;
14800 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
14801 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14802 }
14803 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14804 // Empty mask or identity mask are free.
14805 unsigned VF =
14806 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14807 if (isEmptyOrIdentity(Mask, VF))
14808 return TTI::TCC_Free;
14809 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
14810 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14811 }
14812 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14813 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14814 return TTI::TCC_Free;
14815 }
14816 void resizeToMatch(Value *&, Value *&) const {}
14817 };
14818
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  /// \param P1/P2 each is either a real vector Value or a TreeEntry whose
  /// vectorized value is modeled by a placeholder constant.
  /// \returns the estimated shuffle cost plus any extra cast costs caused by
  /// minimum-bitwidth (MinBWs) mismatches between the nodes and ScalarTy.
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask);
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    // Cost of casting a tree entry's (possibly bitwidth-minimized) element
    // type to ScalarTy; free when no cast is needed or the entry is an
    // all-constant gather.
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(VL: E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
        EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
                                    Src: getWidenedType(ScalarTy: EScalarTy, VF),
                                    CCH: TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    // Same as above, but for a real vector value; signedness is inferred from
    // known non-negativity of the value.
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(Val: V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(Val: V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
            Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
      CommonVF = std::max(a: VF, b: E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        // Equal-sized nodes: compose the nodes' own common masks into
        // CommonMask so the shuffle is expressed over the original scalars.
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      // Distinct placeholder constants stand in for the nodes' vectors.
      V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
      V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask: CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(first: CommonMask.begin(), last: CommonMask.end(), value: 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(Range: enumerate(First&: CommonMask),
                 P: [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          any_of(Range&: CommonMask,
                 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
        SmallVector<int> ReorderMask;
        inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
        ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V: V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V: V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
      CommonVF = std::max(a: VF, b: E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        // Second-source indices are remapped through the node's common mask.
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
      V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
    } else if (!V1 && V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V: V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(Val: P1);
      CommonVF = std::max(a: VF, b: E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
      V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
    } else {
      // Shuffle 2 real vectors.
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V: V1);
      CommonVF = std::max(a: VF, b: getVF(V: V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      // Normalize both operands to placeholders of the common VF/type if
      // their types do not already match ScalarTy.
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
        V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
      } else {
        if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
        if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
      }
    }
    // Replace the pending input with a placeholder of the result width and
    // collapse to a single input.
    InVectors.front() =
        Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, Mask: CommonMask, Builder, ScalarTy);
  }
15035
15036public:
  /// Builds a cost estimator for \p R's vectorizable tree. \p CheckedExtracts
  /// is shared by reference so extractelements credited once in
  /// adjustExtracts are not credited again by later estimators.
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  /// Adjusts the cost model for the extractelements in gather node \p E:
  /// subtracts the cost of extracts that will become dead after
  /// vectorization, optionally adds the cost of re-shuffling the source
  /// vectors (computeExtractCost), and returns the common source vector (or a
  /// placeholder when several distinct bases are used, signalled via
  /// \p UseVecBaseAsInput).
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    // Work on the scalars in their reordered positions, matching Mask.
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(Scalars&: VL, Mask: ReorderMask);
    }
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
        P: [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
    // Per-base-vector set of extracted lanes, used to take scalarization
    // credit once per base below.
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
      for (auto [I, V] :
           enumerate(First: ArrayRef(VL).slice(N: Part * SliceSize, M: Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(Val: V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // instruction as dead and remove its cost from the final cost of the
        // vectorized tree.
        // Also, avoid adjusting the cost for extractelements with multiple uses
        // in different graph entries.
        auto *EE = cast<ExtractElementInst>(Val: V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(Ptr: VecBase);
        ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
        // Skip extracts that must stay alive: already checked, with
        // non-vectorized users, in deleted/gathered nodes, repeated
        // differently in the user node, feeding scalar GEPs, or belonging to
        // another tree entry.
        if (!CheckedExtracts.insert(Ptr: V).second ||
            !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
            any_of(Range&: VEs,
                   P: [&](const TreeEntry *TE) {
                     return R.DeletedNodes.contains(Ptr: TE) ||
                            R.TransformedToGatherNodes.contains(Val: TE);
                   }) ||
            (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
             !R.isVectorized(V: EE) &&
             count_if(Range: E->Scalars, P: [&](Value *V) { return V == EE; }) !=
                 count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
                          P: [&](Value *V) { return V == EE; })) ||
            any_of(Range: EE->users(),
                   P: [&](User *U) {
                     return isa<GetElementPtrInst>(Val: U) &&
                            !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
                                                     VectorizedVals: &VectorizedVals);
                   }) ||
            (!VEs.empty() && !is_contained(Range&: VEs, Element: E)))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Val: Ext) &&
              all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: EE->getVectorOperandType(),
                Index: Idx, CostKind);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
                CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
            continue;
          }
        }
        // Record the lane for a combined per-base scalarization credit.
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(Key: VecBase,
                             Args: APInt::getZero(numBits: getNumElements(Ty: VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(Ty: cast<VectorType>(Val: Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // Found the bunch of extractelement instructions that must be gathered
    // into a vector and can be represented as a permutation elements in a
    // single input vector or of 2 input vectors.
    // Done for reused if same extractelements were vectorized already.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(NumElts: 1, Elt: E);
    CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
    transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      // Multiple source vectors: callers must use the (placeholder) widened
      // vector base as the shuffle input.
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes.
  /// Always returns std::nullopt here: delaying only matters when real code
  /// is emitted, not when only the cost is analyzed.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
15173 /// Reset the builder to handle perfect diamond match.
15174 void resetForSameNode() {
15175 IsFinalized = false;
15176 CommonMask.clear();
15177 InVectors.clear();
15178 Cost = 0;
15179 VectorizedVals.clear();
15180 SameNodesEstimated = true;
15181 }
  /// Adds two tree entries \p E1 and \p E2 shuffled by \p Mask to the
  /// pending inputs, accumulating the shuffle cost when inputs are already
  /// pending.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    // Same node twice degenerates to the single-entry overload.
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    // First inputs: just record them, no cost yet.
    if (InVectors.empty()) {
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(IL: {&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // Locate the register-sized part containing the first used mask element
    // and estimate the permutation for that part.
    auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
    unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
    const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
    unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
    estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
  }
  /// Adds a single tree entry \p E1 shuffled by \p Mask to the pending
  /// inputs, accumulating the shuffle cost when an input is already pending.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    // First input: just record it, no cost yet.
    if (InVectors.empty()) {
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(NumElts: 1, Elt: &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    // Locate the register-sized part containing the first used mask element
    // and estimate the permutation for that part.
    auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
    unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
    const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
    unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
    estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
    // Keep E1 as the second pending input unless the same-nodes path merged
    // the mask instead.
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(Args: &E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  /// Intentionally a no-op except for the sanity check: the cost of these
  /// vectors was already accounted in adjustExtracts.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  /// \param ForExtracts true when the vector comes from extractelements
  /// already handled in adjustExtracts; then only the invariant is checked.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    // First input: just record it, no cost yet.
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(NumElts: 1, Elt: V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V: V1);
    if (InVectors.size() == 2) {
      // Two inputs already pending: fold them into one shuffled value so V1
      // can be appended as the new second input.
      Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      VF = std::max<unsigned>(a: VF, b: CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(a: VF, b: InTE->getVectorFactor());
    } else {
      VF = std::max(
          a: VF, b: cast<FixedVectorType>(Val: cast<Value *>(Val&: InVectors.front())->getType())
                  ->getNumElements());
    }
    InVectors.push_back(Elt: V1);
    // V1's lanes follow the first input's lanes, hence the +VF offset.
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
15286 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
15287 Value *Root = nullptr) {
15288 Cost += getBuildVectorCost(VL, Root);
15289 if (!Root) {
15290 // FIXME: Need to find a way to avoid use of getNullValue here.
15291 SmallVector<Constant *> Vals;
15292 unsigned VF = VL.size();
15293 if (MaskVF != 0)
15294 VF = std::min(a: VF, b: MaskVF);
15295 Type *VLScalarTy = VL.front()->getType();
15296 for (Value *V : VL.take_front(N: VF)) {
15297 Type *ScalarTy = VLScalarTy->getScalarType();
15298 if (isa<PoisonValue>(Val: V)) {
15299 Vals.push_back(Elt: PoisonValue::get(T: ScalarTy));
15300 continue;
15301 }
15302 if (isa<UndefValue>(Val: V)) {
15303 Vals.push_back(Elt: UndefValue::get(T: ScalarTy));
15304 continue;
15305 }
15306 Vals.push_back(Elt: Constant::getNullValue(Ty: ScalarTy));
15307 }
15308 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: VLScalarTy)) {
15309 assert(SLPReVec && "FixedVectorType is not expected.");
15310 // When REVEC is enabled, we need to expand vector types into scalar
15311 // types.
15312 Vals = replicateMask(Val: Vals, VF: VecTy->getNumElements());
15313 }
15314 return ConstantVector::get(V: Vals);
15315 }
15316 return ConstantVector::getSplat(
15317 EC: ElementCount::getFixed(
15318 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
15319 Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy->getScalarType()));
15320 }
15321 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles: flushes any pending action, adds the
  /// costs of inserting the \p SubVectors, applies \p ExtMask on top of the
  /// accumulated common mask and returns the total estimated cost.
  /// \param ExtMask extra mask applied after all other shuffles.
  /// \param SubVectors tree entries to insert as subvectors, with insertion
  ///        indices.
  /// \param SubVectorsMask mask combining subvectors with the main vector.
  /// \param VF expected vector length of the value handed to \p Action.
  /// \param Action callback that materializes the final value; shuffles it
  ///        requests are charged via the provided builder lambda.
  InstructionCost finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &,
                        function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
          Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Collapse the accumulated input vectors into a single value before
      // running the action.
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
      else
        Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Val: Vec);
      Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        Cost += createShuffle(P1: V1, P2: V2, Mask);
        return V1;
      });
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
      else
        Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        // Merge the common mask into the subvectors mask, offsetting its
        // indices by the vector width to form a two-source permute mask.
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(Range&: SubVectorsMask, Out: SVMask.begin());
        for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
                                 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()),
                                 Mask: SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(Val: E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          // The subvector entry may use a minimized (narrower) integer type;
          // charge the cast to the result scalar type.
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF: E->getVectorFactor()),
              Src: getWidenedType(ScalarTy: EScalarTy, VF: E->getVectorFactor()),
              CCH: TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, Kind: TTI::SK_InsertSubvector,
            Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), Mask: {}, CostKind, Index: Idx,
            SubTp: getWidenedType(ScalarTy, VF: E->getVectorFactor()));
        if (!CommonMask.empty()) {
          // Mark the inserted lanes as identity in the common mask.
          std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
                    last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
                    value: Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      // Compose the external mask with the accumulated common mask.
      if (CommonMask.empty()) {
        CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(RHS&: NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    // Charge the final shuffle combining the remaining input vectors.
    return Cost +
           createShuffle(P1: InVectors.front(),
                         P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
                         Mask: CommonMask);
  }
15423
15424 ~ShuffleCostEstimator() {
15425 assert((IsFinalized || CommonMask.empty()) &&
15426 "Shuffle construction must be finalized.");
15427 }
15428};
15429
15430const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
15431 unsigned Idx) const {
15432 TreeEntry *Op = OperandsToTreeEntry.at(Val: {E, Idx});
15433 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
15434 return Op;
15435}
15436
15437TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
15438 if (TE.State == TreeEntry::ScatterVectorize ||
15439 TE.State == TreeEntry::StridedVectorize)
15440 return TTI::CastContextHint::GatherScatter;
15441 if (TE.State == TreeEntry::CompressVectorize)
15442 return TTI::CastContextHint::Masked;
15443 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
15444 !TE.isAltShuffle()) {
15445 if (TE.ReorderIndices.empty())
15446 return TTI::CastContextHint::Normal;
15447 SmallVector<int> Mask;
15448 inversePermutation(Indices: TE.ReorderIndices, Mask);
15449 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
15450 return TTI::CastContextHint::Reversed;
15451 }
15452 return TTI::CastContextHint::None;
15453}
15454
15455/// Get the assumed loop trip count for the loop \p L.
15456static unsigned getLoopTripCount(const Loop *L, ScalarEvolution &SE) {
15457 if (LoopAwareTripCount == 0)
15458 return 1;
15459 unsigned Scale = SE.getSmallConstantTripCount(L);
15460 if (Scale == 0)
15461 Scale = getLoopEstimatedTripCount(L: const_cast<Loop *>(L)).value_or(u: 0);
15462 if (Scale != 0) {
15463 // Multiple exiting blocks - choose the minimum between trip count (scale)
15464 // and LoopAwareTripCount, since the multiple exit loops can be terminated
15465 // early.
15466 if (!L->getExitingBlock())
15467 return std::min<unsigned>(a: LoopAwareTripCount, b: Scale);
15468 return Scale;
15469 }
15470 return LoopAwareTripCount;
15471}
15472
/// Returns the factor by which the cost of entry \p TE (or of extracting
/// \p Scalar for user \p U) is scaled to account for the trip counts of the
/// loop nest it executes in. Results are cached in LoopToScaleFactor.
unsigned BoUpSLP::getScaleToLoopIterations(const TreeEntry &TE, Value *Scalar,
                                           Instruction *U) {
  // Determine the block whose enclosing loop nest drives the scaling: the
  // explicit user's block if given; for gather/split nodes, the block of the
  // first vectorized user up the tree; otherwise the entry's main op block.
  BasicBlock *Parent = nullptr;
  if (U) {
    Parent = U->getParent();
  } else if (TE.isGather() || TE.State == TreeEntry::SplitVectorize) {
    // Walk up through gather/split users until a vectorized user supplies a
    // concrete block (for PHI users, the relevant incoming block).
    EdgeInfo EI = TE.UserTreeIndex;
    while (EI.UserTE) {
      if (EI.UserTE->isGather() ||
          EI.UserTE->State == TreeEntry::SplitVectorize) {
        EI = EI.UserTE->UserTreeIndex;
        continue;
      }
      if (EI.UserTE->State == TreeEntry::Vectorize &&
          EI.UserTE->getOpcode() == Instruction::PHI) {
        auto *PH = cast<PHINode>(Val: EI.UserTE->getMainOp());
        Parent = PH->getIncomingBlock(i: EI.EdgeIdx);
      } else {
        Parent = EI.UserTE->getMainOp()->getParent();
      }
      break;
    }
    // No vectorized user found - no loop context, no scaling.
    if (!Parent)
      return 1;
  } else {
    Parent = TE.getMainOp()->getParent();
  }
  if (const Loop *L = LI->getLoopFor(BB: Parent)) {
    // Fast path: the scale for this loop was computed earlier.
    const auto It = LoopToScaleFactor.find(Val: L);
    if (It != LoopToScaleFactor.end())
      return It->second;
    unsigned Scale = 1;
    if (const Loop *NonInvL = findInnermostNonInvariantLoop(
            L, VL: Scalar ? ArrayRef(Scalar) : ArrayRef(TE.Scalars))) {
      // Multiply trip counts from the innermost loop that the values are not
      // invariant to, outwards up to (but excluding) L. Partial products are
      // cached per loop; a cache hit already includes all outer factors up to
      // L, so the walk stops there.
      Scale = getLoopTripCount(L: NonInvL, SE&: *SE);
      for (const Loop *LN : getLoopNest(L: NonInvL)) {
        if (LN == L)
          break;
        auto LNRes = LoopToScaleFactor.try_emplace(Key: LN, Args: 0);
        auto &LoopScale = LNRes.first->getSecond();
        if (!LNRes.second) {
          Scale *= LoopScale;
          break;
        }
        Scale *= getLoopTripCount(L: LN, SE&: *SE);
        LoopScale = Scale;
      }
    }
    LoopToScaleFactor.try_emplace(Key: L, Args&: Scale);
    return Scale;
  }
  // Not inside any loop: no scaling.
  return 1;
}
15526
/// Estimates the spill/reload cost incurred when the vector register
/// pressure created by node \p E (its operand vectors plus its result of
/// type \p VecTy, resized to \p FinalVecTy if needed) exceeds the number of
/// available registers in the corresponding target register class.
InstructionCost
BoUpSLP::getVectorSpillReloadCost(const TreeEntry *E, VectorType *VecTy,
                                  VectorType *FinalVecTy,
                                  TTI::TargetCostKind CostKind) const {
  InstructionCost SpillsReloads = 0;

  // Estimate vector register pressure per target register class: operand
  // vectors plus the result. The same vector operand is counted once via
  // CountedOpEntries deduplication. PHIs take the max operand pressure across
  // incoming slots (only one predecessor is live at a time) plus the result.
  // All-constant operand bundles are skipped.
  // The node kinds below are modeled as contributing no extra vector
  // register pressure and return zero immediately.
  if (!E->hasState() || E->getOpcode() == Instruction::Store ||
      E->getOpcode() == Instruction::ExtractElement ||
      E->getOpcode() == Instruction::ExtractValue ||
      E->getOpcode() == Instruction::Freeze ||
      (E->getOpcode() == Instruction::Load &&
       E->State != TreeEntry::ScatterVectorize))
    return SpillsReloads;

  const bool IsPHI =
      E->State == TreeEntry::Vectorize && E->getOpcode() == Instruction::PHI;
  SmallPtrSet<const TreeEntry *, 8> CountedOpEntries;
  SmallDenseMap<unsigned, unsigned> PressureByClass;
  // Accumulates Parts registers of class RegClass into the pressure map.
  auto AddPartsToClass = [&](unsigned RegClass, unsigned Parts) {
    assert(Parts != 0 && "Expected non-zero number of parts (registers).");
    PressureByClass[RegClass] += Parts;
  };

  // Vector type of a tree entry, honoring MinBWs bitwidth minimization.
  auto GetEntryVecTy = [&](const TreeEntry *TE) -> VectorType * {
    Type *ScalarTy = getValueType(V: TE->Scalars.front());
    auto BWIt = MinBWs.find(Val: TE);
    if (BWIt != MinBWs.end()) {
      auto *VTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
      ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: BWIt->second.first);
      if (VTy)
        ScalarTy = getWidenedType(ScalarTy, VF: VTy->getNumElements());
    }
    return getWidenedType(ScalarTy, VF: TE->getVectorFactor());
  };

  if (E->State == TreeEntry::SplitVectorize) {
    // Split nodes: operand pressure comes from the combined sub-entries.
    for (const auto &[Idx, _] : E->CombinedEntriesWithIndices) {
      const TreeEntry *OpTE = VectorizableTree[Idx].get();

      if (!CountedOpEntries.insert(Ptr: OpTE).second)
        continue;
      auto *OpVecTy = GetEntryVecTy(OpTE);
      const unsigned Parts = ::getNumberOfParts(TTI: *TTI, VecTy: OpVecTy);
      if (Parts == 0)
        continue;
      const unsigned RC =
          TTI->getRegisterClassForType(/*Vector=*/true, Ty: OpVecTy);
      AddPartsToClass(RC, Parts);
    }
  } else if (IsPHI) {
    // Only one predecessor is live at a time — take the max operand pressure
    // across incoming slots.
    SmallDenseMap<unsigned, unsigned> MaxOpPressureByClass;
    for (unsigned Idx : seq<unsigned>(Size: E->getNumOperands())) {
      const TreeEntry *OpTE = getOperandEntry(E, Idx);
      auto *OpVecTy = GetEntryVecTy(OpTE);
      const unsigned Parts = ::getNumberOfParts(TTI: *TTI, VecTy: OpVecTy);
      if (Parts == 0)
        continue;
      const unsigned RC =
          TTI->getRegisterClassForType(/*Vector=*/true, Ty: OpVecTy);
      MaxOpPressureByClass[RC] = std::max(a: MaxOpPressureByClass[RC], b: Parts);
    }
    for (auto [RC, Parts] : MaxOpPressureByClass)
      AddPartsToClass(RC, Parts);
  } else {
    // Generic case: count each distinct, non-constant, non-splat operand
    // bundle once.
    for (unsigned Idx : seq<unsigned>(Size: E->getNumOperands())) {
      // InsertElement operand 0 is the vector being inserted into, which is
      // built incrementally and does not occupy an extra register.
      if (E->getOpcode() == Instruction::InsertElement && Idx == 0)
        continue;
      ArrayRef<Value *> Ops = E->getOperand(OpIdx: Idx);
      if (Ops.empty() || allConstant(VL: Ops) || isSplat(VL: Ops))
        continue;
      Value *Op = Ops.front();
      if (!Op)
        continue;
      const TreeEntry *OpTE = getOperandEntry(E, Idx);

      if (!CountedOpEntries.insert(Ptr: OpTE).second)
        continue;
      auto *OpVecTy = getWidenedType(ScalarTy: Op->getType(), VF: Ops.size());
      const unsigned Parts = ::getNumberOfParts(TTI: *TTI, VecTy: OpVecTy);
      if (Parts == 0)
        continue;
      const unsigned RC =
          TTI->getRegisterClassForType(/*Vector=*/true, Ty: OpVecTy);
      AddPartsToClass(RC, Parts);
    }
  }

  // Count the result vector, and additionally the resized final vector when
  // it differs from the base vector type. (Loads were filtered above except
  // scatter loads, whose result is excluded here.)
  if (E->getOpcode() != Instruction::Load) {
    const unsigned ResParts = ::getNumberOfParts(TTI: *TTI, VecTy);
    if (ResParts != 0) {
      const unsigned RC = TTI->getRegisterClassForType(/*Vector=*/true, Ty: VecTy);
      AddPartsToClass(RC, ResParts);
    }
    if (VecTy != FinalVecTy) {
      const unsigned FinalResParts = ::getNumberOfParts(TTI: *TTI, VecTy: FinalVecTy);
      if (FinalResParts != 0) {
        const unsigned RC =
            TTI->getRegisterClassForType(/*Vector=*/true, Ty: FinalVecTy);
        AddPartsToClass(RC, FinalResParts);
      }
    }
  }

  // Convert excess pressure into spill/reload cost per register class.
  for (auto [RegClass, UsedRegs] : PressureByClass) {
    const unsigned NumAvailRegs = TTI->getNumberOfRegisters(ClassID: RegClass);
    if (NumAvailRegs == 0 || UsedRegs <= NumAvailRegs)
      continue;
    const unsigned SpillCount = UsedRegs - NumAvailRegs;
    InstructionCost SingleRegSpillReload =
        TTI->getRegisterClassReloadCost(ClassID: RegClass, CostKind);
    // The spill component is omitted only for the root entry (Idx == 0) of a
    // reduction (UserIgnoreList set) whose value is not used (void type,
    // e.g. void calls); every other entry pays spill + reload.
    if (E->Idx > 0 || !UserIgnoreList || !E->Scalars[0]->getType()->isVoidTy())
      SingleRegSpillReload +=
          TTI->getRegisterClassSpillCost(ClassID: RegClass, CostKind);
    SpillsReloads += SingleRegSpillReload * SpillCount;
  }
  return SpillsReloads;
}
15655
15656InstructionCost
15657BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
15658 SmallPtrSetImpl<Value *> &CheckedExtracts) {
15659 ArrayRef<Value *> VL = E->Scalars;
15660
15661 Type *ScalarTy = getValueType(V: VL[0]);
15662 if (!isValidElementType(Ty: ScalarTy))
15663 return InstructionCost::getInvalid();
15664 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15665
15666 // If we have computed a smaller type for the expression, update VecTy so
15667 // that the costs will be accurate.
15668 auto It = MinBWs.find(Val: E);
15669 Type *OrigScalarTy = ScalarTy;
15670 if (It != MinBWs.end()) {
15671 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
15672 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
15673 if (VecTy)
15674 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
15675 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
15676 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
15677 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
15678 }
15679 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
15680 unsigned EntryVF = E->getVectorFactor();
15681 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
15682
15683 const InstructionCost SpillsReloads =
15684 getVectorSpillReloadCost(E, VecTy, FinalVecTy, CostKind);
15685 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
15686 if (allConstant(VL))
15687 return 0;
15688 if (isa<InsertElementInst>(Val: VL[0]))
15689 return InstructionCost::getInvalid();
15690 if (isa<CmpInst>(Val: VL.front()))
15691 ScalarTy = VL.front()->getType();
15692 return SpillsReloads +
15693 processBuildVector<ShuffleCostEstimator, InstructionCost>(
15694 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
15695 }
15696 if (E->State == TreeEntry::SplitVectorize) {
15697 assert(E->CombinedEntriesWithIndices.size() == 2 &&
15698 "Expected exactly 2 combined entries.");
15699 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
15700 InstructionCost VectorCost = 0;
15701 if (E->ReorderIndices.empty()) {
15702 VectorCost = ::getShuffleCost(
15703 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: FinalVecTy, Mask: {}, CostKind,
15704 Index: E->CombinedEntriesWithIndices.back().second,
15705 SubTp: getWidenedType(
15706 ScalarTy,
15707 VF: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15708 ->getVectorFactor()));
15709 } else {
15710 unsigned CommonVF =
15711 std::max(a: VectorizableTree[E->CombinedEntriesWithIndices.front().first]
15712 ->getVectorFactor(),
15713 b: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15714 ->getVectorFactor());
15715 VectorCost = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
15716 Tp: getWidenedType(ScalarTy, VF: CommonVF),
15717 Mask: E->getSplitMask(), CostKind);
15718 }
15719 VectorCost += SpillsReloads;
15720 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
15721 return VectorCost;
15722 }
15723 InstructionCost CommonCost = 0;
15724 SmallVector<int> Mask;
15725 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
15726 (E->State != TreeEntry::StridedVectorize ||
15727 !isReverseOrder(Order: E->ReorderIndices))) {
15728 SmallVector<int> NewMask;
15729 if (E->getOpcode() == Instruction::Store) {
15730 // For stores the order is actually a mask.
15731 NewMask.resize(N: E->ReorderIndices.size());
15732 copy(Range: E->ReorderIndices, Out: NewMask.begin());
15733 } else {
15734 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
15735 }
15736 ::addMask(Mask, SubMask: NewMask);
15737 }
15738 if (!E->ReuseShuffleIndices.empty())
15739 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
15740 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
15741 CommonCost =
15742 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
15743 assert((E->State == TreeEntry::Vectorize ||
15744 E->State == TreeEntry::ScatterVectorize ||
15745 E->State == TreeEntry::StridedVectorize ||
15746 E->State == TreeEntry::CompressVectorize) &&
15747 "Unhandled state");
15748 assert(E->getOpcode() &&
15749 ((allSameType(VL) && allSameBlock(VL)) ||
15750 (E->getOpcode() == Instruction::GetElementPtr &&
15751 E->getMainOp()->getType()->isPointerTy()) ||
15752 E->hasCopyableElements()) &&
15753 "Invalid VL");
15754 Instruction *VL0 = E->getMainOp();
15755 unsigned ShuffleOrOp =
15756 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15757 if (E->CombinedOp != TreeEntry::NotCombinedOp)
15758 ShuffleOrOp = E->CombinedOp;
15759 SmallSetVector<Value *, 16> UniqueValues;
15760 SmallVector<unsigned, 16> UniqueIndexes;
15761 for (auto [Idx, V] : enumerate(First&: VL))
15762 if (UniqueValues.insert(X: V))
15763 UniqueIndexes.push_back(Elt: Idx);
15764 const unsigned Sz = UniqueValues.size();
15765 SmallBitVector UsedScalars(Sz, false);
15766 for (unsigned I = 0; I < Sz; ++I) {
15767 if (isa<Instruction>(Val: UniqueValues[I]) &&
15768 !E->isCopyableElement(V: UniqueValues[I]) &&
15769 getTreeEntries(V: UniqueValues[I]).front() == E)
15770 continue;
15771 UsedScalars.set(I);
15772 }
15773 auto GetCastContextHint = [&](Value *V) {
15774 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
15775 return getCastContextHint(TE: *OpTEs.front());
15776 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
15777 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15778 !SrcState.isAltShuffle())
15779 return TTI::CastContextHint::GatherScatter;
15780 return TTI::CastContextHint::None;
15781 };
15782 auto GetCostDiff =
15783 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
15784 function_ref<InstructionCost(InstructionCost)> VectorCost) {
15785 // Calculate the cost of this instruction.
15786 InstructionCost ScalarCost = 0;
15787 if (isa<CastInst, CallInst>(Val: VL0)) {
15788 // For some of the instructions no need to calculate cost for each
15789 // particular instruction, we can use the cost of the single
15790 // instruction x total number of scalar instructions.
15791 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15792 } else {
15793 for (unsigned I = 0; I < Sz; ++I) {
15794 if (UsedScalars.test(Idx: I))
15795 continue;
15796 ScalarCost += ScalarEltCost(I);
15797 }
15798 }
15799
15800 InstructionCost VecCost = VectorCost(CommonCost);
15801 // Check if the current node must be resized, if the parent node is not
15802 // resized.
15803 if (It != MinBWs.end() && !UnaryInstruction::isCast(Opcode: E->getOpcode()) &&
15804 E->Idx != 0 &&
15805 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
15806 const EdgeInfo &EI = E->UserTreeIndex;
15807 if (!EI.UserTE->hasState() ||
15808 EI.UserTE->getOpcode() != Instruction::Select ||
15809 EI.EdgeIdx != 0) {
15810 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
15811 Type *UserScalarTy =
15812 (EI.UserTE->isGather() ||
15813 EI.UserTE->State == TreeEntry::SplitVectorize)
15814 ? EI.UserTE->Scalars.front()->getType()
15815 : EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
15816 if (UserBWIt != MinBWs.end())
15817 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
15818 NumBits: UserBWIt->second.first);
15819 if (ScalarTy != UserScalarTy) {
15820 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
15821 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
15822 unsigned VecOpcode;
15823 auto *UserVecTy = getWidenedType(ScalarTy: UserScalarTy, VF: E->Scalars.size());
15824 if (BWSz > SrcBWSz)
15825 VecOpcode = Instruction::Trunc;
15826 else
15827 VecOpcode =
15828 It->second.second ? Instruction::SExt : Instruction::ZExt;
15829 TTI::CastContextHint CCH = GetCastContextHint(VL0);
15830 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
15831 CostKind);
15832 }
15833 }
15834 }
15835 VecCost += SpillsReloads;
15836 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
15837 ScalarCost, "Calculated costs for Tree"));
15838 return VecCost - ScalarCost;
15839 };
15840 // Calculate cost difference from vectorizing set of GEPs.
15841 // Negative value means vectorizing is profitable.
15842 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
15843 assert((E->State == TreeEntry::Vectorize ||
15844 E->State == TreeEntry::StridedVectorize ||
15845 E->State == TreeEntry::CompressVectorize) &&
15846 "Entry state expected to be Vectorize, StridedVectorize or "
15847 "MaskedLoadCompressVectorize here.");
15848 InstructionCost ScalarCost = 0;
15849 InstructionCost VecCost = 0;
15850 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
15851 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
15852 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
15853 "Calculated GEPs cost for Tree"));
15854
15855 return VecCost - ScalarCost + SpillsReloads;
15856 };
15857
15858 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
15859 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI ? VI : VL);
15860 if (MinMaxID == Intrinsic::not_intrinsic)
15861 return InstructionCost::getInvalid();
15862 Type *CanonicalType = Ty;
15863 if (CanonicalType->isPtrOrPtrVectorTy())
15864 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
15865 C&: CanonicalType->getContext(),
15866 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
15867
15868 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15869 {CanonicalType, CanonicalType});
15870 InstructionCost IntrinsicCost =
15871 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
15872 // If the selects are the only uses of the compares, they will be
15873 // dead and we can adjust the cost by removing their cost.
15874 if (VI && SelectOnly) {
15875 assert((!Ty->isVectorTy() || SLPReVec) &&
15876 "Expected only for scalar type.");
15877 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
15878 IntrinsicCost -= TTI->getCmpSelInstrCost(
15879 Opcode: CI->getOpcode(), ValTy: Ty, CondTy: Builder.getInt1Ty(), VecPred: CI->getPredicate(),
15880 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
15881 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CI);
15882 }
15883 return IntrinsicCost;
15884 };
15885 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
15886 Instruction *VI) {
15887 InstructionCost Cost = canConvertToFMA(VL: VI, S, DT&: *DT, DL: *DL, TTI, TLI: *TLI);
15888 return Cost;
15889 };
15890 switch (ShuffleOrOp) {
15891 case Instruction::PHI: {
15892 // Count reused scalars.
15893 InstructionCost ScalarCost = 0;
15894 SmallPtrSet<const TreeEntry *, 4> CountedOps;
15895 for (Value *V : UniqueValues) {
15896 auto *PHI = dyn_cast<PHINode>(Val: V);
15897 if (!PHI)
15898 continue;
15899
15900 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
15901 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
15902 Value *Op = PHI->getIncomingValue(i: I);
15903 Operands[I] = Op;
15904 }
15905 if (const TreeEntry *OpTE =
15906 getSameValuesTreeEntry(V: Operands.front(), VL: Operands))
15907 if (CountedOps.insert(Ptr: OpTE).second &&
15908 !OpTE->ReuseShuffleIndices.empty())
15909 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
15910 OpTE->Scalars.size());
15911 }
15912
15913 return CommonCost - ScalarCost + SpillsReloads;
15914 }
15915 case Instruction::ExtractValue:
15916 case Instruction::ExtractElement: {
15917 APInt DemandedElts;
15918 VectorType *SrcVecTy = nullptr;
15919 auto GetScalarCost = [&](unsigned Idx) {
15920 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
15921 return InstructionCost(TTI::TCC_Free);
15922
15923 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
15924 if (!SrcVecTy) {
15925 if (ShuffleOrOp == Instruction::ExtractElement) {
15926 auto *EE = cast<ExtractElementInst>(Val: I);
15927 SrcVecTy = EE->getVectorOperandType();
15928 } else {
15929 auto *EV = cast<ExtractValueInst>(Val: I);
15930 Type *AggregateTy = EV->getAggregateOperand()->getType();
15931 unsigned NumElts;
15932 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
15933 NumElts = ATy->getNumElements();
15934 else
15935 NumElts = AggregateTy->getStructNumElements();
15936 SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
15937 }
15938 }
15939 if (I->hasOneUse()) {
15940 Instruction *Ext = I->user_back();
15941 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
15942 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
15943 // Use getExtractWithExtendCost() to calculate the cost of
15944 // extractelement/ext pair.
15945 InstructionCost Cost = TTI->getExtractWithExtendCost(
15946 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I),
15947 CostKind);
15948 // Subtract the cost of s|zext which is subtracted separately.
15949 Cost -= TTI->getCastInstrCost(
15950 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
15951 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
15952 return Cost;
15953 }
15954 }
15955 if (DemandedElts.isZero())
15956 DemandedElts = APInt::getZero(numBits: getNumElements(Ty: SrcVecTy));
15957 DemandedElts.setBit(*getExtractIndex(E: I));
15958 return InstructionCost(TTI::TCC_Free);
15959 };
15960 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15961 return CommonCost - (DemandedElts.isZero()
15962 ? TTI::TCC_Free
15963 : TTI.getScalarizationOverhead(
15964 Ty: SrcVecTy, DemandedElts, /*Insert=*/false,
15965 /*Extract=*/true, CostKind));
15966 };
15967 return GetCostDiff(GetScalarCost, GetVectorCost);
15968 }
15969 case Instruction::InsertElement: {
15970 assert(E->ReuseShuffleIndices.empty() &&
15971 "Unique insertelements only are expected.");
15972 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
15973 unsigned const NumElts = SrcVecTy->getNumElements();
15974 unsigned const NumScalars = VL.size();
15975
15976 unsigned NumOfParts = ::getNumberOfParts(TTI: *TTI, VecTy: SrcVecTy);
15977
15978 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15979 unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
15980 unsigned OffsetEnd = OffsetBeg;
15981 InsertMask[OffsetBeg] = 0;
15982 for (auto [I, V] : enumerate(First: VL.drop_front())) {
15983 unsigned Idx = *getElementIndex(Inst: V);
15984 if (OffsetBeg > Idx)
15985 OffsetBeg = Idx;
15986 else if (OffsetEnd < Idx)
15987 OffsetEnd = Idx;
15988 InsertMask[Idx] = I + 1;
15989 }
15990 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
15991 if (NumOfParts > 0 && NumOfParts < NumElts)
15992 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
15993 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15994 VecScalarsSz;
15995 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15996 unsigned InsertVecSz = std::min<unsigned>(
15997 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
15998 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15999 bool IsWholeSubvector =
16000 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
16001 // Check if we can safely insert a subvector. If it is not possible, just
16002 // generate a whole-sized vector and shuffle the source vector and the new
16003 // subvector.
16004 if (OffsetBeg + InsertVecSz > VecSz) {
16005 // Align OffsetBeg to generate correct mask.
16006 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
16007 InsertVecSz = VecSz;
16008 }
16009
16010 APInt DemandedElts = APInt::getZero(numBits: NumElts);
16011 // TODO: Add support for Instruction::InsertValue.
16012 SmallVector<int> Mask;
16013 if (!E->ReorderIndices.empty()) {
16014 inversePermutation(Indices: E->ReorderIndices, Mask);
16015 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
16016 } else {
16017 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
16018 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
16019 }
16020 bool IsIdentity = true;
16021 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
16022 Mask.swap(RHS&: PrevMask);
16023 for (unsigned I = 0; I < NumScalars; ++I) {
16024 unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
16025 DemandedElts.setBit(InsertIdx);
16026 IsIdentity &= InsertIdx - OffsetBeg == I;
16027 Mask[InsertIdx - OffsetBeg] = I;
16028 }
16029 assert(Offset < NumElts && "Failed to find vector index offset");
16030
16031 InstructionCost Cost = 0;
16032 Cost -=
16033 getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: SrcVecTy, DemandedElts,
16034 /*Insert*/ true, /*Extract*/ false, CostKind);
16035
16036 // First cost - resize to actual vector size if not identity shuffle or
16037 // need to shift the vector.
16038 // Do not calculate the cost if the actual size is the register size and
16039 // we can merge this shuffle with the following SK_Select.
16040 auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
16041 if (!IsIdentity)
16042 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc,
16043 Tp: InsertVecTy, Mask);
16044 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
16045 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
16046 }));
16047 // Second cost - permutation with subvector, if some elements are from the
16048 // initial vector or inserting a subvector.
16049 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
16050 // subvector of ActualVecTy.
16051 SmallBitVector InMask =
16052 isUndefVector(V: FirstInsert->getOperand(i: 0),
16053 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
16054 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
16055 if (InsertVecSz != VecSz) {
16056 auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
16057 Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy, Mask: {},
16058 CostKind, Index: OffsetBeg - Offset, SubTp: InsertVecTy);
16059 } else {
16060 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
16061 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
16062 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
16063 I <= End; ++I)
16064 if (Mask[I] != PoisonMaskElem)
16065 Mask[I] = I + VecSz;
16066 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
16067 Mask[I] =
16068 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
16069 Cost +=
16070 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
16071 }
16072 }
16073 return Cost + SpillsReloads;
16074 }
16075 case Instruction::ZExt:
16076 case Instruction::SExt:
16077 case Instruction::FPToUI:
16078 case Instruction::FPToSI:
16079 case Instruction::FPExt:
16080 case Instruction::PtrToInt:
16081 case Instruction::IntToPtr:
16082 case Instruction::SIToFP:
16083 case Instruction::UIToFP:
16084 case Instruction::Trunc:
16085 case Instruction::FPTrunc:
16086 case Instruction::BitCast: {
16087 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
16088 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
16089 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
16090 unsigned Opcode = ShuffleOrOp;
16091 unsigned VecOpcode = Opcode;
16092 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
16093 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
16094 // Check if the values are candidates to demote.
16095 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy->getScalarType());
16096 if (SrcIt != MinBWs.end()) {
16097 SrcBWSz = SrcIt->second.first;
16098 unsigned SrcScalarTyNumElements = getNumElements(Ty: SrcScalarTy);
16099 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
16100 SrcVecTy =
16101 getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size() * SrcScalarTyNumElements);
16102 }
16103 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
16104 if (BWSz == SrcBWSz) {
16105 VecOpcode = Instruction::BitCast;
16106 } else if (BWSz < SrcBWSz) {
16107 VecOpcode = Instruction::Trunc;
16108 } else if (It != MinBWs.end()) {
16109 assert(BWSz > SrcBWSz && "Invalid cast!");
16110 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
16111 } else if (SrcIt != MinBWs.end()) {
16112 assert(BWSz > SrcBWSz && "Invalid cast!");
16113 VecOpcode =
16114 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
16115 }
16116 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
16117 !SrcIt->second.second) {
16118 VecOpcode = Instruction::UIToFP;
16119 }
16120 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
16121 assert(Idx == 0 && "Expected 0 index only");
16122 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
16123 Src: VL0->getOperand(i: 0)->getType(),
16124 CCH: TTI::getCastContextHint(I: VL0), CostKind, I: VL0);
16125 };
16126 auto GetVectorCost = [=](InstructionCost CommonCost) {
16127 // Do not count cost here if minimum bitwidth is in effect and it is just
16128 // a bitcast (here it is just a noop).
16129 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
16130 return CommonCost;
16131 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
16132 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
16133
16134 bool IsArithmeticExtendedReduction =
16135 E->Idx == 0 && UserIgnoreList &&
16136 all_of(Range: *UserIgnoreList, P: [](Value *V) {
16137 auto *I = cast<Instruction>(Val: V);
16138 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
16139 Instruction::Mul, Instruction::FMul,
16140 Instruction::And, Instruction::Or,
16141 Instruction::Xor},
16142 Element: I->getOpcode());
16143 });
16144 if (IsArithmeticExtendedReduction &&
16145 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
16146 return CommonCost;
16147 return CommonCost +
16148 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
16149 I: VecOpcode == Opcode ? VI : nullptr);
16150 };
16151 return GetCostDiff(GetScalarCost, GetVectorCost);
16152 }
16153 case Instruction::FCmp:
16154 case Instruction::ICmp:
16155 case Instruction::Select: {
16156 CmpPredicate VecPred, SwappedVecPred;
16157 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
16158 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
16159 match(V: VL0, P: MatchCmp))
16160 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
16161 else
16162 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
16163 ? CmpInst::BAD_FCMP_PREDICATE
16164 : CmpInst::BAD_ICMP_PREDICATE;
16165 auto GetScalarCost = [&](unsigned Idx) {
16166 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16167 return InstructionCost(TTI::TCC_Free);
16168
16169 if (!isa<SelectInst>(Val: UniqueValues[Idx]))
16170 return TTI->getInstructionCost(U: cast<Instruction>(Val: UniqueValues[Idx]),
16171 CostKind);
16172
16173 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
16174 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
16175 ? CmpInst::BAD_FCMP_PREDICATE
16176 : CmpInst::BAD_ICMP_PREDICATE;
16177 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
16178 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
16179 !match(V: VI, P: MatchCmp)) ||
16180 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
16181 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
16182 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
16183 ? CmpInst::BAD_FCMP_PREDICATE
16184 : CmpInst::BAD_ICMP_PREDICATE;
16185
16186 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
16187 Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
16188 CostKind,
16189 Op1Info: getOperandInfo(
16190 Ops: VI->getOperand(i: ShuffleOrOp == Instruction::Select ? 1 : 0)),
16191 Op2Info: getOperandInfo(
16192 Ops: VI->getOperand(i: ShuffleOrOp == Instruction::Select ? 2 : 1)),
16193 I: VI);
16194 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
16195 if (IntrinsicCost.isValid())
16196 ScalarCost = IntrinsicCost;
16197
16198 return ScalarCost;
16199 };
16200 auto GetVectorCost = [&](InstructionCost CommonCost) {
16201 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
16202
16203 InstructionCost VecCost = TTI->getCmpSelInstrCost(
16204 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred, CostKind,
16205 Op1Info: getOperandInfo(
16206 Ops: E->getOperand(OpIdx: ShuffleOrOp == Instruction::Select ? 1 : 0)),
16207 Op2Info: getOperandInfo(
16208 Ops: E->getOperand(OpIdx: ShuffleOrOp == Instruction::Select ? 2 : 1)),
16209 I: VL0);
16210 if (auto *SI = dyn_cast<SelectInst>(Val: VL0)) {
16211 auto *CondType =
16212 getWidenedType(ScalarTy: SI->getCondition()->getType(), VF: VL.size());
16213 unsigned CondNumElements = CondType->getNumElements();
16214 unsigned VecTyNumElements = getNumElements(Ty: VecTy);
16215 assert(VecTyNumElements >= CondNumElements &&
16216 VecTyNumElements % CondNumElements == 0 &&
16217 "Cannot vectorize Instruction::Select");
16218 if (CondNumElements != VecTyNumElements) {
16219 // When the return type is i1 but the source is fixed vector type, we
16220 // need to duplicate the condition value.
16221 VecCost += ::getShuffleCost(
16222 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: CondType,
16223 Mask: createReplicatedMask(ReplicationFactor: VecTyNumElements / CondNumElements,
16224 VF: CondNumElements));
16225 }
16226 }
16227 return VecCost + CommonCost;
16228 };
16229 return GetCostDiff(GetScalarCost, GetVectorCost);
16230 }
16231 case TreeEntry::MinMax: {
16232 auto GetScalarCost = [&](unsigned Idx) {
16233 return GetMinMaxCost(OrigScalarTy);
16234 };
16235 auto GetVectorCost = [&](InstructionCost CommonCost) {
16236 InstructionCost VecCost = GetMinMaxCost(VecTy);
16237 return VecCost + CommonCost;
16238 };
16239 return GetCostDiff(GetScalarCost, GetVectorCost);
16240 }
16241 case TreeEntry::FMulAdd: {
16242 auto GetScalarCost = [&](unsigned Idx) {
16243 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16244 return InstructionCost(TTI::TCC_Free);
16245 return GetFMulAddCost(E->getOperations(),
16246 cast<Instruction>(Val: UniqueValues[Idx]));
16247 };
16248 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16249 FastMathFlags FMF;
16250 FMF.set();
16251 for (Value *V : E->Scalars) {
16252 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: V)) {
16253 FMF &= FPCI->getFastMathFlags();
16254 if (auto *FPCIOp = dyn_cast<FPMathOperator>(Val: FPCI->getOperand(i: 0)))
16255 FMF &= FPCIOp->getFastMathFlags();
16256 }
16257 }
16258 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
16259 {VecTy, VecTy, VecTy}, FMF);
16260 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
16261 return VecCost + CommonCost;
16262 };
16263 return GetCostDiff(GetScalarCost, GetVectorCost);
16264 }
16265 case TreeEntry::ReducedBitcast:
16266 case TreeEntry::ReducedBitcastBSwap: {
16267 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
16268 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16269 return InstructionCost(TTI::TCC_Free);
16270 auto *Shl = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
16271 if (!Shl)
16272 return InstructionCost(TTI::TCC_Free);
16273 InstructionCost ScalarCost = TTI.getInstructionCost(U: Shl, CostKind);
16274 auto *ZExt = dyn_cast<Instruction>(Val: Shl->getOperand(i: 0));
16275 if (!ZExt)
16276 return ScalarCost;
16277 ScalarCost += TTI.getInstructionCost(U: ZExt, CostKind);
16278 return ScalarCost;
16279 };
16280 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16281 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
16282 TTI::CastContextHint CastCtx =
16283 getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
16284 Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
16285 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
16286 InstructionCost BitcastCost = TTI.getCastInstrCost(
16287 Opcode: Instruction::BitCast, Dst: ScalarTy, Src: SrcVecTy, CCH: CastCtx, CostKind);
16288 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
16289 auto *SrcType = IntegerType::getIntNTy(
16290 C&: ScalarTy->getContext(),
16291 N: DL->getTypeSizeInBits(Ty: SrcScalarTy) * EntryVF);
16292 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
16293 InstructionCost IntrinsicCost =
16294 TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
16295 BitcastCost += IntrinsicCost;
16296 if (SrcType != ScalarTy) {
16297 BitcastCost +=
16298 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
16299 CCH: TTI::CastContextHint::None, CostKind);
16300 }
16301 }
16302 return BitcastCost + CommonCost;
16303 };
16304 return GetCostDiff(GetScalarCost, GetVectorCost);
16305 }
16306 case TreeEntry::ReducedBitcastLoads:
16307 case TreeEntry::ReducedBitcastBSwapLoads: {
16308 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
16309 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16310 return InstructionCost(TTI::TCC_Free);
16311 auto *Shl = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
16312 if (!Shl)
16313 return InstructionCost(TTI::TCC_Free);
16314 InstructionCost ScalarCost = TTI.getInstructionCost(U: Shl, CostKind);
16315 auto *ZExt = dyn_cast<Instruction>(Val: Shl->getOperand(i: 0));
16316 if (!ZExt)
16317 return ScalarCost;
16318 ScalarCost += TTI.getInstructionCost(U: ZExt, CostKind);
16319 auto *Load = dyn_cast<Instruction>(Val: ZExt->getOperand(i: 0));
16320 if (!Load)
16321 return ScalarCost;
16322 ScalarCost += TTI.getInstructionCost(U: Load, CostKind);
16323 return ScalarCost;
16324 };
16325 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16326 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
16327 const TreeEntry *LoadTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
16328 auto *LI0 = cast<LoadInst>(Val: LoadTE->getMainOp());
16329 auto *SrcType = IntegerType::getIntNTy(
16330 C&: ScalarTy->getContext(),
16331 N: DL->getTypeSizeInBits(Ty: LI0->getType()) * EntryVF);
16332 InstructionCost LoadCost =
16333 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI0->getAlign(),
16334 AddressSpace: LI0->getPointerAddressSpace(), CostKind);
16335 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
16336 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
16337 InstructionCost IntrinsicCost =
16338 TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
16339 LoadCost += IntrinsicCost;
16340 if (SrcType != ScalarTy) {
16341 LoadCost +=
16342 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
16343 CCH: TTI::CastContextHint::None, CostKind);
16344 }
16345 }
16346 return LoadCost + CommonCost;
16347 };
16348 return GetCostDiff(GetScalarCost, GetVectorCost);
16349 }
16350 case TreeEntry::ReducedCmpBitcast: {
16351 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
16352 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16353 return InstructionCost(TTI::TCC_Free);
16354 auto *Sel = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
16355 if (!Sel)
16356 return InstructionCost(TTI::TCC_Free);
16357 InstructionCost ScalarCost = TTI.getInstructionCost(U: Sel, CostKind);
16358 return ScalarCost;
16359 };
16360 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
16361 Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: VecTy);
16362 auto *DstTy =
16363 IntegerType::getIntNTy(C&: ScalarTy->getContext(), N: E->getVectorFactor());
16364 InstructionCost BitcastCost =
16365 TTI.getCastInstrCost(Opcode: Instruction::BitCast, Dst: DstTy, Src: CmpTy,
16366 CCH: TTI::CastContextHint::None, CostKind);
16367 if (DstTy != ScalarTy) {
16368 BitcastCost +=
16369 TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: DstTy,
16370 CCH: TTI::CastContextHint::None, CostKind);
16371 }
16372 return BitcastCost + CommonCost;
16373 };
16374 return GetCostDiff(GetScalarCost, GetVectorCost);
16375 }
16376 case Instruction::FNeg:
16377 case Instruction::Add:
16378 case Instruction::FAdd:
16379 case Instruction::Sub:
16380 case Instruction::FSub:
16381 case Instruction::Mul:
16382 case Instruction::FMul:
16383 case Instruction::UDiv:
16384 case Instruction::SDiv:
16385 case Instruction::FDiv:
16386 case Instruction::URem:
16387 case Instruction::SRem:
16388 case Instruction::FRem:
16389 case Instruction::Shl:
16390 case Instruction::LShr:
16391 case Instruction::AShr:
16392 case Instruction::And:
16393 case Instruction::Or:
16394 case Instruction::Xor: {
16395 auto GetScalarCost = [&](unsigned Idx) {
16396 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16397 return InstructionCost(TTI::TCC_Free);
16398
16399 // We cannot retrieve the operand from UniqueValues[Idx] because an
16400 // interchangeable instruction may be used. The order and the actual
16401 // operand might differ from what is retrieved from UniqueValues[Idx].
16402 unsigned Lane = UniqueIndexes[Idx];
16403 Value *Op1 = E->getOperand(OpIdx: 0)[Lane];
16404 Value *Op2;
16405 SmallVector<const Value *, 2> Operands(1, Op1);
16406 if (isa<UnaryOperator>(Val: UniqueValues[Idx])) {
16407 Op2 = Op1;
16408 } else {
16409 Op2 = E->getOperand(OpIdx: 1)[Lane];
16410 Operands.push_back(Elt: Op2);
16411 }
16412 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: Op1);
16413 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(V: Op2);
16414 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
16415 Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands);
16416 if (auto *I = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
16417 I && (ShuffleOrOp == Instruction::FAdd ||
16418 ShuffleOrOp == Instruction::FSub)) {
16419 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
16420 if (IntrinsicCost.isValid())
16421 ScalarCost = IntrinsicCost;
16422 }
16423 return ScalarCost;
16424 };
16425 auto GetVectorCost = [=](InstructionCost CommonCost) {
16426 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
16427 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
16428 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
16429 if (all_of(Range&: Ops, P: [&](Value *Op) {
16430 auto *CI = dyn_cast<ConstantInt>(Val: Op);
16431 return CI && CI->getValue().countr_one() >= It->second.first;
16432 }))
16433 return CommonCost;
16434 }
16435 }
16436 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
16437 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
16438 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
16439 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
16440 Opd2Info: Op2Info, Args: {}, CxtI: nullptr, TLibInfo: TLI) +
16441 CommonCost;
16442 };
16443 return GetCostDiff(GetScalarCost, GetVectorCost);
16444 }
16445 case Instruction::GetElementPtr: {
16446 return CommonCost + GetGEPCostDiff(VL, VL0);
16447 }
16448 case Instruction::Load: {
16449 auto GetScalarCost = [&](unsigned Idx) {
16450 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
16451 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
16452 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
16453 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
16454 };
16455 auto *LI0 = cast<LoadInst>(Val: VL0);
16456 auto GetVectorCost = [&](InstructionCost CommonCost) {
16457 InstructionCost VecLdCost;
16458 switch (E->State) {
16459 case TreeEntry::Vectorize:
16460 if (unsigned Factor = E->getInterleaveFactor()) {
16461 VecLdCost = TTI->getInterleavedMemoryOpCost(
16462 Opcode: Instruction::Load, VecTy, Factor, Indices: {}, Alignment: LI0->getAlign(),
16463 AddressSpace: LI0->getPointerAddressSpace(), CostKind);
16464
16465 } else {
16466 VecLdCost = TTI->getMemoryOpCost(
16467 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
16468 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
16469 }
16470 break;
16471 case TreeEntry::StridedVectorize: {
16472 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
16473 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
16474 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
16475 Align CommonAlignment =
16476 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
16477 VecLdCost = TTI->getMemIntrinsicInstrCost(
16478 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
16479 StridedLoadTy, LI0->getPointerOperand(),
16480 /*VariableMask=*/false, CommonAlignment),
16481 CostKind);
16482 if (StridedLoadTy != VecTy)
16483 VecLdCost +=
16484 TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: VecTy, Src: StridedLoadTy,
16485 CCH: getCastContextHint(TE: *E), CostKind);
16486
16487 break;
16488 }
16489 case TreeEntry::CompressVectorize: {
16490 bool IsMasked;
16491 unsigned InterleaveFactor;
16492 SmallVector<int> CompressMask;
16493 VectorType *LoadVecTy;
16494 SmallVector<Value *> Scalars(VL);
16495 if (!E->ReorderIndices.empty()) {
16496 SmallVector<int> Mask(E->ReorderIndices.begin(),
16497 E->ReorderIndices.end());
16498 reorderScalars(Scalars, Mask);
16499 }
16500 SmallVector<Value *> PointerOps(Scalars.size());
16501 for (auto [I, V] : enumerate(First&: Scalars))
16502 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
16503 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
16504 VL: Scalars, PointerOps, Order: E->ReorderIndices, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
16505 TLI: *TLI, AreAllUsersVectorized: [](Value *) { return true; }, IsMasked, InterleaveFactor,
16506 CompressMask, LoadVecTy);
16507 assert(IsVectorized && "Failed to vectorize load");
16508 CompressEntryToData.try_emplace(Key: E, Args&: CompressMask, Args&: LoadVecTy,
16509 Args&: InterleaveFactor, Args&: IsMasked);
16510 Align CommonAlignment = LI0->getAlign();
16511 if (InterleaveFactor) {
16512 VecLdCost = TTI->getInterleavedMemoryOpCost(
16513 Opcode: Instruction::Load, VecTy: LoadVecTy, Factor: InterleaveFactor, Indices: {},
16514 Alignment: CommonAlignment, AddressSpace: LI0->getPointerAddressSpace(), CostKind);
16515 } else if (IsMasked) {
16516 VecLdCost = TTI->getMemIntrinsicInstrCost(
16517 MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
16518 CommonAlignment,
16519 LI0->getPointerAddressSpace()),
16520 CostKind);
16521 // TODO: include this cost into CommonCost.
16522 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
16523 Tp: LoadVecTy, Mask: CompressMask, CostKind);
16524 } else {
16525 VecLdCost = TTI->getMemoryOpCost(
16526 Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
16527 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
16528 // TODO: include this cost into CommonCost.
16529 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
16530 Tp: LoadVecTy, Mask: CompressMask, CostKind);
16531 }
16532 break;
16533 }
16534 case TreeEntry::ScatterVectorize: {
16535 Align CommonAlignment =
16536 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
16537 VecLdCost = TTI->getMemIntrinsicInstrCost(
16538 MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
16539 LI0->getPointerOperand(),
16540 /*VariableMask=*/false, CommonAlignment),
16541 CostKind);
16542 break;
16543 }
16544 case TreeEntry::CombinedVectorize:
16545 case TreeEntry::SplitVectorize:
16546 case TreeEntry::NeedToGather:
16547 llvm_unreachable("Unexpected vectorization state.");
16548 }
16549 return VecLdCost + CommonCost;
16550 };
16551
16552 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
16553 // If this node generates masked gather load then it is not a terminal node.
16554 // Hence address operand cost is estimated separately.
16555 if (E->State == TreeEntry::ScatterVectorize)
16556 return Cost;
16557
16558 // Estimate cost of GEPs since this tree node is a terminator.
16559 SmallVector<Value *> PointerOps(VL.size());
16560 for (auto [I, V] : enumerate(First&: VL))
16561 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
16562 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
16563 }
16564 case Instruction::Store: {
16565 bool IsReorder = !E->ReorderIndices.empty();
16566 auto GetScalarCost = [=](unsigned Idx) {
16567 auto *VI = cast<StoreInst>(Val: VL[Idx]);
16568 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
16569 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
16570 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
16571 CostKind, OpdInfo: OpInfo, I: VI);
16572 };
16573 auto *BaseSI =
16574 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
16575 auto GetVectorCost = [=](InstructionCost CommonCost) {
16576 // We know that we can merge the stores. Calculate the cost.
16577 InstructionCost VecStCost;
16578 if (E->State == TreeEntry::StridedVectorize) {
16579 Align CommonAlignment =
16580 computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
16581 VecStCost = TTI->getMemIntrinsicInstrCost(
16582 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
16583 VecTy, BaseSI->getPointerOperand(),
16584 /*VariableMask=*/false, CommonAlignment),
16585 CostKind);
16586 } else {
16587 assert(E->State == TreeEntry::Vectorize &&
16588 "Expected either strided or consecutive stores.");
16589 if (unsigned Factor = E->getInterleaveFactor()) {
16590 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
16591 "No reused shuffles expected");
16592 CommonCost = 0;
16593 VecStCost = TTI->getInterleavedMemoryOpCost(
16594 Opcode: Instruction::Store, VecTy, Factor, Indices: {}, Alignment: BaseSI->getAlign(),
16595 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind);
16596 } else {
16597 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
16598 VecStCost = TTI->getMemoryOpCost(
16599 Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
16600 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
16601 }
16602 }
16603 return VecStCost + CommonCost;
16604 };
16605 SmallVector<Value *> PointerOps(VL.size());
16606 for (auto [I, V] : enumerate(First&: VL)) {
16607 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
16608 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
16609 }
16610
16611 return GetCostDiff(GetScalarCost, GetVectorCost) +
16612 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
16613 }
16614 case Instruction::Call: {
16615 auto GetScalarCost = [&](unsigned Idx) {
16616 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
16617 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
16618 if (ID != Intrinsic::not_intrinsic) {
16619 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
16620 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
16621 }
16622 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
16623 RetTy: CI->getFunctionType()->getReturnType(),
16624 Tys: CI->getFunctionType()->params(), CostKind);
16625 };
16626 auto GetVectorCost = [=](InstructionCost CommonCost) {
16627 auto *CI = cast<CallInst>(Val: VL0);
16628 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
16629 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
16630 CI, ID, VF: VecTy->getNumElements(),
16631 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
16632 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16633 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
16634 };
16635 return GetCostDiff(GetScalarCost, GetVectorCost);
16636 }
16637 case Instruction::ShuffleVector: {
16638 if (!SLPReVec || E->isAltShuffle())
16639 assert(E->isAltShuffle() &&
16640 ((Instruction::isBinaryOp(E->getOpcode()) &&
16641 Instruction::isBinaryOp(E->getAltOpcode())) ||
16642 (Instruction::isCast(E->getOpcode()) &&
16643 Instruction::isCast(E->getAltOpcode())) ||
16644 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16645 "Invalid Shuffle Vector Operand");
16646 // Try to find the previous shuffle node with the same operands and same
16647 // main/alternate ops.
16648 auto TryFindNodeWithEqualOperands = [=]() {
16649 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16650 if (TE.get() == E)
16651 break;
16652 if (TE->hasState() && TE->isAltShuffle() &&
16653 ((TE->getOpcode() == E->getOpcode() &&
16654 TE->getAltOpcode() == E->getAltOpcode()) ||
16655 (TE->getOpcode() == E->getAltOpcode() &&
16656 TE->getAltOpcode() == E->getOpcode())) &&
16657 TE->hasEqualOperands(TE: *E))
16658 return true;
16659 }
16660 return false;
16661 };
16662 auto GetScalarCost = [&](unsigned Idx) {
16663 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16664 return InstructionCost(TTI::TCC_Free);
16665
16666 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
16667 assert(E->getMatchingMainOpOrAltOp(VI) &&
16668 "Unexpected main/alternate opcode");
16669 (void)E;
16670 return TTI->getInstructionCost(U: VI, CostKind);
16671 };
16672 // Need to clear CommonCost since the final shuffle cost is included into
16673 // vector cost.
16674 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
16675 // VecCost is equal to sum of the cost of creating 2 vectors
16676 // and the cost of creating shuffle.
16677 InstructionCost VecCost = 0;
16678 if (TryFindNodeWithEqualOperands()) {
16679 LLVM_DEBUG({
16680 dbgs() << "SLP: diamond match for alternate node found.\n";
16681 E->dump();
16682 });
16683 // No need to add new vector costs here since we're going to reuse
16684 // same main/alternate vector ops, just do different shuffling.
16685 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
16686 VecCost =
16687 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
16688 VecCost +=
16689 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
16690 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
16691 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
16692 VecCost = TTIRef.getCmpSelInstrCost(
16693 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred: CI0->getPredicate(), CostKind,
16694 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
16695 I: VL0);
16696 VecCost += TTIRef.getCmpSelInstrCost(
16697 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
16698 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
16699 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
16700 I: E->getAltOp());
16701 } else {
16702 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
16703 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
16704 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
16705 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
16706 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
16707 unsigned SrcBWSz =
16708 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
16709 if (SrcIt != MinBWs.end()) {
16710 SrcBWSz = SrcIt->second.first;
16711 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
16712 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
16713 }
16714 if (BWSz <= SrcBWSz) {
16715 if (BWSz < SrcBWSz)
16716 VecCost =
16717 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
16718 CCH: TTI::CastContextHint::None, CostKind);
16719 LLVM_DEBUG({
16720 dbgs()
16721 << "SLP: alternate extension, which should be truncated.\n";
16722 E->dump();
16723 });
16724 return VecCost;
16725 }
16726 }
16727 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
16728 CCH: TTI::CastContextHint::None, CostKind);
16729 VecCost +=
16730 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
16731 CCH: TTI::CastContextHint::None, CostKind);
16732 }
16733 SmallVector<int> Mask;
16734 E->buildAltOpShuffleMask(
16735 IsAltOp: [&](Instruction *I) {
16736 assert(E->getMatchingMainOpOrAltOp(I) &&
16737 "Unexpected main/alternate opcode");
16738 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
16739 TLI: *TLI);
16740 },
16741 Mask);
16742 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
16743 Tp: FinalVecTy, Mask, CostKind);
16744 // Patterns like [fadd,fsub] can be combined into a single instruction
16745 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
16746 // need to take into account their order when looking for the most used
16747 // order.
16748 unsigned Opcode0 = E->getOpcode();
16749 unsigned Opcode1 = E->getAltOpcode();
16750 SmallBitVector OpcodeMask(
16751 getAltInstrMask(VL: E->Scalars, ScalarTy, Opcode0, Opcode1));
16752 // If this pattern is supported by the target then we consider the
16753 // order.
16754 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
16755 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
16756 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
16757 return AltVecCost < VecCost ? AltVecCost : VecCost;
16758 }
16759 // TODO: Check the reverse order too.
16760 return VecCost;
16761 };
16762 if (SLPReVec && !E->isAltShuffle())
16763 return GetCostDiff(
16764 GetScalarCost, [&](InstructionCost) -> InstructionCost {
16765 // If a group uses mask in order, the shufflevector can be
16766 // eliminated by instcombine. Then the cost is 0.
16767 assert(isa<ShuffleVectorInst>(VL.front()) &&
16768 "Not supported shufflevector usage.");
16769 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
16770 unsigned SVNumElements =
16771 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())
16772 ->getNumElements();
16773 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
16774 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
16775 ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
16776 int NextIndex = 0;
16777 if (!all_of(Range&: Group, P: [&](Value *V) {
16778 assert(isa<ShuffleVectorInst>(V) &&
16779 "Not supported shufflevector usage.");
16780 auto *SV = cast<ShuffleVectorInst>(Val: V);
16781 int Index;
16782 [[maybe_unused]] bool IsExtractSubvectorMask =
16783 SV->isExtractSubvectorMask(Index);
16784 assert(IsExtractSubvectorMask &&
16785 "Not supported shufflevector usage.");
16786 if (NextIndex != Index)
16787 return false;
16788 NextIndex += SV->getShuffleMask().size();
16789 return true;
16790 }))
16791 return ::getShuffleCost(
16792 TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy,
16793 Mask: calculateShufflevectorMask(VL: E->Scalars));
16794 }
16795 return TTI::TCC_Free;
16796 });
16797 return GetCostDiff(GetScalarCost, GetVectorCost);
16798 }
16799 case Instruction::Freeze:
16800 return CommonCost;
16801 default:
16802 llvm_unreachable("Unknown instruction");
16803 }
16804}
16805
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  // A gather node counts as cheap-to-build if it holds no ephemeral values
  // and is either: all constants, a splat, smaller than \p Limit, a shuffle
  // formed purely of extractelements, a non-alternate load node, or it
  // contains loads.
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(Range: TE->Scalars,
                   P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
           (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(VL: TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(Range: TE->Scalars, P: IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have less scalar operands rather than
  // the initial tree element (may be profitable to shuffle the second gather)
  // or they are extractelements, which form shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
16859
// Returns true if the current graph is considered too small / unprofitable to
// vectorize and cannot be proven fully vectorizable. Applies a series of
// ordered heuristics that reject graphs made up mostly of gathers, PHI nodes
// and buildvectors.
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  if (!DebugCounter::shouldExecute(Counter&: VectorizedGraphs))
    return true;

  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

    return true;
  }

  // A lone gather node with extractelement opcode - skip (not for reductions).
  if (VectorizableTree.size() == 1 && !ForReduction &&
      VectorizableTree.front()->isGather() &&
      VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::ExtractElement)
    return true;
  // A lone gather node that still contains instructions - skip.
  if (VectorizableTree.size() == 1 && !ForReduction &&
      VectorizableTree.front()->isGather() &&
      any_of(Range&: VectorizableTree.front()->Scalars, P: IsaPred<Instruction>))
    return true;
  // Small graphs consisting purely of gather/split nodes - skip.
  if (VectorizableTree.size() <= MinTreeSize && !ForReduction &&
      all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
      }))
    return true;
  // With a negative cost threshold, skip a lone extractelement node when its
  // vector factor is 2 or when no instruction scalar has all users vectorized.
  if (VectorizableTree.size() == 1 && !ForReduction && SLPCostThreshold < 0 &&
      VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::ExtractElement &&
      (VectorizableTree.front()->getVectorFactor() == 2 ||
       all_of(
           Range&: VectorizableTree.front()->Scalars,
           P: [&](Value *V) {
             auto *I = dyn_cast<Instruction>(Val: V);
             return !I || !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList);
           })))
    return true;
  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VL: VectorizableTree[1]->Scalars) ||
         allConstant(VL: VectorizableTree[1]->Scalars))))
    return true;

  // The tree with only 3 nodes, where 2 last are gathers/buildvectors, not
  // profitable for vectorization.
  constexpr int Limit = 4;
  if (VectorizableTree.size() == 3 && SLPCostThreshold == 0 &&
      (!ForReduction || VectorizableTree.front()->getVectorFactor() <= 2) &&
      all_of(Range: ArrayRef(VectorizableTree).drop_front(),
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               return TE->isGather() && TE->getVectorFactor() <= Limit &&
                      !all_of(
                          Range&: TE->Scalars,
                          P: IsaPred<ExtractElementInst, UndefValue, Constant>);
             }))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold is
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Do not vectorize small tree of phis only, if all vector phis are also
  // gathered.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() <= Limit &&
      all_of(Range: VectorizableTree,
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <=
                           Limit) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(Range&: TE->Scalars, P: [&](Value *V) {
                           return isa<PoisonValue>(Val: V) || MustGather.contains(Ptr: V);
                         }))));
             }) &&
      any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // PHI nodes only and gathers cannot be vectorized, skip.
  constexpr unsigned LargeTree = 20;
  bool HasSingleLoad = false;
  if (!ForReduction && SLPCostThreshold >= 0 &&
      all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        // Track whether a (single) vectorized load-like node was seen before
        // the current entry.
        bool PrevLoad = HasSingleLoad;
        HasSingleLoad |=
            TE->hasState() && !TE->isGather() &&
            (TE->getOpcode() == Instruction::Load ||
             TE->hasCopyableElements()) &&
            (TE->getVectorFactor() > 2 || TE->ReorderIndices.empty());
        return (TE->hasState() &&
                (TE->getOpcode() == Instruction::PHI ||
                 (VectorizableTree.size() >= LargeTree &&
                  (TE->getOpcode() == Instruction::Store ||
                   (TE->getOpcode() == Instruction::Load && !PrevLoad)) &&
                  TE->getVectorFactor() <= Limit))) ||
               (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement));
      }))
    return true;

  // Single non-phi vector node - skip the tree.
  bool VectorNodeFound = false;
  bool AnyNonConst = false;
  if (!ForReduction && SLPCostThreshold >= 0 && VectorizableTree.size() >= 5 &&
      VectorizableTree.front()->getVectorFactor() <= 2 &&
      VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() &&
      all_of(Range: VectorizableTree,
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               if (TE->State == TreeEntry::Vectorize && TE->hasState()) {
                 if (TE->hasState() && (TE->getOpcode() == Instruction::PHI ||
                                        !TE->ReorderIndices.empty()))
                   return true;
                 // Allow at most one non-PHI, non-reordered vector node.
                 bool PrevVectorNodeFound = VectorNodeFound;
                 VectorNodeFound = true;
                 return !PrevVectorNodeFound;
               }
               AnyNonConst |= !allConstant(VL: TE->Scalars);
               return TE->isGather() || TE->State == TreeEntry::SplitVectorize;
             }) &&
      AnyNonConst)
    return true;

  // If the tree contains only phis, buildvectors, split nodes and
  // small nodes with reuses, we can skip it.
  SmallVector<const TreeEntry *> StoreLoadNodes;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      all_of(Range: VectorizableTree,
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               // Collect vectorized load/store nodes for the check below.
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(Elt: TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->isGather() &&
                       none_of(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>)) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         static_cast<unsigned>(count_if(
                             Range&: TE->Scalars, P: IsaPred<PHINode, Constant>)) >=
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(Range&: StoreLoadNodes, P: [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(Range: TE->Scalars, P: [&](Value *V) {
                    return !isa<LoadInst>(Val: V) ||
                           areAllUsersVectorized(I: cast<Instruction>(Val: V));
                  });
         })))))
    return true;

  // If the tree contains only buildvector, 2 non-buildvectors (with root user
  // tree node) and other buildvectors, we can skip it.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      count_if(Range: ArrayRef(VectorizableTree).drop_front(),
               P: [&](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                        TE->UserTreeIndex.UserTE->Idx == 0;
               }) == 2)
    return true;

  // If the tree contains only vectorization of the phi node from the
  // buildvector - skip it.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(
          Range: ArrayRef(VectorizableTree).drop_front(N: 2),
          P: [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather node forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VL: VectorizableTree.front()->Scalars));
  if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(Val: V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(N: UsesLimit) &&
                         any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
               });
      }))
    return false;

  // Vectorize if the last gather is an alternate-opcode node (same block,
  // scalar element type) whose buildvector (insertion) cost already exceeds
  // the cost threshold.
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VL: VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          Ty: getWidenedType(ScalarTy: VectorizableTree.back()->Scalars.front()->getType(),
                          VF: VectorizableTree.back()->getVectorFactor()),
          DemandedElts: APInt::getAllOnes(numBits: VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          CostKind: TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return false;

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
17118
// Returns true if the graph cannot be extended any further (no gather node is
// a candidate for future extension, e.g. by vectorizing gathered loads).
bool BoUpSLP::isTreeNotExtendable() const {
  // Non-canonical graph (extra nodes beyond the canonical size): only a
  // non-power-of-2 root with a single gathered-loads node from different
  // blocks in a small graph counts as non-extendable.
  if (getCanonicalGraphSize() != getTreeSize()) {
    constexpr unsigned SmallTree = 3;
    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
                 P: [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(VL: TE->Scalars);
                 }) == 1)
      return true;
    return false;
  }
  bool Res = false;
  for (unsigned Idx : seq<unsigned>(Size: getTreeSize())) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.State == TreeEntry::SplitVectorize)
      return false;
    if (!E.isGather())
      continue;
    // Gathers of non-loads, extract/load mixes, or extractelement-led
    // sequences with a matching opcode may still be extended.
    if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
        (!E.hasState() &&
         all_of(Range&: E.Scalars, P: IsaPred<ExtractElementInst, LoadInst>)) ||
        (isa<ExtractElementInst>(Val: E.Scalars.front()) &&
         getSameOpcode(VL: ArrayRef(E.Scalars).drop_front(), TLI: *TLI).valid()))
      return false;
    // Splats and constant gathers do not block extension on their own.
    if (isSplat(VL: E.Scalars) || allConstant(VL: E.Scalars))
      continue;
    Res = true;
  }
  return Res;
}
17152
InstructionCost BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).

  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return 0;

  InstructionCost Cost = 0;
  // Maps each user entry to its operand entries, and each vectorized entry to
  // the last instruction of its bundle.
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  // Combined (reduced bitcast/cmp) nodes do not materialize a vector value of
  // their own and are excluded from the spill accounting.
  SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
        TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
      ScalarOrPseudoEntries.insert(Ptr: TEPtr.get());
      continue;
    }
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(E: TEPtr.get());
      EntriesToLastInstruction.try_emplace(Key: TEPtr.get(), Args&: LastInst);
      LastInstructions.insert(Ptr: LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(Elt: TEPtr.get());
  }

  // Returns true if \p I is an intrinsic that is either assume-like or
  // cheaper when kept as an intrinsic than lowered to a real call, i.e. it
  // does not force spills.
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(Val: I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        F: nullptr, RetTy: II->getType(), Tys: ICA.getArgTypes(), CostKind: TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };

  // Maps last instruction in the entry to the last instruction for the one of
  // operand entries and the flag. If the flag is true, there are no calls in
  // between these instructions.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  // Returns true if no non-vectorized call lies between \p First and \p Last
  // (both in the same block) within the scan budget; memoizes results in
  // CheckedInstructions.
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                           const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Val: Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(Other: First))
        return It->second.getInt() != 0;
      // Resume scanning from the already-checked boundary.
      Last = Checked;
    } else if (Last == First || Last->comesBefore(Other: First)) {
      return true;
    }
    BasicBlock::const_reverse_iterator InstIt =
                                           ++First->getIterator().getReverse(),
                                       PrevInstIt =
                                           Last->getIterator().getReverse();
    SmallVector<const Instruction *> LastInstsInRange;
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Debug information does not impact spill cost.
      // Vectorized calls, represented as vector intrinsics, do not impact spill
      // cost.
      if (const auto *CB = dyn_cast<CallBase>(Val: &*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(V: CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(Key: LastInst, Args: &*PrevInstIt, Args: 0);
        return false;
      }
      if (LastInstructions.contains(Ptr: &*PrevInstIt))
        LastInstsInRange.push_back(Elt: &*PrevInstIt);

      ++PrevInstIt;
      ++Budget;
    }
    // Record the call-free (or budget-exhausted) range for later queries.
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          Key: LastInst, Args: PrevInstIt == InstIt ? First : &*PrevInstIt,
          Args: Budget <= BudgetLimit ? 1 : 0);
    return Budget <= BudgetLimit;
  };
  // Adds the cost of keeping the vectorized value of \p Op live across a call.
  auto AddCosts = [&](const TreeEntry *Op) {
    if (ScalarOrPseudoEntries.contains(Ptr: Op))
      return;
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Val: Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, VF: Op->getVectorFactor());
    unsigned Scale = getScaleToLoopIterations(TE: *Op);
    InstructionCost KeepLiveCost = TTI->getCostOfKeepingLiveOverCall(Tys: VecTy);
    KeepLiveCost *= Scale;
    Cost += KeepLiveCost;
    if (ScalarTy->isVectorTy()) {
      // Handle revec dead vector instructions.
      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(Tys: ScalarTy) *
              Scale;
    }
  };
  // Memoize the relationship between blocks, i.e. if there is (at least one)
  // non-vectorized call between the blocks. This allows to skip the analysis of
  // the same block paths multiple times.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  // Returns true if all predecessor paths from \p Root (or from \p Pred, if
  // given) back to \p OpParent are free of non-vectorized calls.
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(x&: Root, y&: OpParent);
    if (auto It = ParentOpParentToPreds.find(Val: Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    if (Pred)
      Worklist.push_back(Elt: Pred);
    else
      Worklist.append(in_start: pred_begin(BB: Root), in_end: pred_end(BB: Root));
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    // Record all visited block pairs with the final result on exit.
    llvm::scope_exit Cleanup([&]() {
      for (const auto &KeyPair : ParentsPairsToAdd) {
        assert(!ParentOpParentToPreds.contains(KeyPair) &&
               "Should not have been added before.");
        ParentOpParentToPreds.try_emplace(Key: KeyPair, Args&: Res);
      }
    });
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(Ptr: BB).second)
        continue;
      auto Pair = std::make_pair(x&: BB, y&: OpParent);
      if (auto It = ParentOpParentToPreds.find(Val: Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        return Res;
      }
      ParentsPairsToAdd.insert(V: Pair);
      unsigned BlockSize = BB->size();
      if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
        return Res;
      Budget += BlockSize;
      if (Budget > BudgetLimit)
        return Res;
      if (!isa<CatchSwitchInst>(Val: BB->getTerminator()) &&
          !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
                                          BB->getTerminator()))
        return Res;
      Worklist.append(in_start: pred_begin(BB), in_end: pred_end(BB));
    }
    Res = true;
    return Res;
  };
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  // Walks up the user chain of a scalar/pseudo entry to the first entry that
  // actually produces a vector value, or null if none exists.
  auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
    assert(ScalarOrPseudoEntries.contains(E) &&
           "Expected scalar or pseudo entry.");
    const TreeEntry *Entry = E;
    while (Entry->UserTreeIndex) {
      Entry = Entry->UserTreeIndex.UserTE;
      if (!ScalarOrPseudoEntries.contains(Ptr: Entry))
        return Entry;
    }
    return nullptr;
  };
  // Top-down worklist walk over vectorized entries, charging spill costs for
  // operands whose live range crosses a non-vectorized call.
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Val: Entry);
    if (Operands.empty())
      continue;
    if (ScalarOrPseudoEntries.contains(Ptr: Entry)) {
      // Use the nearest real vector parent as the reference point; if there is
      // none, just keep traversing the operands.
      Entry = FindNonScalarParentEntry(Entry);
      if (!Entry) {
        for (const TreeEntry *Op : Operands) {
          if (!Op->isGather())
            LiveEntries.push_back(Elt: Op);
        }
        continue;
      }
    }
    Instruction *LastInst = EntriesToLastInstruction.at(Val: Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Elt: Op);
      if (ScalarOrPseudoEntries.contains(Ptr: Op))
        continue;
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          (Op->isGather() && allConstant(VL: Op->Scalars)))
        continue;
      Budget = 0;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Val: Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        OpParent = cast<PHINode>(Val: Entry->getMainOp())
                       ->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        // Prefer a vectorized scalar of the gather as a tighter live-range
        // start, if there is one.
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(Val: V);
          if (!Inst)
            continue;
          if (isVectorized(V)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Val: Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the call instructions within the same basic blocks.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for call instruction in between blocks.
      // 1. Check entry's block to the head.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. Check op's block from the end.
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. Check the predecessors of entry's block till op's block.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }

  return Cost;
}
17419
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence, i.e. returns true if \p IE1 comes first.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  // Walk both operand chains towards their sources simultaneously; whichever
  // walker reaches the other instruction first decides the order.
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(Inst: IE1);
  unsigned Idx2 = *getElementIndex(Inst: IE2);
  do {
    if (I2 == IE1)
      return true;
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    // Advance each walker to the previous insertelement in its chain, but only
    // past single-use links whose inserted index differs from the other
    // chain's starting index.
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
    // Stop once both walkers are stuck (null or unable to advance).
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
17448
17449namespace {
17450/// Returns incoming Value *, if the requested type is Value * too, or a default
17451/// value, otherwise.
17452struct ValueSelect {
17453 template <typename U>
17454 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
17455 return V;
17456 }
17457 template <typename U>
17458 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
17459 return U();
17460 }
17461};
17462} // namespace
17463
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not undef vector, resizing the very first mask to
/// have common VF and perform action for 2 input vectors (including non-undef
/// Base). Other shuffle masks are combined with the resulting after the 1 stage
/// and processed as a shuffle of 2 elements.
/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
/// action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combing the masks properly between the steps.
/// \param ShuffleMask List of inputs with their shuffle masks.
/// \param Base The common base vector, possibly (partially) undef/poison.
/// \param GetVF Returns the vector factor of an input.
/// \param ResizeAction Resizes an input to the mask size; returns the resized
/// input and a flag that is true when the mask turned out to be an identity
/// mask.
/// \param Action Performs the shuffle of one or two inputs with the given
/// combined mask.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        // Lane comes from Base (or stays poison if Base is poison there).
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        // Lane comes from the (possibly resized) first input; offset by VF to
        // select the second shuffle operand.
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only for
    // single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining shuffle between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size, we
      // can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          if (Res1.second)
            Mask[I] = I;
        } else if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        // Lane comes from the new input (second operand of the shuffle).
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        // Keep the lane from the accumulated result (first operand).
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
17571
/// Computes the cost of the whole vectorizable tree and then greedily trims
/// subtrees whose estimated vectorization cost exceeds the cost of simply
/// gathering (build-vectoring) their scalars. Trimmed nodes are recorded in
/// DeletedNodes / TransformedToGatherNodes and their costs removed from the
/// total. Returns the (possibly reduced) tree cost, or
/// InstructionCost::getInvalid() when trimming would leave a bare buildvector
/// tree (to prevent endless re-vectorization attempts).
/// \param VectorizedVals values that are expected to be vectorized externally;
///        forwarded to getEntryCost().
InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
    ArrayRef<Value *> VectorizedVals) {
  // Per-node vectorization cost, filled during the first pass over the tree.
  SmallDenseMap<const TreeEntry *, InstructionCost> NodesCosts;
  SmallPtrSet<Value *, 4> CheckedExtracts;
  // Root nodes of "gathered loads" subtrees: non-root entries with a Load
  // opcode and no user entry (see the check in the first pass below).
  SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
  // Quick per-node estimation of the extract cost for externally used lanes.
  SmallDenseMap<const TreeEntry *, InstructionCost> ExtractCosts;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // Returns true if scalar V of vectorized node TE has a user outside of the
  // tree, i.e. an extractelement will be required for it.
  auto IsExternallyUsed = [&](const TreeEntry &TE, Value *V) {
    assert(TE.hasState() && !TE.isGather() &&
           TE.State != TreeEntry::SplitVectorize && "Expected vector node.");
    if (V->hasOneUse() || V->getType()->isVoidTy())
      return false;
    if (TE.hasCopyableElements() && TE.isCopyableElement(V))
      return false;
    // More uses than there are tracked vectorized scalars - some use must be
    // external. NOTE(review): this is a conservative upper bound - confirm.
    const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
    if (V->hasNUsesOrMore(N: NumVectScalars))
      return true;
    auto *I = dyn_cast<Instruction>(Val: V);
    // Check if any user is used outside of the tree.
    return I && any_of(Range: I->users(), P: [&](const User *U) {
             // store/insertelt v, [cast]U will likely be vectorized.
             if (match(V: U, P: m_InsertElt(Val: m_Value(),
                                        Elt: m_OneUse(SubPattern: m_CastOrSelf(Op: m_Specific(V: I))),
                                        Idx: m_ConstantInt())))
               return false;
             if (match(V: U,
                       P: m_InsertElt(Val: m_Value(), Elt: m_Specific(V: I), Idx: m_ConstantInt())))
               return false;
             if (match(V: U, P: m_Store(ValueOp: m_OneUse(SubPattern: m_CastOrSelf(Op: m_Specific(V: I))),
                                    PointerOp: m_Value())))
               return false;
             if (match(V: U, P: m_Store(ValueOp: m_Specific(V: I), PointerOp: m_Value())))
               return false;
             // A user neither vectorized nor gathered is an external use.
             ArrayRef<TreeEntry *> Entries = getTreeEntries(V: U);
             if (Entries.empty() && !MustGather.contains(Ptr: U))
               return true;
             // Users living only in already-trimmed nodes count as external.
             if (any_of(Range&: Entries, P: [&](TreeEntry *TE) {
                   return DeletedNodes.contains(Ptr: TE);
                 }))
               return true;
             return any_of(Range: ValueToGatherNodes.lookup(Val: U),
                           P: [&](const TreeEntry *TE) {
                             return DeletedNodes.contains(Ptr: TE);
                           });
           });
  };
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  // Node costs are scaled by a loop-iteration factor (getScaleToLoopIterations);
  // cache the scale per node, and reuse the previous scale for consecutive
  // nodes whose main op lives in the same basic block.
  SmallDenseMap<const TreeEntry *, unsigned> EntryToScale;
  unsigned PrevScale = 0;
  BasicBlock *PrevVecParent = nullptr;
  // First pass: compute and accumulate the (scaled) cost of every tree node.
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      NodesCosts.try_emplace(Key: &TE);
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(V: TE.getMainOp(), VL: TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
                          << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        NodesCosts.try_emplace(Key: &TE);
        continue;
      }
    }

    // Exclude cost of gather loads nodes which are not used. These nodes were
    // built as part of the final attempt to vectorize gathered loads.
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
    unsigned Scale = 0;
    bool CostIsFree = C == 0;
    // Reuse the previously computed scale if this node's main op is in the
    // same block as the previous vectorized node.
    if (!CostIsFree && !TE.isGather() && TE.hasState()) {
      if (PrevVecParent == TE.getMainOp()->getParent()) {
        Scale = PrevScale;
        C *= Scale;
        EntryToScale.try_emplace(Key: &TE, Args&: Scale);
      }
    }
    if (!CostIsFree && !Scale) {
      Scale = getScaleToLoopIterations(TE);
      C *= Scale;
      EntryToScale.try_emplace(Key: &TE, Args&: Scale);
      if (!TE.isGather() && TE.hasState()) {
        PrevVecParent = TE.getMainOp()->getParent();
        PrevScale = Scale;
      }
    }
    Cost += C;
    NodesCosts.try_emplace(Key: &TE, Args&: C);
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                      << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
    // Add gathered loads nodes to the set for later processing.
    if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
        TE.getOpcode() == Instruction::Load)
      GatheredLoadsNodes.insert(X: &TE);
    if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize &&
        !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement ||
                          TE.getOpcode() == Instruction::Store))) {
      // Calculate costs of external uses.
      APInt DemandedElts = APInt::getZero(numBits: TE.getVectorFactor());
      for (Value *V : TE.Scalars) {
        if (IsExternallyUsed(TE, V))
          DemandedElts.setBit(TE.findLaneForValue(V));
      }
      if (!DemandedElts.isZero()) {
        Type *ScalarTy = TE.Scalars.front()->getType();
        // Honor the min-bitwidth analysis result, if any, for this node.
        auto It = MinBWs.find(Val: &TE);
        if (It != MinBWs.end())
          ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
        auto *VecTy = getWidenedType(ScalarTy, VF: TE.getVectorFactor());
        InstructionCost ExtCost = ::getScalarizationOverhead(
            TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts, /*Insert=*/false,
            /*Extract=*/true, CostKind);
        if (ExtCost.isValid() && ExtCost != 0) {
          if (!Scale)
            Scale = getScaleToLoopIterations(TE);
          ExtCost *= Scale;
          EntryToScale.try_emplace(Key: &TE, Args&: Scale);
        }
        ExtractCosts.try_emplace(Key: &TE, Args&: ExtCost);
      }
    }
  }
  // Bail out if the cost threshold is negative and cost already below it.
  if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
      Cost < -SLPCostThreshold)
    return Cost;
  // The narrow non-profitable tree in loop? Skip, may cause regressions.
  constexpr unsigned PartLimit = 2;
  const unsigned Sz =
      getVectorElementSize(V: VectorizableTree.front()->Scalars.front());
  const unsigned MinVF = getMinVF(Sz);
  if (Cost >= -SLPCostThreshold &&
      VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
      (!VectorizableTree.front()->hasState() ||
       (VectorizableTree.front()->getOpcode() != Instruction::Store &&
        LI->getLoopFor(BB: VectorizableTree.front()->getMainOp()->getParent()))))
    return Cost;
  // Store the cost + external uses estimation as the first element of the
  // tuple, just the cost as the second element of the tuple. Required to return
  // correct cost estimation for the tree, extracts are calculated separately.
  // Extracts, calculated here, are just quick estimations.
  // The third element lists the indices of all nodes in this node's subtree.
  SmallVector<
      std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
      SubtreeCosts(VectorizableTree.size());
  // Propagates node TE's costs up through the chain of user entries,
  // accumulating them into each ancestor's subtree totals. Each (node, user)
  // pair is processed at most once, guarded by VisitedUser.
  auto UpdateParentNodes =
      [&](const TreeEntry *UserTE, const TreeEntry *TE,
          InstructionCost TotalCost, InstructionCost Cost,
          SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
              &VisitedUser,
          bool AddToList = true) {
        while (UserTE &&
               VisitedUser.insert(V: std::make_pair(x&: TE, y&: UserTE)).second) {
          std::get<0>(t&: SubtreeCosts[UserTE->Idx]) += TotalCost;
          std::get<1>(t&: SubtreeCosts[UserTE->Idx]) += Cost;
          if (AddToList)
            std::get<2>(t&: SubtreeCosts[UserTE->Idx]).push_back(Elt: TE->Idx);
          UserTE = UserTE->UserTreeIndex.UserTE;
        }
      };
  // Accumulate per-subtree costs: each node contributes its own cost (plus
  // extract estimation) to itself and to every ancestor node.
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    InstructionCost C = NodesCosts.at(Val: &TE);
    InstructionCost ExtractCost = ExtractCosts.lookup(Val: &TE);
    std::get<0>(t&: SubtreeCosts[TE.Idx]) += C + ExtractCost;
    std::get<1>(t&: SubtreeCosts[TE.Idx]) += C;
    if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
      SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
          VisitedUser;
      UpdateParentNodes(UserTE, &TE, C + ExtractCost, C, VisitedUser);
    }
  }
  // Gathered-loads roots have no user entry; attribute their subtree costs to
  // the buildvector nodes that consume their scalars instead.
  SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
  for (TreeEntry *TE : GatheredLoadsNodes) {
    InstructionCost TotalCost = std::get<0>(t&: SubtreeCosts[TE->Idx]);
    InstructionCost Cost = std::get<1>(t&: SubtreeCosts[TE->Idx]);
    for (Value *V : TE->Scalars) {
      for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(Val: V))
        UpdateParentNodes(BVTE, TE, TotalCost, Cost, Visited,
                          /*AddToList=*/false);
    }
  }
  Visited.clear();
  using CostIndicesTy =
      std::pair<TreeEntry *, std::tuple<InstructionCost, InstructionCost,
                                        SmallVector<unsigned>>>;
  // Max-heap ordering: the most expensive subtree is on top; on equal cost,
  // the node with the larger index wins.
  struct FirstGreater {
    bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
      return std::get<0>(t: LHS.second) < std::get<0>(t: RHS.second) ||
             (std::get<0>(t: LHS.second) == std::get<0>(t: RHS.second) &&
              LHS.first->Idx < RHS.first->Idx);
    }
  };
  PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
      Worklist;
  for (const auto [Idx, P] : enumerate(First&: SubtreeCosts))
    Worklist.emplace(args: VectorizableTree[Idx].get(), args&: P);

  // Narrow store trees with non-profitable immediate values - exit.
  if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
      VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::Store &&
      (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
    return Cost;

  bool Changed = false;
  bool PreferTrimmedTree = false;
  // Greedy trimming loop: pop the most expensive subtree, estimate the cost of
  // gathering its root's scalars instead, and if gathering is cheaper, mark
  // the subtree deleted / transformed-to-gather.
  while (!Worklist.empty() && std::get<0>(t: Worklist.top().second) > 0) {
    TreeEntry *TE = Worklist.top().first;
    if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(Ptr: TE) ||
        // Exit early if the parent node is split node and any of scalars is
        // used in other split nodes.
        (TE->UserTreeIndex &&
         TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
         any_of(Range&: TE->Scalars, P: [&](Value *V) {
           ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
           return Entries.size() > 1;
         }))) {
      Worklist.pop();
      continue;
    }
    // Skip inversed compare nodes, they cannot be transformed to buildvectors.
    if (TE->State == TreeEntry::Vectorize && !TE->isAltShuffle() &&
        (TE->getOpcode() == Instruction::ICmp ||
         TE->getOpcode() == Instruction::FCmp) &&
        any_of(Range&: TE->Scalars, P: [&](Value *V) {
          auto *I = dyn_cast<CmpInst>(Val: V);
          if (!I)
            return false;
          return I->getPredicate() !=
                 cast<CmpInst>(Val: TE->getMainOp())->getPredicate();
        })) {
      Worklist.pop();
      continue;
    }

    // Calculate the gather cost of the root node.
    InstructionCost TotalSubtreeCost = std::get<0>(t: Worklist.top().second);
    InstructionCost SubtreeCost = std::get<1>(t: Worklist.top().second);
    // Subtrees cheaper than one insert per scalar are not worth trimming.
    if (TotalSubtreeCost < TE->Scalars.size()) {
      Worklist.pop();
      continue;
    }
    // Re-adjust the snapshot costs for children already turned into gathers by
    // earlier iterations: replace their vector cost with their gather cost.
    if (!TransformedToGatherNodes.empty()) {
      for (unsigned Idx : std::get<2>(t: Worklist.top().second)) {
        auto It = TransformedToGatherNodes.find(Val: VectorizableTree[Idx].get());
        if (It != TransformedToGatherNodes.end()) {
          TotalSubtreeCost -= std::get<0>(t&: SubtreeCosts[Idx]);
          SubtreeCost -= std::get<1>(t&: SubtreeCosts[Idx]);
          TotalSubtreeCost += It->second;
          SubtreeCost += It->second;
        }
      }
    }
    if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
      Worklist.pop();
      continue;
    }
    // Constants are free to materialize - exclude them from the gather cost.
    const unsigned Sz = TE->Scalars.size();
    APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
    for (auto [Idx, V] : enumerate(First&: TE->Scalars)) {
      if (isConstant(V))
        DemandedElts.clearBit(BitPosition: Idx);
    }

    Type *ScalarTy = getValueType(V: TE->Scalars.front());
    auto It = MinBWs.find(Val: TE);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
    if (isa<CmpInst>(Val: TE->Scalars.front()))
      ScalarTy = TE->Scalars.front()->getType();
    auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
    const unsigned EntryVF = TE->getVectorFactor();
    auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
    InstructionCost GatherCost = ::getScalarizationOverhead(
        TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    // Account for the reorder/reuse shuffle the node's consumers expect.
    SmallVector<int> Mask;
    if (!TE->ReorderIndices.empty() &&
        TE->State != TreeEntry::CompressVectorize &&
        (TE->State != TreeEntry::StridedVectorize ||
         !isReverseOrder(Order: TE->ReorderIndices))) {
      SmallVector<int> NewMask;
      if (TE->getOpcode() == Instruction::Store) {
        // For stores the order is actually a mask.
        NewMask.resize(N: TE->ReorderIndices.size());
        copy(Range&: TE->ReorderIndices, Out: NewMask.begin());
      } else {
        inversePermutation(Indices: TE->ReorderIndices, Mask&: NewMask);
      }
      ::addMask(Mask, SubMask: NewMask);
    }
    if (!TE->ReuseShuffleIndices.empty())
      ::addMask(Mask, SubMask: TE->ReuseShuffleIndices);
    if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: EntryVF))
      GatherCost +=
          ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
    // If all scalars are reused in gather node(s) or other vector nodes, there
    // might be extra cost for inserting them.
    if ((!TE->hasState() || !TE->isAltShuffle()) &&
        all_of(Range&: TE->Scalars, P: [&](Value *V) {
          return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
                 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
        }))
      GatherCost *= 2;
    // Erase subtree if it is non-profitable.
    ArrayRef<unsigned> Nodes = std::get<2>(t: Worklist.top().second);
    // Prefer trimming equal-cost alternate-shuffle subtrees rooted at binary
    // ops: alt-shuffles introduce runtime shuffle overhead that the cost model
    // may underestimate. Skip if the subtree contains ExtractElement nodes,
    // since those operate on already-materialized vectors where the cost model
    // is more accurate.
    auto IsEqualCostAltShuffleToTrim = [&]() {
      return TotalSubtreeCost == GatherCost && TE->hasState() &&
             TE->isAltShuffle() && Instruction::isBinaryOp(Opcode: TE->getOpcode()) &&
             none_of(Range&: Nodes, P: [&](unsigned Idx) {
               return VectorizableTree[Idx]->hasState() &&
                      VectorizableTree[Idx]->getOpcode() ==
                          Instruction::ExtractElement;
             });
    };
    if (TotalSubtreeCost > GatherCost || IsEqualCostAltShuffleToTrim()) {
      PreferTrimmedTree |= TotalSubtreeCost == GatherCost;
      // If the remaining tree is just a buildvector - exit, it will cause
      // endless attempts to vectorize.
      if (VectorizableTree.front()->hasState() &&
          VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
          TE->Idx == 1)
        return InstructionCost::getInvalid();

      LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
                        << TE->Idx << " with cost "
                        << std::get<0>(Worklist.top().second)
                        << " and gather cost " << GatherCost << ".\n");
      // A root with a user becomes a gather node (its gather cost recorded);
      // a user-less root is deleted outright. Children are always deleted.
      if (TE->UserTreeIndex) {
        TransformedToGatherNodes.try_emplace(Key: TE, Args&: GatherCost);
        NodesCosts.erase(Val: TE);
      } else {
        DeletedNodes.insert(Ptr: TE);
        TransformedToGatherNodes.erase(Val: TE);
        NodesCosts.erase(Val: TE);
      }
      for (unsigned Idx : Nodes) {
        TreeEntry &ChildTE = *VectorizableTree[Idx];
        DeletedNodes.insert(Ptr: &ChildTE);
        TransformedToGatherNodes.erase(Val: &ChildTE);
        NodesCosts.erase(Val: &ChildTE);
      }
      Changed = true;
    }
    Worklist.pop();
  }
  if (!Changed)
    return std::get<1>(t&: SubtreeCosts.front());

  SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
  InstructionCost LoadsExtractsCost = 0;
  // Check if all loads of gathered loads nodes are marked for deletion. In this
  // case the whole gathered loads subtree must be deleted.
  // Also, try to account for extracts, which might be required, if only part of
  // gathered load must be vectorized. Keep partially vectorized nodes, if
  // extracts are cheaper than gathers.
  for (TreeEntry *TE : GatheredLoadsNodes) {
    if (DeletedNodes.contains(Ptr: TE) || TransformedToGatherNodes.contains(Val: TE))
      continue;
    // Tentatively mark for deletion; unmarked below if extracts win.
    GatheredLoadsToDelete.insert(Ptr: TE);
    APInt DemandedElts = APInt::getZero(numBits: TE->getVectorFactor());
    // All loads are removed from gathered? Need to delete the subtree.
    SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
    for (Value *V : TE->Scalars) {
      unsigned Pos = TE->findLaneForValue(V);
      for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
        if (DeletedNodes.contains(Ptr: BVE))
          continue;
        DemandedElts.setBit(Pos);
        ValuesToInsert.try_emplace(Key: BVE).first->second.push_back(Elt: V);
      }
    }
    if (!DemandedElts.isZero()) {
      Type *ScalarTy = TE->Scalars.front()->getType();
      auto It = MinBWs.find(Val: TE);
      if (It != MinBWs.end())
        ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
      auto *VecTy = getWidenedType(ScalarTy, VF: TE->getVectorFactor());
      // Cost of extracting the still-needed lanes from the vectorized load...
      InstructionCost ExtractsCost = ::getScalarizationOverhead(
          TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
          /*Insert=*/false, /*Extract=*/true, CostKind);
      // ...versus the cost of rebuilding those lanes in each buildvector node.
      InstructionCost BVCost = 0;
      for (const auto &[BVE, Values] : ValuesToInsert) {
        APInt BVDemandedElts = APInt::getZero(numBits: BVE->getVectorFactor());
        SmallVector<Value *> BVValues(BVE->getVectorFactor(),
                                      PoisonValue::get(T: ScalarTy));
        for (Value *V : Values) {
          unsigned Pos = BVE->findLaneForValue(V);
          BVValues[Pos] = V;
          BVDemandedElts.setBit(Pos);
        }
        auto *BVVecTy = getWidenedType(ScalarTy, VF: BVE->getVectorFactor());
        BVCost += ::getScalarizationOverhead(
            TTI: *TTI, ScalarTy, Ty: BVVecTy, DemandedElts: BVDemandedElts,
            /*Insert=*/true, /*Extract=*/false, CostKind,
            ForPoisonSrc: BVDemandedElts.isAllOnes(), VL: BVValues);
      }
      if (ExtractsCost < BVCost) {
        // Extracts are cheaper - keep the partially vectorized loads node.
        LoadsExtractsCost += ExtractsCost;
        GatheredLoadsToDelete.erase(Ptr: TE);
        continue;
      }
      LoadsExtractsCost += BVCost;
    }
    NodesCosts.erase(Val: TE);
  }

  // Deleted all subtrees rooted at gathered loads nodes.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->UserTreeIndex &&
        GatheredLoadsToDelete.contains(Ptr: TE->UserTreeIndex.UserTE)) {
      DeletedNodes.insert(Ptr: TE.get());
      NodesCosts.erase(Val: TE.get());
      // Propagate the mark so grandchildren are caught on later iterations.
      GatheredLoadsToDelete.insert(Ptr: TE.get());
    }
    if (GatheredLoadsToDelete.contains(Ptr: TE.get()))
      DeletedNodes.insert(Ptr: TE.get());
  }

  // Recompute costs for surviving nodes whose cached cost was invalidated by
  // the trimming above.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(Val: TE.get())) {
      assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
      continue;
    }
    if (DeletedNodes.contains(Ptr: TE.get()))
      continue;
    if (!NodesCosts.contains(Val: TE.get())) {
      InstructionCost C =
          getEntryCost(E: TE.get(), VectorizedVals, CheckedExtracts);
      if (!C.isValid() || C == 0) {
        NodesCosts.try_emplace(Key: TE.get(), Args&: C);
        continue;
      }
      unsigned Scale = EntryToScale.lookup(Val: TE.get());
      if (!Scale)
        Scale = getScaleToLoopIterations(TE: *TE.get());
      C *= Scale;
      NodesCosts.try_emplace(Key: TE.get(), Args&: C);
    }
  }

  LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
  InstructionCost NewCost = 0;
  for (const auto &P : NodesCosts) {
    NewCost += P.second;
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
                      << shortBundleName(P.first->Scalars, P.first->Idx)
                      << ".\n"
                      << "SLP: Current total cost = " << NewCost << "\n");
  }
  // If trimming did not actually reduce the cost (or only matched it without a
  // preference for the trimmed form), roll the trimming back entirely.
  if (NewCost + LoadsExtractsCost > Cost ||
      (!PreferTrimmedTree && NewCost + LoadsExtractsCost == Cost)) {
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    NewCost = Cost;
  } else {
    // If the remaining tree is just a buildvector - exit, it will cause
    // endless attempts to vectorize.
    if (VectorizableTree.size() >= 2 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
        TransformedToGatherNodes.contains(Val: VectorizableTree[1].get()))
      return InstructionCost::getInvalid();
    // Same, modulo a single cast node between the insertelement root and the
    // gathered operand.
    if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
        VectorizableTree[1]->hasState() &&
        VectorizableTree[1]->State == TreeEntry::Vectorize &&
        (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
         VectorizableTree[1]->getOpcode() == Instruction::SExt ||
         VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
        TransformedToGatherNodes.contains(Val: VectorizableTree[2].get()))
      return InstructionCost::getInvalid();
  }
  return NewCost;
}
18069
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
///
/// \tparam T key identifying a source vector for the shuffle; instantiated
///         below with `const TreeEntry *` (see getTreeCost's ShuffledInserts).
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts. Maps
  /// each source vector to the lane mask gathering its elements into the
  /// buildvector (insertion order preserved via MapVector).
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
18080
18081InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
18082 ArrayRef<Value *> VectorizedVals,
18083 InstructionCost ReductionCost,
18084 Instruction *RdxRoot) {
18085 InstructionCost Cost = TreeCost;
18086
18087 SmallDenseMap<std::tuple<const TreeEntry *, Value *, Instruction *>, unsigned>
18088 EntryToScale;
18089 auto ScaleCost = [&](InstructionCost C, const TreeEntry &TE,
18090 Value *Scalar = nullptr, Instruction *U = nullptr) {
18091 if (!C.isValid() || C == 0)
18092 return C;
18093 unsigned &Scale =
18094 EntryToScale.try_emplace(Key: std::make_tuple(args: &TE, args&: Scalar, args&: U), Args: 0)
18095 .first->getSecond();
18096 if (!Scale)
18097 Scale = getScaleToLoopIterations(TE, Scalar, U);
18098 LLVM_DEBUG(dbgs() << "Scale " << Scale << " For entry " << TE.Idx << "\n");
18099 return C * Scale;
18100 };
18101 Instruction *ReductionRoot = RdxRoot;
18102 if (UserIgnoreList) {
18103 // Scale reduction cost to the factor of the loop nest trip count.
18104 ReductionCost = ScaleCost(ReductionCost, *VectorizableTree.front().get(),
18105 /*Scalar=*/nullptr, ReductionRoot);
18106 }
18107
18108 // Add the cost for reduction.
18109 Cost += ReductionCost;
18110
18111 // Skip trees, which are non-profitable even if there are insertelements with
18112 // external uses.
18113 constexpr unsigned CostLimit = 100;
18114 if (Cost >= -SLPCostThreshold + CostLimit &&
18115 (VectorizableTree.size() - DeletedNodes.size()) *
18116 VectorizableTree.front()->getVectorFactor() <
18117 CostLimit)
18118 return Cost;
18119
18120 if (Cost >= -SLPCostThreshold &&
18121 none_of(Range&: ExternalUses, P: [](const ExternalUser &EU) {
18122 return isa_and_nonnull<InsertElementInst>(Val: EU.User);
18123 }))
18124 return Cost;
18125
18126 SmallPtrSet<Value *, 16> ExtractCostCalculated;
18127 InstructionCost ExtractCost = 0;
18128 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
18129 SmallVector<APInt> DemandedElts;
18130 SmallDenseSet<Value *, 4> UsedInserts;
18131 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
18132 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
18133 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
18134 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
18135 // Keep track {Scalar, Index, User} tuple.
18136 // On AArch64, this helps in fusing a mov instruction, associated with
18137 // extractelement, with fmul in the backend so that extractelement is free.
18138 SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
18139 for (ExternalUser &EU : ExternalUses) {
18140 ScalarUserAndIdx.emplace_back(Args&: EU.Scalar, Args&: EU.User, Args&: EU.Lane);
18141 }
18142 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
18143 for (ExternalUser &EU : ExternalUses) {
18144 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
18145 << EU.E.Idx << " in lane " << EU.Lane << "\n");
18146 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
18147 else dbgs() << " User: nullptr\n");
18148 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
18149
18150 // Uses by ephemeral values are free (because the ephemeral value will be
18151 // removed prior to code generation, and so the extraction will be
18152 // removed as well).
18153 if (EphValues.count(Ptr: EU.User))
18154 continue;
18155
18156 // Check if the scalar for the given user or all users is accounted already.
18157 if (!CheckedScalarUser.insert(V: std::make_pair(x&: EU.Scalar, y&: EU.User)).second ||
18158 (EU.User &&
18159 CheckedScalarUser.contains(V: std::make_pair(x&: EU.Scalar, y: nullptr))))
18160 continue;
18161
18162 // Used in unreachable blocks or in EH pads (rarely executed) or is
18163 // terminated with unreachable instruction.
18164 if (BasicBlock *UserParent =
18165 EU.User ? cast<Instruction>(Val: EU.User)->getParent() : nullptr;
18166 UserParent &&
18167 (!DT->isReachableFromEntry(A: UserParent) || UserParent->isEHPad() ||
18168 isa_and_present<UnreachableInst>(Val: UserParent->getTerminator())))
18169 continue;
18170
18171 // We only add extract cost once for the same scalar.
18172 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
18173 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
18174 continue;
18175
18176 // No extract cost for vector "scalar" if REVEC is disabled
18177 if (!SLPReVec && isa<FixedVectorType>(Val: EU.Scalar->getType()))
18178 continue;
18179
18180 // If found user is an insertelement, do not calculate extract cost but try
18181 // to detect it as a final shuffled/identity match.
18182 // TODO: what if a user is insertvalue when REVEC is enabled?
18183 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
18184 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
18185 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
18186 if (!UsedInserts.insert(V: VU).second)
18187 continue;
18188 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
18189 if (InsertIdx) {
18190 const TreeEntry *ScalarTE = &EU.E;
18191 auto *It = find_if(
18192 Range&: ShuffledInserts,
18193 P: [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
18194 // Checks if 2 insertelements are from the same buildvector.
18195 InsertElementInst *VecInsert = Data.InsertElements.front();
18196 return areTwoInsertFromSameBuildVector(
18197 VU, V: VecInsert, GetBaseOperand: [this](InsertElementInst *II) -> Value * {
18198 Value *Op0 = II->getOperand(i_nocapture: 0);
18199 if (isVectorized(V: II) && !isVectorized(V: Op0))
18200 return nullptr;
18201 return Op0;
18202 });
18203 });
18204 int VecId = -1;
18205 if (It == ShuffledInserts.end()) {
18206 auto &Data = ShuffledInserts.emplace_back();
18207 Data.InsertElements.emplace_back(Args&: VU);
18208 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
18209 VecId = ShuffledInserts.size() - 1;
18210 auto It = MinBWs.find(Val: ScalarTE);
18211 if (It != MinBWs.end() &&
18212 VectorCasts
18213 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
18214 .second) {
18215 unsigned BWSz = It->second.first;
18216 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
18217 unsigned VecOpcode;
18218 if (DstBWSz < BWSz)
18219 VecOpcode = Instruction::Trunc;
18220 else
18221 VecOpcode =
18222 It->second.second ? Instruction::SExt : Instruction::ZExt;
18223 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18224 InstructionCost C = TTI->getCastInstrCost(
18225 Opcode: VecOpcode, Dst: FTy,
18226 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
18227 VF: FTy->getNumElements()),
18228 CCH: TTI::CastContextHint::None, CostKind);
18229 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
18230 << " for extending externally used vector with "
18231 "non-equal minimum bitwidth.\n");
18232 Cost += C;
18233 }
18234 } else {
18235 if (isFirstInsertElement(IE1: VU, IE2: It->InsertElements.front()))
18236 It->InsertElements.front() = VU;
18237 VecId = std::distance(first: ShuffledInserts.begin(), last: It);
18238 }
18239 int InIdx = *InsertIdx;
18240 SmallVectorImpl<int> &Mask =
18241 ShuffledInserts[VecId].ValueMasks[ScalarTE];
18242 if (Mask.empty())
18243 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
18244 Mask[InIdx] = EU.Lane;
18245 DemandedElts[VecId].setBit(InIdx);
18246 continue;
18247 }
18248 }
18249 }
18250
18251 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18252 // If we plan to rewrite the tree in a smaller type, we will need to sign
18253 // extend the extracted value back to the original type. Here, we account
18254 // for the extract and the added cost of the sign extend if needed.
18255 InstructionCost ExtraCost = TTI::TCC_Free;
18256 auto *ScalarTy = EU.Scalar->getType();
18257 const unsigned BundleWidth = EU.E.getVectorFactor();
18258 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
18259 auto *VecTy = getWidenedType(ScalarTy, VF: BundleWidth);
18260 const TreeEntry *Entry = &EU.E;
18261 auto It = MinBWs.find(Val: Entry);
18262 if (It != MinBWs.end()) {
18263 Type *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
18264 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
18265 MinTy = getWidenedType(ScalarTy: MinTy, VF: VecTy->getNumElements());
18266 unsigned Extend = isKnownNonNegative(V: EU.Scalar, SQ: SimplifyQuery(*DL))
18267 ? Instruction::ZExt
18268 : Instruction::SExt;
18269 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
18270 ExtraCost =
18271 getExtractWithExtendCost(TTI: *TTI, Opcode: Extend, Dst: ScalarTy, VecTy, Index: EU.Lane);
18272 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
18273 << ExtraCost << "\n");
18274 } else {
18275 ExtraCost =
18276 getVectorInstrCost(TTI: *TTI, ScalarTy, Opcode: Instruction::ExtractElement, Val: VecTy,
18277 CostKind, Index: EU.Lane, Scalar: EU.Scalar, ScalarUserAndIdx);
18278 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
18279 << *VecTy << ": " << ExtraCost << "\n");
18280 }
18281 // Leave the scalar instructions as is if they are cheaper than extracts.
18282 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
18283 Entry->getOpcode() == Instruction::Load) {
18284 // Checks if the user of the external scalar is phi in loop body.
18285 auto IsPhiInLoop = [&](const ExternalUser &U) {
18286 if (auto *Phi = dyn_cast_if_present<PHINode>(Val: U.User)) {
18287 auto *I = cast<Instruction>(Val: U.Scalar);
18288 const Loop *L = LI->getLoopFor(BB: Phi->getParent());
18289 return L && (Phi->getParent() == I->getParent() ||
18290 L == LI->getLoopFor(BB: I->getParent()));
18291 }
18292 return false;
18293 };
18294 if (!ValueToExtUses) {
18295 ValueToExtUses.emplace();
18296 for (const auto &P : enumerate(First&: ExternalUses)) {
18297 // Ignore phis in loops.
18298 if (IsPhiInLoop(P.value()))
18299 continue;
18300
18301 ValueToExtUses->try_emplace(Key: P.value().Scalar, Args: P.index());
18302 }
18303 }
18304 // Can use original instruction, if no operands vectorized or they are
18305 // marked as externally used already.
18306 auto *Inst = cast<Instruction>(Val: EU.Scalar);
18307 InstructionCost ScalarCost = TTI->getInstructionCost(U: Inst, CostKind);
18308 auto OperandIsScalar = [&](Value *V) {
18309 if (!isVectorized(V)) {
18310 // Some extractelements might be not vectorized, but
18311 // transformed into shuffle and removed from the function,
18312 // consider it here.
18313 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V))
18314 return !EE->hasOneUse() || !MustGather.contains(Ptr: EE);
18315 return true;
18316 }
18317 return ValueToExtUses->contains(Val: V);
18318 };
18319 bool CanBeUsedAsScalar = all_of(Range: Inst->operands(), P: OperandIsScalar);
18320 bool CanBeUsedAsScalarCast = false;
18321 if (auto *CI = dyn_cast<CastInst>(Val: Inst); CI && !CanBeUsedAsScalar) {
18322 if (auto *Op = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 0));
18323 Op && all_of(Range: Op->operands(), P: OperandIsScalar)) {
18324 InstructionCost OpCost =
18325 (isVectorized(V: Op) && !ValueToExtUses->contains(Val: Op))
18326 ? TTI->getInstructionCost(U: Op, CostKind)
18327 : 0;
18328 if (ScalarCost + OpCost <= ExtraCost) {
18329 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
18330 ScalarCost += OpCost;
18331 }
18332 }
18333 }
18334 if (CanBeUsedAsScalar) {
18335 bool KeepScalar = ScalarCost <= ExtraCost;
18336 // Try to keep original scalar if the user is the phi node from the same
18337 // block as the root phis, currently vectorized. It allows to keep
18338 // better ordering info of PHIs, being vectorized currently.
18339 bool IsProfitablePHIUser =
18340 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
18341 VectorizableTree.front()->Scalars.size() > 2)) &&
18342 VectorizableTree.front()->hasState() &&
18343 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
18344 !Inst->hasNUsesOrMore(N: UsesLimit) &&
18345 none_of(Range: Inst->users(),
18346 P: [&](User *U) {
18347 auto *PHIUser = dyn_cast<PHINode>(Val: U);
18348 return (!PHIUser ||
18349 PHIUser->getParent() !=
18350 cast<Instruction>(
18351 Val: VectorizableTree.front()->getMainOp())
18352 ->getParent()) &&
18353 !isVectorized(V: U);
18354 }) &&
18355 count_if(Range: Entry->Scalars, P: [&](Value *V) {
18356 return ValueToExtUses->contains(Val: V);
18357 }) <= 2;
18358 if (IsProfitablePHIUser) {
18359 KeepScalar = true;
18360 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
18361 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
18362 (!GatheredLoadsEntriesFirst.has_value() ||
18363 Entry->Idx < *GatheredLoadsEntriesFirst)) {
18364 unsigned ScalarUsesCount = count_if(Range: Entry->Scalars, P: [&](Value *V) {
18365 return ValueToExtUses->contains(Val: V);
18366 });
18367 auto It = ExtractsCount.find(Val: Entry);
18368 if (It != ExtractsCount.end()) {
18369 assert(ScalarUsesCount >= It->getSecond().size() &&
18370 "Expected total number of external uses not less than "
18371 "number of scalar uses.");
18372 ScalarUsesCount -= It->getSecond().size();
18373 }
18374 // Keep original scalar if number of externally used instructions in
18375 // the same entry is not power of 2. It may help to do some extra
18376 // vectorization for now.
18377 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(Value: ScalarUsesCount);
18378 }
18379 if (KeepScalar) {
18380 ExternalUsesAsOriginalScalar.insert(Ptr: EU.Scalar);
18381 for (Value *V : Inst->operands()) {
18382 auto It = ValueToExtUses->find(Val: V);
18383 if (It != ValueToExtUses->end()) {
18384 // Replace all uses to avoid compiler crash.
18385 ExternalUses[It->second].User = nullptr;
18386 }
18387 }
18388 ExtraCost = ScalarCost;
18389 if (!IsPhiInLoop(EU))
18390 ExtractsCount[Entry].insert(V: Inst);
18391 if (CanBeUsedAsScalarCast) {
18392 ScalarOpsFromCasts.insert(Ptr: Inst->getOperand(i: 0));
18393 // Update the users of the operands of the cast operand to avoid
18394 // compiler crash.
18395 if (auto *IOp = dyn_cast<Instruction>(Val: Inst->getOperand(i: 0))) {
18396 for (Value *V : IOp->operands()) {
18397 auto It = ValueToExtUses->find(Val: V);
18398 if (It != ValueToExtUses->end()) {
18399 // Replace all uses to avoid compiler crash.
18400 ExternalUses[It->second].User = nullptr;
18401 }
18402 }
18403 }
18404 }
18405 }
18406 }
18407 }
18408
18409 ExtraCost = ScaleCost(ExtraCost, *Entry, EU.Scalar,
18410 cast_or_null<Instruction>(Val: EU.User));
18411
18412 ExtractCost += ExtraCost;
18413 }
18414 // Insert externals for extract of operands of casts to be emitted as scalars
18415 // instead of extractelement.
18416 for (Value *V : ScalarOpsFromCasts) {
18417 ExternalUsesAsOriginalScalar.insert(Ptr: V);
18418 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
18419 const auto *It = find_if_not(Range&: TEs, P: [&](TreeEntry *TE) {
18420 return TransformedToGatherNodes.contains(Val: TE) ||
18421 DeletedNodes.contains(Ptr: TE);
18422 });
18423 if (It != TEs.end()) {
18424 const TreeEntry *UserTE = *It;
18425 ExternalUses.emplace_back(Args&: V, Args: nullptr, Args: *UserTE,
18426 Args: UserTE->findLaneForValue(V));
18427 }
18428 }
18429 }
18430 // Add reduced value cost, if resized.
18431 if (!VectorizedVals.empty()) {
18432 const TreeEntry &Root = *VectorizableTree.front();
18433 auto BWIt = MinBWs.find(Val: &Root);
18434 if (BWIt != MinBWs.end()) {
18435 Type *DstTy = Root.Scalars.front()->getType();
18436 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy->getScalarType());
18437 unsigned SrcSz =
18438 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
18439 if (OriginalSz != SrcSz) {
18440 unsigned Opcode = Instruction::Trunc;
18441 if (OriginalSz > SrcSz)
18442 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
18443 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
18444 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DstTy)) {
18445 assert(SLPReVec && "Only supported by REVEC.");
18446 SrcTy = getWidenedType(ScalarTy: SrcTy, VF: VecTy->getNumElements());
18447 }
18448 InstructionCost CastCost =
18449 TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
18450 CCH: TTI::CastContextHint::None,
18451 CostKind: TTI::TCK_RecipThroughput);
18452 CastCost = ScaleCost(CastCost, Root, /*Scalar=*/nullptr, ReductionRoot);
18453 Cost += CastCost;
18454 }
18455 }
18456 }
18457
18458 // Buildvector with externally used scalars, which should remain as scalars,
18459 // should not be vectorized, the compiler may hang.
18460 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
18461 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
18462 VectorizableTree[1]->hasState() &&
18463 VectorizableTree[1]->State == TreeEntry::Vectorize &&
18464 all_of(Range&: VectorizableTree[1]->Scalars, P: [&](Value *V) {
18465 return ExternalUsesAsOriginalScalar.contains(Ptr: V);
18466 }))
18467 return InstructionCost::getInvalid();
18468
18469 Cost += ExtractCost;
18470 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
18471 bool ForSingleMask) {
18472 InstructionCost C = 0;
18473 unsigned VF = Mask.size();
18474 unsigned VecVF = TE->getVectorFactor();
18475 bool HasLargeIndex =
18476 any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); });
18477 if ((VF != VecVF && HasLargeIndex) ||
18478 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) {
18479
18480 if (HasLargeIndex) {
18481 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
18482 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
18483 result: OrigMask.begin());
18484 C = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
18485 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
18486 Mask: OrigMask);
18487 LLVM_DEBUG(
18488 dbgs() << "SLP: Adding cost " << C
18489 << " for final shuffle of insertelement external users.\n";
18490 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
18491 Cost += C;
18492 return std::make_pair(x&: TE, y: true);
18493 }
18494
18495 if (!ForSingleMask) {
18496 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
18497 for (unsigned I = 0; I < VF; ++I) {
18498 if (Mask[I] != PoisonMaskElem)
18499 ResizeMask[Mask[I]] = Mask[I];
18500 }
18501 if (!ShuffleVectorInst::isIdentityMask(Mask: ResizeMask, NumSrcElts: VF))
18502 C = ::getShuffleCost(
18503 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
18504 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), Mask: ResizeMask);
18505 LLVM_DEBUG(
18506 dbgs() << "SLP: Adding cost " << C
18507 << " for final shuffle of insertelement external users.\n";
18508 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
18509
18510 Cost += C;
18511 }
18512 }
18513 return std::make_pair(x&: TE, y: false);
18514 };
18515 // Calculate the cost of the reshuffled vectors, if any.
18516 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
18517 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(i_nocapture: 0);
18518 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
18519 unsigned VF = 0;
18520 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
18521 ArrayRef<const TreeEntry *> TEs) {
18522 assert((TEs.size() == 1 || TEs.size() == 2) &&
18523 "Expected exactly 1 or 2 tree entries.");
18524 if (TEs.size() == 1) {
18525 if (VF == 0)
18526 VF = TEs.front()->getVectorFactor();
18527 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
18528 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
18529 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
18530 return Data.value() == PoisonMaskElem ||
18531 (Data.index() < VF &&
18532 static_cast<int>(Data.index()) == Data.value());
18533 })) {
18534 InstructionCost C =
18535 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
18536 C = ScaleCost(C, *TEs.front());
18537 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
18538 << " for final shuffle of insertelement "
18539 "external users.\n";
18540 TEs.front()->dump();
18541 dbgs() << "SLP: Current total cost = " << Cost << "\n");
18542 Cost += C;
18543 }
18544 } else {
18545 if (VF == 0) {
18546 if (TEs.front() &&
18547 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
18548 VF = TEs.front()->getVectorFactor();
18549 else
18550 VF = Mask.size();
18551 }
18552 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
18553 InstructionCost C =
18554 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
18555 C = ScaleCost(C, *TEs.back());
18556 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
18557 << " for final shuffle of vector node and external "
18558 "insertelement users.\n";
18559 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
18560 dbgs() << "SLP: Current total cost = " << Cost << "\n");
18561 Cost += C;
18562 }
18563 VF = Mask.size();
18564 return TEs.back();
18565 };
18566 (void)performExtractsShuffleAction<const TreeEntry>(
18567 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
18568 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
18569 Action: EstimateShufflesCost);
18570 InstructionCost InsertCost = TTI->getScalarizationOverhead(
18571 Ty: cast<FixedVectorType>(
18572 Val: ShuffledInserts[I].InsertElements.front()->getType()),
18573 DemandedElts: DemandedElts[I],
18574 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
18575 Cost -= InsertCost;
18576 }
18577
18578 // Add the cost for reduced value resize (if required).
18579 if (ReductionBitWidth != 0) {
18580 assert(UserIgnoreList && "Expected reduction tree.");
18581 const TreeEntry &E = *VectorizableTree.front();
18582 auto It = MinBWs.find(Val: &E);
18583 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
18584 unsigned SrcSize = It->second.first;
18585 unsigned DstSize = ReductionBitWidth;
18586 unsigned Opcode = Instruction::Trunc;
18587 if (SrcSize < DstSize) {
18588 bool IsArithmeticExtendedReduction =
18589 all_of(Range: *UserIgnoreList, P: [](Value *V) {
18590 auto *I = cast<Instruction>(Val: V);
18591 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
18592 Instruction::Mul, Instruction::FMul,
18593 Instruction::And, Instruction::Or,
18594 Instruction::Xor},
18595 Element: I->getOpcode());
18596 });
18597 if (IsArithmeticExtendedReduction)
18598 Opcode =
18599 Instruction::BitCast; // Handle it by getExtendedReductionCost
18600 else
18601 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
18602 }
18603 if (Opcode != Instruction::BitCast) {
18604 auto *SrcVecTy =
18605 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
18606 auto *DstVecTy =
18607 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
18608 TTI::CastContextHint CCH = getCastContextHint(TE: E);
18609 switch (E.getOpcode()) {
18610 case Instruction::SExt:
18611 case Instruction::ZExt:
18612 case Instruction::Trunc: {
18613 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
18614 CCH = getCastContextHint(TE: *OpTE);
18615 break;
18616 }
18617 default:
18618 break;
18619 }
18620 InstructionCost CastCost =
18621 TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
18622 CostKind: TTI::TCK_RecipThroughput);
18623 CastCost = ScaleCost(CastCost, *VectorizableTree.front().get(),
18624 /*Scalar=*/nullptr, ReductionRoot);
18625 Cost += CastCost;
18626 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
18627 << " for final resize for reduction from " << SrcVecTy
18628 << " to " << DstVecTy << "\n";
18629 dbgs() << "SLP: Current total cost = " << Cost << "\n");
18630 }
18631 }
18632 }
18633
18634 std::optional<InstructionCost> SpillCost;
18635 if (Cost < -SLPCostThreshold) {
18636 SpillCost = getSpillCost();
18637 Cost += *SpillCost;
18638 }
18639#ifndef NDEBUG
18640 SmallString<256> Str;
18641 {
18642 raw_svector_ostream OS(Str);
18643 OS << "SLP: Spill Cost = ";
18644 if (SpillCost)
18645 OS << *SpillCost;
18646 else
18647 OS << "<skipped>";
18648 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n";
18649 if (ReductionRoot)
18650 OS << "SLP: Reduction Cost = " << ReductionCost << ".\n";
18651 OS << "SLP: Total Cost = " << Cost << ".\n";
18652 }
18653 LLVM_DEBUG(dbgs() << Str);
18654 if (ViewSLPTree)
18655 ViewGraph(this, "SLP" + F->getName(), false, Str);
18656#endif
18657
18658 return Cost;
18659}
18660
18661/// Tries to find extractelement instructions with constant indices from fixed
18662/// vector type and gather such instructions into a bunch, which highly likely
18663/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
18664/// successful, the matched scalars are replaced by poison values in \p VL for
18665/// future analysis.
18666std::optional<TTI::ShuffleKind>
18667BoUpSLP::tryToGatherSingleRegisterExtractElements(
18668 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
18669 // Scan list of gathered scalars for extractelements that can be represented
18670 // as shuffles.
18671 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
18672 SmallVector<int> UndefVectorExtracts;
18673 for (int I = 0, E = VL.size(); I < E; ++I) {
18674 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
18675 if (!EI) {
18676 if (isa<UndefValue>(Val: VL[I]))
18677 UndefVectorExtracts.push_back(Elt: I);
18678 continue;
18679 }
18680 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
18681 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
18682 continue;
18683 std::optional<unsigned> Idx = getExtractIndex(E: EI);
18684 // Undefined index.
18685 if (!Idx) {
18686 UndefVectorExtracts.push_back(Elt: I);
18687 continue;
18688 }
18689 if (Idx >= VecTy->getNumElements()) {
18690 UndefVectorExtracts.push_back(Elt: I);
18691 continue;
18692 }
18693 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
18694 ExtractMask.reset(Idx: *Idx);
18695 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
18696 UndefVectorExtracts.push_back(Elt: I);
18697 continue;
18698 }
18699 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
18700 }
18701 // Sort the vector operands by the maximum number of uses in extractelements.
18702 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
18703 VectorOpToIdx.takeVector();
18704 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
18705 return P1.second.size() > P2.second.size();
18706 });
18707 // Find the best pair of the vectors or a single vector.
18708 const int UndefSz = UndefVectorExtracts.size();
18709 unsigned SingleMax = 0;
18710 unsigned PairMax = 0;
18711 if (!Vectors.empty()) {
18712 SingleMax = Vectors.front().second.size() + UndefSz;
18713 if (Vectors.size() > 1) {
18714 auto *ItNext = std::next(x: Vectors.begin());
18715 PairMax = SingleMax + ItNext->second.size();
18716 }
18717 }
18718 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
18719 return std::nullopt;
18720 // Check if better to perform a shuffle of 2 vectors or just of a single
18721 // vector.
18722 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
18723 SmallVector<Value *> GatheredExtracts(
18724 VL.size(), PoisonValue::get(T: VL.front()->getType()));
18725 if (SingleMax >= PairMax && SingleMax) {
18726 for (int Idx : Vectors.front().second)
18727 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
18728 } else if (!Vectors.empty()) {
18729 for (unsigned Idx : {0, 1})
18730 for (int Idx : Vectors[Idx].second)
18731 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
18732 }
18733 // Add extracts from undefs too.
18734 for (int Idx : UndefVectorExtracts)
18735 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
18736 // Check that gather of extractelements can be represented as just a
18737 // shuffle of a single/two vectors the scalars are extracted from.
18738 std::optional<TTI::ShuffleKind> Res =
18739 isFixedVectorShuffle(VL: GatheredExtracts, Mask, AC);
18740 if (!Res || all_of(Range&: Mask, P: equal_to(Arg: PoisonMaskElem))) {
18741 // TODO: try to check other subsets if possible.
18742 // Restore the original VL if attempt was not successful.
18743 copy(Range&: SavedVL, Out: VL.begin());
18744 return std::nullopt;
18745 }
18746 // Restore unused scalars from mask, if some of the extractelements were not
18747 // selected for shuffle.
18748 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
18749 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
18750 isa<UndefValue>(Val: GatheredExtracts[I])) {
18751 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
18752 continue;
18753 }
18754 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
18755 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
18756 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
18757 is_contained(Range&: UndefVectorExtracts, Element: I))
18758 continue;
18759 }
18760 return Res;
18761}
18762
18763/// Tries to find extractelement instructions with constant indices from fixed
18764/// vector type and gather such instructions into a bunch, which highly likely
18765/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
18766/// successful, the matched scalars are replaced by poison values in \p VL for
18767/// future analysis.
18768SmallVector<std::optional<TTI::ShuffleKind>>
18769BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
18770 SmallVectorImpl<int> &Mask,
18771 unsigned NumParts) const {
18772 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
18773 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
18774 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
18775 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
18776 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
18777 // Scan list of gathered scalars for extractelements that can be represented
18778 // as shuffles.
18779 const unsigned PartOffset = Part * SliceSize;
18780 const unsigned PartSize = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
18781 // It may happen in case of revec, need to check no access out of bounds.
18782 if (PartOffset + PartSize > VL.size())
18783 break;
18784 MutableArrayRef<Value *> SubVL =
18785 MutableArrayRef(VL).slice(N: PartOffset, M: PartSize);
18786 SmallVector<int> SubMask;
18787 std::optional<TTI::ShuffleKind> Res =
18788 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
18789 ShufflesRes[Part] = Res;
18790 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
18791 }
18792 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
18793 return Res.has_value();
18794 }))
18795 ShufflesRes.clear();
18796 return ShufflesRes;
18797}
18798
18799std::optional<TargetTransformInfo::ShuffleKind>
18800BoUpSLP::isGatherShuffledSingleRegisterEntry(
18801 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
18802 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
18803 Entries.clear();
18804 if (TE->Idx == 0)
18805 return std::nullopt;
18806 // TODO: currently checking only for Scalars in the tree entry, need to count
18807 // reused elements too for better cost estimation.
18808 auto GetUserEntry = [&](const TreeEntry *TE) {
18809 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18810 TE = TE->UserTreeIndex.UserTE;
18811 if (TE == VectorizableTree.front().get())
18812 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
18813 return TE->UserTreeIndex;
18814 };
18815 auto HasGatherUser = [&](const TreeEntry *TE) {
18816 while (TE->Idx != 0 && TE->UserTreeIndex) {
18817 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18818 return true;
18819 TE = TE->UserTreeIndex.UserTE;
18820 }
18821 return false;
18822 };
18823 const EdgeInfo TEUseEI = GetUserEntry(TE);
18824 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
18825 !TEUseEI.UserTE->hasState()))
18826 return std::nullopt;
18827 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
18828 const BasicBlock *TEInsertBlock = nullptr;
18829 // Main node of PHI entries keeps the correct order of operands/incoming
18830 // blocks.
18831 if (auto *PHI = dyn_cast_or_null<PHINode>(
18832 Val: TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
18833 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
18834 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
18835 TEInsertPt = TEInsertBlock->getTerminator();
18836 } else {
18837 TEInsertBlock = TEInsertPt->getParent();
18838 }
18839 if (!DT->isReachableFromEntry(A: TEInsertBlock))
18840 return std::nullopt;
18841 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
18842 assert(NodeUI && "Should only process reachable instructions");
18843 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
18844 auto CheckOrdering = [&](const Instruction *InsertPt) {
18845 // Argument InsertPt is an instruction where vector code for some other
18846 // tree entry (one that shares one or more scalars with TE) is going to be
18847 // generated. This lambda returns true if insertion point of vector code
18848 // for the TE dominates that point (otherwise dependency is the other way
18849 // around). The other node is not limited to be of a gather kind. Gather
18850 // nodes are not scheduled and their vector code is inserted before their
18851 // first user. If user is PHI, that is supposed to be at the end of a
18852 // predecessor block. Otherwise it is the last instruction among scalars of
18853 // the user node. So, instead of checking dependency between instructions
18854 // themselves, we check dependency between their insertion points for vector
18855 // code (since each scalar instruction ends up as a lane of a vector
18856 // instruction).
18857 const BasicBlock *InsertBlock = InsertPt->getParent();
18858 auto *NodeEUI = DT->getNode(BB: InsertBlock);
18859 if (!NodeEUI)
18860 return false;
18861 assert((NodeUI == NodeEUI) ==
18862 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
18863 "Different nodes should have different DFS numbers");
18864 // Check the order of the gather nodes users.
18865 if (TEInsertPt->getParent() != InsertBlock &&
18866 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
18867 return false;
18868 if (TEInsertPt->getParent() == InsertBlock &&
18869 TEInsertPt->comesBefore(Other: InsertPt))
18870 return false;
18871 return true;
18872 };
18873 // Find all tree entries used by the gathered values. If no common entries
18874 // found - not a shuffle.
18875 // Here we build a set of tree nodes for each gathered value and trying to
18876 // find the intersection between these sets. If we have at least one common
18877 // tree node for each gathered value - we have just a permutation of the
18878 // single vector. If we have 2 different sets, we're in situation where we
18879 // have a permutation of 2 input vectors.
18880 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
18881 SmallDenseMap<Value *, int> UsedValuesEntry;
18882 SmallPtrSet<const Value *, 16> VisitedValue;
18883 bool IsReusedNodeFound = false;
18884 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
18885 // The node is reused - exit.
18886 if (IsReusedNodeFound)
18887 return false;
18888 if ((TEPtr->getVectorFactor() != VL.size() &&
18889 TEPtr->Scalars.size() != VL.size()) ||
18890 (!TEPtr->isSame(VL) && !TEPtr->isSame(VL: TE->Scalars)))
18891 return false;
18892 IsReusedNodeFound =
18893 equal(LRange: TE->Scalars, RRange: TEPtr->Scalars) &&
18894 equal(LRange: TE->ReorderIndices, RRange: TEPtr->ReorderIndices) &&
18895 equal(LRange: TE->ReuseShuffleIndices, RRange: TEPtr->ReuseShuffleIndices);
18896 UsedTEs.clear();
18897 UsedTEs.emplace_back().insert(Ptr: TEPtr);
18898 for (Value *V : VL) {
18899 if (isConstant(V))
18900 continue;
18901 UsedValuesEntry.try_emplace(Key: V, Args: 0);
18902 }
18903 return true;
18904 };
18905 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
18906 unsigned EdgeIdx) {
18907 const TreeEntry *Ptr1 = User1;
18908 const TreeEntry *Ptr2 = User2;
18909 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
18910 while (Ptr2) {
18911 PtrToIdx.try_emplace(Key: Ptr2, Args&: EdgeIdx);
18912 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
18913 Ptr2 = Ptr2->UserTreeIndex.UserTE;
18914 }
18915 while (Ptr1) {
18916 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
18917 Ptr1 = Ptr1->UserTreeIndex.UserTE;
18918 if (auto It = PtrToIdx.find(Val: Ptr1); It != PtrToIdx.end())
18919 return Idx < It->second;
18920 }
18921 return false;
18922 };
18923 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
18924 Instruction *InsertPt) {
18925 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
18926 !TEUseEI.UserTE->isCopyableElement(
18927 V: const_cast<Instruction *>(TEInsertPt)) &&
18928 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
18929 InsertPt->getNextNode() == TEInsertPt &&
18930 (!E->hasCopyableElements() || !E->isCopyableElement(V: InsertPt) ||
18931 !isUsedOutsideBlock(V: InsertPt));
18932 };
18933 for (Value *V : VL) {
18934 if (isConstant(V) || !VisitedValue.insert(Ptr: V).second)
18935 continue;
18936 // Build a list of tree entries where V is used.
18937 SmallPtrSet<const TreeEntry *, 4> VToTEs;
18938 SmallVector<const TreeEntry *> GatherNodes(
18939 ValueToGatherNodes.lookup(Val: V).takeVector());
18940 if (TransformedToGatherNodes.contains(Val: TE)) {
18941 for (TreeEntry *E : getSplitTreeEntries(V)) {
18942 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
18943 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18944 continue;
18945 GatherNodes.push_back(Elt: E);
18946 }
18947 for (TreeEntry *E : getTreeEntries(V)) {
18948 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
18949 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18950 continue;
18951 GatherNodes.push_back(Elt: E);
18952 }
18953 }
18954 for (const TreeEntry *TEPtr : GatherNodes) {
18955 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(Ptr: TEPtr))
18956 continue;
18957 assert(any_of(TEPtr->Scalars,
18958 [&](Value *V) { return GatheredScalars.contains(V); }) &&
18959 "Must contain at least single gathered value.");
18960 assert(TEPtr->UserTreeIndex &&
18961 "Expected only single user of a gather node.");
18962 if (any_of(Range: TEPtr->CombinedEntriesWithIndices,
18963 P: [&](const auto &P) { return P.first == TE->Idx; }))
18964 continue;
18965 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
18966
18967 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
18968 UseEI.UserTE->hasState())
18969 ? dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp())
18970 : nullptr;
18971 Instruction *InsertPt =
18972 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
18973 : &getLastInstructionInBundle(E: UseEI.UserTE);
18974 if (TEInsertPt == InsertPt) {
18975 // Check nodes, which might be emitted first.
18976 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18977 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
18978 TEUseEI.UserTE->isAltShuffle()) &&
18979 all_of(Range&: TEUseEI.UserTE->Scalars, P: isUsedOutsideBlock)) {
18980 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
18981 (UseEI.UserTE->hasState() &&
18982 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18983 !UseEI.UserTE->isAltShuffle()) ||
18984 !all_of(Range&: UseEI.UserTE->Scalars, P: isUsedOutsideBlock))
18985 continue;
18986 }
18987
18988 // If the schedulable insertion point is used in multiple entries - just
18989 // exit, no known ordering at this point, available only after real
18990 // scheduling.
18991 if (!doesNotNeedToBeScheduled(V: InsertPt) &&
18992 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
18993 continue;
18994 // If the users are the PHI nodes with the same incoming blocks - skip.
18995 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18996 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
18997 UseEI.UserTE->State == TreeEntry::Vectorize &&
18998 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18999 TEUseEI.UserTE != UseEI.UserTE)
19000 continue;
19001 // If 2 gathers are operands of the same entry (regardless of whether
19002 // user is PHI or else), compare operands indices, use the earlier one
19003 // as the base.
19004 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
19005 continue;
19006 // If the user instruction is used for some reason in different
19007 // vectorized nodes - make it depend on index.
19008 if (TEUseEI.UserTE != UseEI.UserTE &&
19009 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
19010 HasGatherUser(TEUseEI.UserTE)))
19011 continue;
19012 // If the user node is the operand of the other user node - skip.
19013 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
19014 continue;
19015 }
19016
19017 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
19018 TEUseEI.UserTE->doesNotNeedToSchedule() !=
19019 UseEI.UserTE->doesNotNeedToSchedule() &&
19020 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))
19021 continue;
19022 // Check if the user node of the TE comes after user node of TEPtr,
19023 // otherwise TEPtr depends on TE.
19024 if ((TEInsertBlock != InsertPt->getParent() ||
19025 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
19026 (!CheckOrdering(InsertPt) ||
19027 (UseEI.UserTE->hasCopyableElements() &&
19028 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
19029 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))))
19030 continue;
19031 // The node is reused - exit.
19032 if (CheckAndUseSameNode(TEPtr))
19033 break;
19034 // The parent node is copyable with last inst used outside? And the last
19035 // inst is the next inst for the lastinst of TEPtr? Exit, if yes, to
19036 // preserve def-use chain.
19037 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
19038 continue;
19039 VToTEs.insert(Ptr: TEPtr);
19040 }
19041 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
19042 const auto *It = find_if(Range&: VTEs, P: [&](const TreeEntry *MTE) {
19043 return MTE != TE && MTE != TEUseEI.UserTE &&
19044 !DeletedNodes.contains(Ptr: MTE) &&
19045 !TransformedToGatherNodes.contains(Val: MTE);
19046 });
19047 if (It != VTEs.end()) {
19048 const TreeEntry *VTE = *It;
19049 if (none_of(Range: TE->CombinedEntriesWithIndices,
19050 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
19051 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
19052 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
19053 continue;
19054 }
19055 // The node is reused - exit.
19056 if (CheckAndUseSameNode(VTE))
19057 break;
19058 VToTEs.insert(Ptr: VTE);
19059 }
19060 }
19061 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
19062 const auto *It = find_if(Range&: VTEs, P: [&, MainTE = TE](const TreeEntry *TE) {
19063 return TE != MainTE && !DeletedNodes.contains(Ptr: TE) &&
19064 !TransformedToGatherNodes.contains(Val: TE);
19065 });
19066 if (It != VTEs.end()) {
19067 const TreeEntry *VTE = *It;
19068 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(u: 0) &&
19069 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
19070 VTEs = VTEs.drop_front();
19071 // Iterate through all vectorized nodes.
19072 const auto *MIt = find_if(Range&: VTEs, P: [](const TreeEntry *MTE) {
19073 return MTE->State == TreeEntry::Vectorize;
19074 });
19075 if (MIt == VTEs.end())
19076 continue;
19077 VTE = *MIt;
19078 }
19079 if (none_of(Range: TE->CombinedEntriesWithIndices,
19080 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
19081 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
19082 if (&LastBundleInst == TEInsertPt ||
19083 !CheckOrdering(&LastBundleInst) ||
19084 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
19085 continue;
19086 }
19087 // The node is reused - exit.
19088 if (CheckAndUseSameNode(VTE))
19089 break;
19090 VToTEs.insert(Ptr: VTE);
19091 }
19092 }
19093 if (IsReusedNodeFound)
19094 break;
19095 if (VToTEs.empty())
19096 continue;
19097 if (UsedTEs.empty()) {
19098 // The first iteration, just insert the list of nodes to vector.
19099 UsedTEs.push_back(Elt: VToTEs);
19100 UsedValuesEntry.try_emplace(Key: V, Args: 0);
19101 } else {
19102 // Need to check if there are any previously used tree nodes which use V.
19103 // If there are no such nodes, consider that we have another one input
19104 // vector.
19105 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
19106 unsigned Idx = 0;
19107 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
19108 // Do we have a non-empty intersection of previously listed tree entries
19109 // and tree entries using current V?
19110 set_intersect(S1&: VToTEs, S2: Set);
19111 if (!VToTEs.empty()) {
19112 // Yes, write the new subset and continue analysis for the next
19113 // scalar.
19114 Set.swap(RHS&: VToTEs);
19115 break;
19116 }
19117 VToTEs = SavedVToTEs;
19118 ++Idx;
19119 }
19120 // No non-empty intersection found - need to add a second set of possible
19121 // source vectors.
19122 if (Idx == UsedTEs.size()) {
19123 // If the number of input vectors is greater than 2 - not a permutation,
19124 // fallback to the regular gather.
19125 // TODO: support multiple reshuffled nodes.
19126 if (UsedTEs.size() == 2)
19127 continue;
19128 UsedTEs.push_back(Elt: SavedVToTEs);
19129 Idx = UsedTEs.size() - 1;
19130 }
19131 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
19132 }
19133 }
19134
19135 if (UsedTEs.empty()) {
19136 Entries.clear();
19137 return std::nullopt;
19138 }
19139
19140 unsigned VF = 0;
19141 if (UsedTEs.size() == 1) {
19142 // Keep the order to avoid non-determinism.
19143 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
19144 UsedTEs.front().end());
19145 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
19146 return TE1->Idx < TE2->Idx;
19147 });
19148 // Try to find the perfect match in another gather node at first.
19149 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
19150 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
19151 });
19152 if (It != FirstEntries.end() &&
19153 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
19154 ((*It)->getVectorFactor() == TE->Scalars.size() &&
19155 TE->ReuseShuffleIndices.size() == VL.size() &&
19156 (*It)->isSame(VL: TE->Scalars)))) {
19157 Entries.push_back(Elt: *It);
19158 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
19159 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
19160 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
19161 } else {
19162 SmallVector<int> CommonMask = TE->getCommonMask();
19163 copy(Range&: CommonMask, Out: Mask.begin());
19164 }
19165 // Clear undef scalars.
19166 for (unsigned I : seq<unsigned>(Size: VL.size()))
19167 if (isa<PoisonValue>(Val: VL[I]))
19168 Mask[Part * VL.size() + I] = PoisonMaskElem;
19169 return TargetTransformInfo::SK_PermuteSingleSrc;
19170 }
19171 // No perfect match, just shuffle, so choose the first tree node from the
19172 // tree.
19173 Entries.push_back(Elt: FirstEntries.front());
19174 // Update mapping between values and corresponding tree entries.
19175 for (auto &P : UsedValuesEntry)
19176 P.second = 0;
19177 VF = FirstEntries.front()->getVectorFactor();
19178 } else {
19179 // Try to find nodes with the same vector factor.
19180 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
19181 // Keep the order of tree nodes to avoid non-determinism.
19182 DenseMap<int, const TreeEntry *> VFToTE;
19183 for (const TreeEntry *TE : UsedTEs.front()) {
19184 unsigned VF = TE->getVectorFactor();
19185 auto It = VFToTE.find(Val: VF);
19186 if (It != VFToTE.end()) {
19187 if (It->second->Idx > TE->Idx)
19188 It->getSecond() = TE;
19189 continue;
19190 }
19191 VFToTE.try_emplace(Key: VF, Args&: TE);
19192 }
19193 // Same, keep the order to avoid non-determinism.
19194 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
19195 UsedTEs.back().end());
19196 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
19197 return TE1->Idx < TE2->Idx;
19198 });
19199 for (const TreeEntry *TE : SecondEntries) {
19200 auto It = VFToTE.find(Val: TE->getVectorFactor());
19201 if (It != VFToTE.end()) {
19202 VF = It->first;
19203 Entries.push_back(Elt: It->second);
19204 Entries.push_back(Elt: TE);
19205 break;
19206 }
19207 }
19208 // No 2 source vectors with the same vector factor - just choose 2 with max
19209 // index.
19210 if (Entries.empty()) {
19211 Entries.push_back(Elt: *llvm::max_element(
19212 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
19213 return TE1->Idx < TE2->Idx;
19214 }));
19215 Entries.push_back(Elt: SecondEntries.front());
19216 VF = std::max(a: Entries.front()->getVectorFactor(),
19217 b: Entries.back()->getVectorFactor());
19218 } else {
19219 VF = Entries.front()->getVectorFactor();
19220 }
19221 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
19222 for (const TreeEntry *E : Entries)
19223 ValuesToEntries.emplace_back().insert(I: E->Scalars.begin(),
19224 E: E->Scalars.end());
19225 // Update mapping between values and corresponding tree entries.
19226 for (auto &P : UsedValuesEntry) {
19227 for (unsigned Idx : seq<unsigned>(Size: ValuesToEntries.size()))
19228 if (ValuesToEntries[Idx].contains(Ptr: P.first)) {
19229 P.second = Idx;
19230 break;
19231 }
19232 }
19233 }
19234
19235 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
19236 // Checks if the 2 PHIs are compatible in terms of high possibility to be
19237 // vectorized.
19238 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
19239 auto *PHI = cast<PHINode>(Val: V);
19240 auto *PHI1 = cast<PHINode>(Val: V1);
19241 // Check that all incoming values are compatible/from same parent (if they
19242 // are instructions).
19243 // The incoming values are compatible if they all are constants, or
19244 // instruction with the same/alternate opcodes from the same basic block.
19245 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
19246 Value *In = PHI->getIncomingValue(i: I);
19247 Value *In1 = PHI1->getIncomingValue(i: I);
19248 if (isConstant(V: In) && isConstant(V: In1))
19249 continue;
19250 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI))
19251 return false;
19252 if (cast<Instruction>(Val: In)->getParent() !=
19253 cast<Instruction>(Val: In1)->getParent())
19254 return false;
19255 }
19256 return true;
19257 };
19258 // Check if the value can be ignored during analysis for shuffled gathers.
19259 // We suppose it is better to ignore instruction, which do not form splats,
19260 // are not vectorized/not extractelements (these instructions will be handled
19261 // by extractelements processing) or may form vector node in future.
19262 auto MightBeIgnored = [=](Value *V) {
19263 auto *I = dyn_cast<Instruction>(Val: V);
19264 return I && !IsSplatOrUndefs && !isVectorized(V: I) &&
19265 !isVectorLikeInstWithConstOps(V: I) &&
19266 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
19267 };
19268 // Check that the neighbor instruction may form a full vector node with the
19269 // current instruction V. It is possible, if they have same/alternate opcode
19270 // and same parent basic block.
19271 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
19272 Value *V1 = VL[Idx];
19273 bool UsedInSameVTE = false;
19274 auto It = UsedValuesEntry.find(Val: V1);
19275 if (It != UsedValuesEntry.end())
19276 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
19277 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
19278 getSameOpcode(VL: {V, V1}, TLI: *TLI) &&
19279 cast<Instruction>(Val: V)->getParent() ==
19280 cast<Instruction>(Val: V1)->getParent() &&
19281 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
19282 };
19283 // Build a shuffle mask for better cost estimation and vector emission.
19284 SmallBitVector UsedIdxs(Entries.size());
19285 SmallVector<std::pair<unsigned, int>> EntryLanes;
19286 for (int I = 0, E = VL.size(); I < E; ++I) {
19287 Value *V = VL[I];
19288 auto It = UsedValuesEntry.find(Val: V);
19289 if (It == UsedValuesEntry.end())
19290 continue;
19291 // Do not try to shuffle scalars, if they are constants, or instructions
19292 // that can be vectorized as a result of the following vector build
19293 // vectorization.
19294 if (isConstant(V) || (MightBeIgnored(V) &&
19295 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
19296 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
19297 continue;
19298 unsigned Idx = It->second;
19299 EntryLanes.emplace_back(Args&: Idx, Args&: I);
19300 UsedIdxs.set(Idx);
19301 }
19302 // Iterate through all shuffled scalars and select entries, which can be used
19303 // for final shuffle.
19304 SmallVector<const TreeEntry *> TempEntries;
19305 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
19306 if (!UsedIdxs.test(Idx: I))
19307 continue;
19308 // Fix the entry number for the given scalar. If it is the first entry, set
19309 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
19310 // These indices are used when calculating final shuffle mask as the vector
19311 // offset.
19312 for (std::pair<unsigned, int> &Pair : EntryLanes)
19313 if (Pair.first == I)
19314 Pair.first = TempEntries.size();
19315 TempEntries.push_back(Elt: Entries[I]);
19316 }
19317 Entries.swap(RHS&: TempEntries);
19318 if (EntryLanes.size() == Entries.size() &&
19319 !VL.equals(RHS: ArrayRef(TE->Scalars)
19320 .slice(N: Part * VL.size(),
19321 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
19322 // We may have here 1 or 2 entries only. If the number of scalars is equal
19323 // to the number of entries, no need to do the analysis, it is not very
19324 // profitable. Since VL is not the same as TE->Scalars, it means we already
19325 // have some shuffles before. Cut off not profitable case.
19326 Entries.clear();
19327 return std::nullopt;
19328 }
19329 // Build the final mask, check for the identity shuffle, if possible.
19330 bool IsIdentity = Entries.size() == 1;
19331 // Pair.first is the offset to the vector, while Pair.second is the index of
19332 // scalar in the list.
19333 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
19334 unsigned Idx = Part * VL.size() + Pair.second;
19335 Mask[Idx] =
19336 Pair.first * VF +
19337 (ForOrder ? std::distance(
19338 first: Entries[Pair.first]->Scalars.begin(),
19339 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
19340 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
19341 IsIdentity &= Mask[Idx] == Pair.second;
19342 }
19343 if (ForOrder || IsIdentity || Entries.empty()) {
19344 switch (Entries.size()) {
19345 case 1:
19346 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
19347 return TargetTransformInfo::SK_PermuteSingleSrc;
19348 break;
19349 case 2:
19350 if (EntryLanes.size() > 2 || VL.size() <= 2)
19351 return TargetTransformInfo::SK_PermuteTwoSrc;
19352 break;
19353 default:
19354 break;
19355 }
19356 } else if (!isa<VectorType>(Val: VL.front()->getType()) &&
19357 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
19358 // Do the cost estimation if shuffle beneficial than buildvector.
19359 SmallVector<int> SubMask(std::next(x: Mask.begin(), n: Part * VL.size()),
19360 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()));
19361 int MinElement = SubMask.front(), MaxElement = SubMask.front();
19362 for (int Idx : SubMask) {
19363 if (Idx == PoisonMaskElem)
19364 continue;
19365 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
19366 MinElement = Idx;
19367 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
19368 MaxElement = Idx;
19369 }
19370 assert(MaxElement >= 0 && MinElement >= 0 &&
19371 MaxElement % VF >= MinElement % VF &&
19372 "Expected at least single element.");
19373 unsigned NewVF = std::max<unsigned>(
19374 a: VL.size(), b: getFullVectorNumberOfElements(TTI: *TTI, Ty: VL.front()->getType(),
19375 Sz: (MaxElement % VF) -
19376 (MinElement % VF) + 1));
19377 if (NewVF < VF) {
19378 for (int &Idx : SubMask) {
19379 if (Idx == PoisonMaskElem)
19380 continue;
19381 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
19382 (Idx >= static_cast<int>(VF) ? NewVF : 0);
19383 }
19384 } else {
19385 NewVF = VF;
19386 }
19387
19388 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19389 auto *VecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: NewVF);
19390 auto *MaskVecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: SubMask.size());
19391 auto GetShuffleCost = [&,
19392 &TTI = *TTI](ArrayRef<int> Mask,
19393 ArrayRef<const TreeEntry *> Entries,
19394 VectorType *VecTy) -> InstructionCost {
19395 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
19396 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
19397 Mask, Factor: Entries.front()->getInterleaveFactor()))
19398 return TTI::TCC_Free;
19399 return ::getShuffleCost(TTI,
19400 Kind: Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
19401 : TTI::SK_PermuteSingleSrc,
19402 Tp: VecTy, Mask, CostKind);
19403 };
19404 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
19405 InstructionCost FirstShuffleCost = 0;
19406 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
19407 if (Entries.size() == 1 || !Entries[0]->isGather()) {
19408 FirstShuffleCost = ShuffleCost;
19409 } else {
19410 // Transform mask to include only first entry.
19411 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
19412 bool IsIdentity = true;
19413 for (auto [I, Idx] : enumerate(First&: FirstMask)) {
19414 if (Idx >= static_cast<int>(NewVF)) {
19415 Idx = PoisonMaskElem;
19416 } else {
19417 DemandedElts.clearBit(BitPosition: I);
19418 if (Idx != PoisonMaskElem)
19419 IsIdentity &= static_cast<int>(I) == Idx;
19420 }
19421 }
19422 if (!IsIdentity)
19423 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
19424 FirstShuffleCost += getScalarizationOverhead(
19425 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
19426 /*Extract=*/false, CostKind);
19427 }
19428 InstructionCost SecondShuffleCost = 0;
19429 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
19430 if (Entries.size() == 1 || !Entries[1]->isGather()) {
19431 SecondShuffleCost = ShuffleCost;
19432 } else {
19433 // Transform mask to include only first entry.
19434 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
19435 bool IsIdentity = true;
19436 for (auto [I, Idx] : enumerate(First&: SecondMask)) {
19437 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
19438 Idx = PoisonMaskElem;
19439 } else {
19440 DemandedElts.clearBit(BitPosition: I);
19441 if (Idx != PoisonMaskElem) {
19442 Idx -= NewVF;
19443 IsIdentity &= static_cast<int>(I) == Idx;
19444 }
19445 }
19446 }
19447 if (!IsIdentity)
19448 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
19449 SecondShuffleCost += getScalarizationOverhead(
19450 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
19451 /*Extract=*/false, CostKind);
19452 }
19453 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
19454 for (auto [I, Idx] : enumerate(First&: SubMask))
19455 if (Idx == PoisonMaskElem)
19456 DemandedElts.clearBit(BitPosition: I);
19457 InstructionCost BuildVectorCost = getScalarizationOverhead(
19458 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
19459 /*Extract=*/false, CostKind);
19460 const TreeEntry *BestEntry = nullptr;
19461 if (FirstShuffleCost < ShuffleCost) {
19462 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
19463 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
19464 f: [&](int &Idx) {
19465 if (Idx >= static_cast<int>(VF))
19466 Idx = PoisonMaskElem;
19467 });
19468 BestEntry = Entries.front();
19469 ShuffleCost = FirstShuffleCost;
19470 }
19471 if (SecondShuffleCost < ShuffleCost) {
19472 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
19473 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
19474 f: [&](int &Idx) {
19475 if (Idx < static_cast<int>(VF))
19476 Idx = PoisonMaskElem;
19477 else
19478 Idx -= VF;
19479 });
19480 BestEntry = Entries[1];
19481 ShuffleCost = SecondShuffleCost;
19482 }
19483 if (BuildVectorCost >= ShuffleCost) {
19484 if (BestEntry) {
19485 Entries.clear();
19486 Entries.push_back(Elt: BestEntry);
19487 }
19488 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
19489 : TargetTransformInfo::SK_PermuteSingleSrc;
19490 }
19491 }
19492 Entries.clear();
19493 // Clear the corresponding mask elements.
19494 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
19495 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
19496 return std::nullopt;
19497}
19498
/// Checks whether the gather node \p TE (with requested scalars \p VL) can be
/// emitted as one shuffle per vector register instead of a scalar
/// buildvector. \p VL is split into \p NumParts slices and each slice is
/// matched independently against already-built tree entries; matched source
/// entries are appended to \p Entries (one vector of entries per part) and the
/// combined shuffle mask is written into \p Mask. Returns one optional
/// shuffle kind per part (std::nullopt for a part that must remain a gather),
/// or an empty vector if no part can be represented as a shuffle.
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(Range: ArrayRef(VectorizableTree).drop_front(),
               P: [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
    return {};
  Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  // Bail out for a gather node hanging off another gather node via a
  // non-real edge (EdgeIdx == UINT_MAX) when the node is the root, is built
  // of extractelements, is a splat, or duplicates another entry with the
  // same scalars - such nodes are not shuffle candidates here.
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(VL: TE->Scalars) ||
       (TE->hasState() &&
        getSameValuesTreeEntry(V: TE->getMainOp(), VL: TE->Scalars))))
    return {};
  unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  // Analyze each register-sized slice of VL separately.
  for (unsigned Part : seq<unsigned>(Size: NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(Elt: SubRes);
    // If a part matched a single source entry whose vector factor covers the
    // whole of VL and whose scalars are the same as VL (or TE's scalars),
    // drop the per-part analysis and return a single full-width identity
    // permute of that entry.
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(VL: TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(RHS&: SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(Val: VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
      Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  // If no part at all can be shuffled, report failure for the whole node.
  if (all_of(Range&: Res,
             P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
19569
19570InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
19571 Type *ScalarTy) const {
19572 const unsigned VF = VL.size();
19573 auto *VecTy = getWidenedType(ScalarTy, VF);
19574 // Find the cost of inserting/extracting values from the vector.
19575 // Check if the same elements are inserted several times and count them as
19576 // shuffle candidates.
19577 APInt DemandedElements = APInt::getZero(numBits: VF);
19578 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
19579 InstructionCost Cost;
19580 auto EstimateInsertCost = [&](unsigned I, Value *V) {
19581 DemandedElements.setBit(I);
19582 if (V->getType() != ScalarTy)
19583 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
19584 CCH: TTI::CastContextHint::None, CostKind);
19585 };
19586 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
19587 std::iota(first: ConstantShuffleMask.begin(), last: ConstantShuffleMask.end(), value: 0);
19588 for (auto [I, V] : enumerate(First&: VL)) {
19589 // No need to shuffle duplicates for constants.
19590 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V))
19591 continue;
19592
19593 if (isConstant(V)) {
19594 ConstantShuffleMask[I] = I + VF;
19595 continue;
19596 }
19597 EstimateInsertCost(I, V);
19598 }
19599 // FIXME: add a cost for constant vector materialization.
19600 bool IsAnyNonUndefConst =
19601 any_of(Range&: VL, P: [](Value *V) { return !isa<UndefValue>(Val: V) && isConstant(V); });
19602 // 1. Shuffle input source vector and constant vector.
19603 if (!ForPoisonSrc && IsAnyNonUndefConst) {
19604 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: VecTy,
19605 Mask: ConstantShuffleMask);
19606 }
19607
19608 // 2. Insert unique non-constants.
19609 if (!DemandedElements.isZero())
19610 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: DemandedElements,
19611 /*Insert=*/true,
19612 /*Extract=*/false, CostKind,
19613 ForPoisonSrc: ForPoisonSrc && !IsAnyNonUndefConst, VL);
19614 return Cost;
19615}
19616
19617Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
19618 auto It = EntryToLastInstruction.find(Val: E);
19619 if (It != EntryToLastInstruction.end())
19620 return *cast<Instruction>(Val&: It->second);
19621 Instruction *Res = nullptr;
19622 // Get the basic block this bundle is in. All instructions in the bundle
19623 // should be in this block (except for extractelement-like instructions with
19624 // constant indices or gathered loads or copyables).
19625 Instruction *Front;
19626 unsigned Opcode;
19627 if (E->hasState()) {
19628 Front = E->getMainOp();
19629 Opcode = E->getOpcode();
19630 } else {
19631 Front = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: IsaPred<Instruction>));
19632 Opcode = Front->getOpcode();
19633 }
19634 auto *BB = Front->getParent();
19635 assert(
19636 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
19637 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
19638 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
19639 all_of(E->Scalars,
19640 [=](Value *V) -> bool {
19641 if (Opcode == Instruction::GetElementPtr &&
19642 !isa<GetElementPtrInst>(V))
19643 return true;
19644 auto *I = dyn_cast<Instruction>(V);
19645 return !I || !E->getMatchingMainOpOrAltOp(I) ||
19646 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
19647 })) &&
19648 "Expected gathered loads or GEPs or instructions from same basic "
19649 "block.");
19650
19651 auto FindLastInst = [&]() {
19652 Instruction *LastInst = Front;
19653 for (Value *V : E->Scalars) {
19654 auto *I = dyn_cast<Instruction>(Val: V);
19655 if (!I)
19656 continue;
19657 if (E->isCopyableElement(V: I))
19658 continue;
19659 if (LastInst->getParent() == I->getParent()) {
19660 if (LastInst->comesBefore(Other: I))
19661 LastInst = I;
19662 continue;
19663 }
19664 assert(((Opcode == Instruction::GetElementPtr &&
19665 !isa<GetElementPtrInst>(I)) ||
19666 E->State == TreeEntry::SplitVectorize ||
19667 (isVectorLikeInstWithConstOps(LastInst) &&
19668 isVectorLikeInstWithConstOps(I)) ||
19669 (GatheredLoadsEntriesFirst.has_value() &&
19670 Opcode == Instruction::Load && E->isGather() &&
19671 E->Idx < *GatheredLoadsEntriesFirst)) &&
19672 "Expected vector-like or non-GEP in GEP node insts only.");
19673 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
19674 LastInst = I;
19675 continue;
19676 }
19677 if (!DT->isReachableFromEntry(A: I->getParent()))
19678 continue;
19679 auto *NodeA = DT->getNode(BB: LastInst->getParent());
19680 auto *NodeB = DT->getNode(BB: I->getParent());
19681 assert(NodeA && "Should only process reachable instructions");
19682 assert(NodeB && "Should only process reachable instructions");
19683 assert((NodeA == NodeB) ==
19684 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
19685 "Different nodes should have different DFS numbers");
19686 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
19687 LastInst = I;
19688 }
19689 BB = LastInst->getParent();
19690 return LastInst;
19691 };
19692
19693 auto FindFirstInst = [&]() {
19694 Instruction *FirstInst = Front;
19695 for (Value *V : E->Scalars) {
19696 auto *I = dyn_cast<Instruction>(Val: V);
19697 if (!I)
19698 continue;
19699 if (E->isCopyableElement(V: I))
19700 continue;
19701 if (FirstInst->getParent() == I->getParent()) {
19702 if (I->comesBefore(Other: FirstInst))
19703 FirstInst = I;
19704 continue;
19705 }
19706 assert(((Opcode == Instruction::GetElementPtr &&
19707 !isa<GetElementPtrInst>(I)) ||
19708 (isVectorLikeInstWithConstOps(FirstInst) &&
19709 isVectorLikeInstWithConstOps(I))) &&
19710 "Expected vector-like or non-GEP in GEP node insts only.");
19711 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
19712 FirstInst = I;
19713 continue;
19714 }
19715 if (!DT->isReachableFromEntry(A: I->getParent()))
19716 continue;
19717 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
19718 auto *NodeB = DT->getNode(BB: I->getParent());
19719 assert(NodeA && "Should only process reachable instructions");
19720 assert(NodeB && "Should only process reachable instructions");
19721 assert((NodeA == NodeB) ==
19722 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
19723 "Different nodes should have different DFS numbers");
19724 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
19725 FirstInst = I;
19726 }
19727 return FirstInst;
19728 };
19729
19730 if (E->State == TreeEntry::SplitVectorize) {
19731 Res = FindLastInst();
19732 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V: Res); !Entries.empty()) {
19733 for (auto *E : Entries) {
19734 auto *I = dyn_cast_or_null<Instruction>(Val&: E->VectorizedValue);
19735 if (!I)
19736 I = &getLastInstructionInBundle(E);
19737 if (Res->getParent() == I->getParent() && Res->comesBefore(Other: I))
19738 Res = I;
19739 }
19740 }
19741 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19742 return *Res;
19743 }
19744
19745 // Set insertpoint for gathered loads to the very first load.
19746 if (GatheredLoadsEntriesFirst.has_value() &&
19747 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
19748 Opcode == Instruction::Load) {
19749 Res = FindFirstInst();
19750 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19751 return *Res;
19752 }
19753
19754 // Set the insert point to the beginning of the basic block if the entry
19755 // should not be scheduled.
19756 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
19757 if (E->isGather())
19758 return nullptr;
19759 // Found previously that the instruction do not need to be scheduled.
19760 const auto *It = BlocksSchedules.find(Key: BB);
19761 if (It == BlocksSchedules.end())
19762 return nullptr;
19763 for (Value *V : E->Scalars) {
19764 auto *I = dyn_cast<Instruction>(Val: V);
19765 if (!I || isa<PHINode>(Val: I) ||
19766 (!E->isCopyableElement(V: I) && doesNotNeedToBeScheduled(V: I)))
19767 continue;
19768 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V: I);
19769 if (Bundles.empty())
19770 continue;
19771 const auto *It = find_if(
19772 Range&: Bundles, P: [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
19773 if (It != Bundles.end())
19774 return *It;
19775 }
19776 return nullptr;
19777 };
19778 const ScheduleBundle *Bundle = FindScheduleBundle(E);
19779 if (!E->isGather() && !Bundle) {
19780 if ((Opcode == Instruction::GetElementPtr &&
19781 any_of(Range: E->Scalars,
19782 P: [](Value *V) {
19783 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
19784 })) ||
19785 (all_of(Range: E->Scalars,
19786 P: [&](Value *V) {
19787 return isa<PoisonValue>(Val: V) ||
19788 (E->Idx == 0 && isa<InsertElementInst>(Val: V)) ||
19789 E->isCopyableElement(V) ||
19790 (!isVectorLikeInstWithConstOps(V) &&
19791 isUsedOutsideBlock(V));
19792 }) &&
19793 (!E->doesNotNeedToSchedule() ||
19794 any_of(Range: E->Scalars,
19795 P: [&](Value *V) {
19796 if (!isa<Instruction>(Val: V) ||
19797 (E->hasCopyableElements() && E->isCopyableElement(V)))
19798 return false;
19799 return !areAllOperandsNonInsts(V);
19800 }) ||
19801 none_of(Range: E->Scalars, P: [&](Value *V) {
19802 if (!isa<Instruction>(Val: V) ||
19803 (E->hasCopyableElements() && E->isCopyableElement(V)))
19804 return false;
19805 return MustGather.contains(Ptr: V);
19806 }))))
19807 Res = FindLastInst();
19808 else
19809 Res = FindFirstInst();
19810 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19811 return *Res;
19812 }
19813
19814 // Find the last instruction. The common case should be that BB has been
19815 // scheduled, and the last instruction is VL.back(). So we start with
19816 // VL.back() and iterate over schedule data until we reach the end of the
19817 // bundle. The end of the bundle is marked by null ScheduleData.
19818 if (Bundle) {
19819 assert(!E->isGather() && "Gathered instructions should not be scheduled");
19820 Res = Bundle->getBundle().back()->getInst();
19821 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19822 return *Res;
19823 }
19824
19825 // LastInst can still be null at this point if there's either not an entry
19826 // for BB in BlocksSchedules or there's no ScheduleData available for
19827 // VL.back(). This can be the case if buildTreeRec aborts for various
19828 // reasons (e.g., the maximum recursion depth is reached, the maximum region
19829 // size is reached, etc.). ScheduleData is initialized in the scheduling
19830 // "dry-run".
19831 //
19832 // If this happens, we can still find the last instruction by brute force. We
19833 // iterate forwards from Front (inclusive) until we either see all
19834 // instructions in the bundle or reach the end of the block. If Front is the
19835 // last instruction in program order, LastInst will be set to Front, and we
19836 // will visit all the remaining instructions in the block.
19837 //
19838 // One of the reasons we exit early from buildTreeRec is to place an upper
19839 // bound on compile-time. Thus, taking an additional compile-time hit here is
19840 // not ideal. However, this should be exceedingly rare since it requires that
19841 // we both exit early from buildTreeRec and that the bundle be out-of-order
19842 // (causing us to iterate all the way to the end of the block).
19843 if (!Res)
19844 Res = FindLastInst();
19845 assert(Res && "Failed to find last instruction in bundle");
19846 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19847 return *Res;
19848}
19849
/// Positions the IR builder at the point where the vectorized code for tree
/// entry \p E must be emitted, and sets the current debug location from the
/// entry's main operation. Depending on the entry kind, the insert point is
/// either directly at the last bundle instruction (e.g. after PHIs) or just
/// past it.
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(Val: LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    // In a landing pad the first non-PHI position is the landingpad
    // instruction itself; step past it, as nothing may be inserted before it.
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(x: LastInstIt);
  }
  // Insert at LastInstIt (i.e. not after LastInst) for PHIs, for non-gather
  // entries that do not need scheduling (or have copyable elements used
  // outside the block), and for gathered-load entries.
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(V: LastInst) &&
         isUsedOutsideBlock(V: LastInst)))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set the
    // debug location to Front.
    Builder.SetInsertPoint(
        TheBB: LastInst->getParent(),
        IP: LastInst->getNextNode()->getIterator());
    if (Instruction *Res = LastInstructionToPos.lookup(Val: LastInst)) {
      // Reuse the previously created position marker for this instruction.
      Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
    } else {
      // Create a throwaway poison load as a stable position marker right
      // after LastInst, point the builder at it, hand it to eraseInstruction
      // (so it is cleaned up with the other dead instructions), and cache it
      // for subsequent calls with the same LastInst.
      Res = Builder.CreateAlignedLoad(Ty: Builder.getPtrTy(),
                                      Ptr: PoisonValue::get(T: Builder.getPtrTy()),
                                      Align: MaybeAlign());
      Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
      eraseInstruction(I: Res);
      LastInstructionToPos.try_emplace(Key: LastInst, Args&: Res);
    }
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
19891
/// Builds a vector value from the scalars in \p VL by emitting a chain of
/// insertelement instructions (plus a shuffle with \p Root, if provided, via
/// \p CreateShuffle). Instructions that belong to the current loop or block
/// are inserted last to improve chances of hoisting the rest out of loops.
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
  // Returns true if InstBB dominates InsertBB through a chain of single
  // predecessors (cycle-safe via the Visited set).
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  // Collect the lanes whose insertion must be postponed to the end.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           isVectorized(V: Inst) ||
           (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(V: I).second)
        PostponedInsts.emplace_back(Args&: Inst, Args&: I);
  }

  // Inserts scalar V into Vec at lane Pos, casting it to element type Ty if
  // needed, and records the insert for CSE/external-use bookkeeping.
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    // Drop NUW from trunc to avoid incorrect codegen.
    Value *Trunced;
    if (match(V: Scalar, P: m_NUWTrunc(Op: m_Value(V&: Trunced))))
      cast<TruncInst>(Val: Scalar)->setHasNoUnsignedWrap(/*B=*/false);
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Value *V = Scalar;
      // If the scalar is a live sext/zext, cast its operand instead so the
      // extension is folded into the new cast.
      if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
        Value *Op = CI->getOperand(i_nocapture: 0);
        if (auto *IOp = dyn_cast<Instruction>(Val: Op);
            !IOp || !(isDeleted(I: IOp) || isVectorized(V: IOp)))
          V = Op;
      }
      Scalar = Builder.CreateIntCast(
          V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
      // Revectorization: the "scalar" is itself a small vector; insert it as
      // a subvector at the matching element offset.
      assert(SLPReVec && "FixedVectorType is not expected.");
      Vec =
          createInsertVector(Builder, Vec, V: Scalar, Index: Pos * getNumElements(Ty: VecTy));
      auto *II = dyn_cast<Instruction>(Val: Vec);
      if (!II)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
      InsElt = dyn_cast<InsertElementInst>(Val: Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(X: InsElt);
    CSEBlocks.insert(V: InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(Val: V)) {
      ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
      // Only entries that are still live (not deleted / not turned back into
      // gathers) require an external-use record.
      const auto *It = find_if(Range&: Entries, P: [&](const TreeEntry *E) {
        return !TransformedToGatherNodes.contains(Val: E) &&
               !DeletedNodes.contains(Ptr: E);
      });
      if (It != Entries.end()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          // The scalar was cast; the cast instruction is the user.
          if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
            UserOp = SI;
        } else {
          if (V->getType()->isVectorTy()) {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: InsElt);
                SV && SV->getOperand(i_nocapture: 0) != V && SV->getOperand(i_nocapture: 1) != V) {
              // Find shufflevector, caused by resize.
              auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
                if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Vec)) {
                  if (SV->getOperand(i_nocapture: 0) == V)
                    return SV;
                  if (SV->getOperand(i_nocapture: 1) == V)
                    return SV;
                }
                return nullptr;
              };
              InsElt = nullptr;
              if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 0), V))
                InsElt = User;
              else if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 1), V))
                InsElt = User;
              assert(InsElt &&
                     "Failed to find shufflevector, caused by resize.");
            }
          }
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = (*It)->findLaneForValue(V);
          ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: **It, Args&: FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
  Value *Vec = PoisonValue::get(T: VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
  Value *OriginalRoot = Root;
  // Look through a single-source shuffle of the root to shuffle its operand
  // directly, folding its mask into ours.
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Val: Root);
      SV && isa<PoisonValue>(Val: SV->getOperand(i_nocapture: 1)) &&
      SV->getOperand(i_nocapture: 0)->getType() == VecTy) {
    Root = SV->getOperand(i_nocapture: 0);
    Mask.assign(in_start: SV->getShuffleMask().begin(), in_end: SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(V: I))
      continue;
    if (!isConstant(V: VL[I])) {
      NonConsts.push_back(Elt: I);
      continue;
    }
    if (isa<PoisonValue>(Val: VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    // Mark the lane as coming from the constant vector (second shuffle arg).
    Mask[I] = I + E;
  }
  if (Root) {
    if (isa<PoisonValue>(Val: Vec)) {
      // No constants inserted; the root alone is the result so far.
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      // The original root shuffle may now be dead; drop it unless some tree
      // entry still refers to it as its vectorized value.
      if (auto *OI = dyn_cast<Instruction>(Val: OriginalRoot);
          OI && OI->use_empty() &&
          none_of(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(I: OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
20050
20051/// Merges shuffle masks and emits final shuffle instruction, if required. It
20052/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
20053/// when the actual shuffle instruction is generated only if this is actually
20054/// required. Otherwise, the shuffle instruction emission is delayed till the
20055/// end of the process, to reduce the number of emitted instructions and further
20056/// analysis/transformations.
20057/// The class also will look through the previously emitted shuffle instructions
20058/// and properly mark indices in mask as undef.
20059/// For example, given the code
20060/// \code
20061/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
20062/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
20063/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
20066/// \code
20067/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
20068/// \endcode
20069/// instead.
20070/// If 2 operands are of different size, the smallest one will be resized and
20071/// the mask recalculated properly.
20072/// For example, given the code
20073/// \code
20074/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
20075/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
20076/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
20079/// \code
20080/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
20081/// \endcode
20082/// instead.
20083class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
20084 bool IsFinalized = false;
20085 /// Combined mask for all applied operands and masks. It is built during
20086 /// analysis and actual emission of shuffle vector instructions.
20087 SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands; if a 3rd is going to be added, the first 2 are combined into a
  /// shuffle with the \p CommonMask mask, the first operand is set to the
  /// resulting shuffle and the second operand is set to the newly added
  /// operand. The \p CommonMask is transformed accordingly afterwards.
20093 SmallVector<Value *, 2> InVectors;
20094 IRBuilderBase &Builder;
20095 BoUpSLP &R;
20096
20097 class ShuffleIRBuilder {
20098 IRBuilderBase &Builder;
20099 /// Holds all of the instructions that we gathered.
20100 SetVector<Instruction *> &GatherShuffleExtractSeq;
20101 /// A list of blocks that we are going to CSE.
20102 DenseSet<BasicBlock *> &CSEBlocks;
20103 /// Data layout.
20104 const DataLayout &DL;
20105
20106 public:
20107 ShuffleIRBuilder(IRBuilderBase &Builder,
20108 SetVector<Instruction *> &GatherShuffleExtractSeq,
20109 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
20110 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
20111 CSEBlocks(CSEBlocks), DL(DL) {}
20112 ~ShuffleIRBuilder() = default;
20113 /// Creates shufflevector for the 2 operands with the given mask.
20114 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
20115 if (V1->getType() != V2->getType()) {
20116 assert(V1->getType()->isIntOrIntVectorTy() &&
20117 V1->getType()->isIntOrIntVectorTy() &&
20118 "Expected integer vector types only.");
20119 if (V1->getType() != V2->getType()) {
20120 if (cast<VectorType>(Val: V2->getType())
20121 ->getElementType()
20122 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
20123 ->getElementType()
20124 ->getIntegerBitWidth())
20125 V2 = Builder.CreateIntCast(
20126 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
20127 else
20128 V1 = Builder.CreateIntCast(
20129 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
20130 }
20131 }
20132 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
20133 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
20134 GatherShuffleExtractSeq.insert(X: I);
20135 CSEBlocks.insert(V: I->getParent());
20136 }
20137 return Vec;
20138 }
20139 /// Creates permutation of the single vector operand with the given mask, if
20140 /// it is not identity mask.
20141 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
20142 if (Mask.empty())
20143 return V1;
20144 unsigned VF = Mask.size();
20145 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
20146 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
20147 return V1;
20148 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
20149 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
20150 GatherShuffleExtractSeq.insert(X: I);
20151 CSEBlocks.insert(V: I->getParent());
20152 }
20153 return Vec;
20154 }
20155 Value *createIdentity(Value *V) { return V; }
20156 Value *createPoison(Type *Ty, unsigned VF) {
20157 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
20158 }
20159 /// Resizes 2 input vector to match the sizes, if the they are not equal
20160 /// yet. The smallest vector is resized to the size of the larger vector.
20161 void resizeToMatch(Value *&V1, Value *&V2) {
20162 if (V1->getType() == V2->getType())
20163 return;
20164 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
20165 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
20166 int VF = std::max(a: V1VF, b: V2VF);
20167 int MinVF = std::min(a: V1VF, b: V2VF);
20168 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
20169 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
20170 value: 0);
20171 Value *&Op = MinVF == V1VF ? V1 : V2;
20172 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
20173 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
20174 GatherShuffleExtractSeq.insert(X: I);
20175 CSEBlocks.insert(V: I->getParent());
20176 }
20177 if (MinVF == V1VF)
20178 V1 = Op;
20179 else
20180 V2 = Op;
20181 }
20182 };
20183
20184 /// Smart shuffle instruction emission, walks through shuffles trees and
20185 /// tries to find the best matching vector for the actual shuffle
20186 /// instruction.
  /// Emits a (possibly simplified) shuffle of \p V1 and \p V2 with \p Mask,
  /// delegating to BaseShuffleAnalysis, which looks through existing shuffle
  /// trees to pick the best source vectors.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, Builder&: ShuffleBuilder, ScalarTy);
  }
20194
20195 /// Cast value \p V to the vector type with the same number of elements, but
20196 /// the base type \p ScalarTy.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(Val: V->getType());
    assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
    // Nothing to do if the element type already matches.
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    // Integer cast keeping the element count; sign-extend unless the caller
    // specified signedness or the value is known non-negative.
    return Builder.CreateIntCast(
        V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
        isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
  }
20207
  /// Returns the vectorized value of entry \p E, cast to this builder's
  /// scalar element type when it is an integer vector. The cast is treated as
  /// signed if any non-poison scalar of the entry may be negative.
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    if (!Vec->getType()->isIntOrIntVectorTy())
      return Vec;
    return castToScalarTyElem(V: Vec, IsSigned: any_of(Range: E.Scalars, P: [&](Value *V) {
                                return !isa<PoisonValue>(Val: V) &&
                                       !isKnownNonNegative(
                                           V, SQ: SimplifyQuery(*R.DL));
                              }));
  }
20218
20219public:
  /// \p ScalarTy is the element type all added vectors are normalized to.
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
20222
20223 /// Adjusts extractelements after reusing them.
  /// \p Mask maps result lanes to source extractelement lanes and is updated
  /// in place. \p UseVecBaseAsInput is set when the result spans several
  /// vector registers and the (virtually joined) base must be used as the
  /// shuffle input. Extractelements whose only use is vectorized are erased.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    // Apply the entry's reordering so lanes line up with Mask.
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(Scalars&: VL, Mask: ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(Val: VL[I]);
      VecBase = EI->getVectorOperand();
      // Prefer the vectorized value of the base if it was vectorized.
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(Ptr: VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      // The long condition below bails out (keeps EI alive) when EI has other
      // uses, is an external scalar, is referenced an unexpected number of
      // times, or has users in dead/gathered/not-fully-vectorized entries.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(Ptr: EI) ||
          (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
           !R.isVectorized(V: EI) &&
           count_if(Range: E->Scalars, P: [&](Value *V) { return V == EI; }) !=
               count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
                        P: [&](Value *V) { return V == EI; })) ||
          (NumParts != 1 && count(Range&: VL, Element: EI) > 1) ||
          any_of(Range: EI->users(), P: [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(V: U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   any_of(Range&: UTEs,
                          P: [&](const TreeEntry *TE) {
                            return R.DeletedNodes.contains(Ptr: TE) ||
                                   R.TransformedToGatherNodes.contains(Val: TE);
                          }) ||
                   (isa<GetElementPtrInst>(Val: U) &&
                    !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
                   (!UTEs.empty() &&
                    count_if(Range&: R.VectorizableTree,
                             P: [&](const std::unique_ptr<TreeEntry> &TE) {
                               return TE->UserTreeIndex.UserTE ==
                                          UTEs.front() &&
                                      is_contained(Range&: VL, Element: EI);
                             }) != 1);
          }))
        continue;
      R.eraseInstruction(I: EI);
    }
    // Single register (or single base): the base vector itself is the result.
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(V: VecBase);
    }
    UseVecBaseAsInput = true;
    // Rewrites already-consumed mask entries to an identity mapping.
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(First&: Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle, joining them into a single virtual
    // long vector.
    // Need to shuffle each part independently and then insert all this parts
    // into a long virtual vector register, forming the original vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(N: Part * SliceSize, M: Limit);
      MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(t&: SubVL, u&: SubMask);
      // VF = widest base vector referenced by this part.
      const unsigned VF = std::accumulate(
          first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
            return std::max(a: S, b: Size);
          });
      // Collect the (at most 2) base vectors this part reads from.
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(V: VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
                                    [&](unsigned P) {
                                      ArrayRef<int> SubMask =
                                          Mask.slice(P * SliceSize,
                                                     getNumElems(Mask.size(),
                                                                 SliceSize, P));
                                      return all_of(SubMask, [](int Idx) {
                                        return Idx == PoisonMaskElem;
                                      });
                                    })) &&
               "Expected first part or all previous parts masked.");
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
      } else {
        // Join the accumulated vector with this part's subvector; the
        // subvector's lanes come from the second shuffle operand, hence the
        // +NewVF offset.
        unsigned NewVF =
            cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
          NewVF = std::max(a: NewVF, b: SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
        Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(Range&: VecMask, Out: Mask.begin());
    return Vec;
  }
20367 /// Checks if the specified entry \p E needs to be delayed because of its
20368 /// dependency nodes.
  /// Returns std::nullopt when all dependencies are vectorized and the entry
  /// can be emitted now; otherwise returns a placeholder value (a dummy load
  /// of the widened type) that stands in for the delayed emission.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        Ty: ResVecTy,
        Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
        Align: MaybeAlign());
  }
20386 /// Reset the builder to handle perfect diamond match.
  void resetForSameNode() {
    // Drop all accumulated state so the same node can be emitted again.
    IsFinalized = false;
    CommonMask.clear();
    InVectors.clear();
  }
20392 /// Adds 2 input vectors (in form of tree entries) and the mask for their
20393 /// shuffling.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    // Normalize both vectorized values to the common element type, then
    // delegate to the raw-value overload.
    Value *V1 = getVectorizedValue(E: E1);
    Value *V2 = getVectorizedValue(E: E2);
    add(V1, V2, Mask);
  }
20399 /// Adds single input vector (in form of tree entry) and the mask for its
20400 /// shuffling.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    // Normalize the vectorized value, then delegate to the raw-value overload.
    Value *V1 = getVectorizedValue(E: E1);
    add(V1, Mask);
  }
20405 /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V: V1);
    V2 = castToScalarTyElem(V: V2);
    // First pair of operands: just record them together with the mask.
    if (InVectors.empty()) {
      InVectors.push_back(Elt: V1);
      InVectors.push_back(Elt: V2);
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      return;
    }
    // Otherwise fold the already-recorded operands into a single vector so
    // at most two operands remain.
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
               Mask.size()) {
      // Resize the single recorded vector to the mask width.
      Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
    // Combine the new pair and point CommonMask lanes at it (second operand,
    // hence the +VF offset).
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(a: getVF(V: V1), b: getVF(V: Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(Elt: V1);
  }
20439 /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V: V1);
    // First operand: just record it with the mask.
    if (InVectors.empty()) {
      InVectors.push_back(Elt: V1);
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      return;
    }
    const auto *It = find(Range&: InVectors, Val: V1);
    if (It == InVectors.end()) {
      // V1 is a new operand. If there is no free slot (already 2 operands) or
      // the types differ, fold the recorded operands into one vector first.
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
          transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
        } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
          transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
        }
        // Merge the new mask: lanes from V1 are offset into the second
        // operand's index range.
        unsigned VF = std::max(a: CommonMask.size(), b: Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V: V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, V2: nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(Elt: V1);
        return;
      }
      // Check if second vector is required if the used elements are already
      // used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(Elt: V1);
          break;
        }
    }
    // Fill the still-unset lanes of CommonMask from Mask, offsetting by VF
    // when V1 is (or became) the second operand.
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(a: VF, b: getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
20492 /// Adds another one input vector and the mask for the shuffling.
  void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
    // Convert the ordering into an equivalent shuffle mask and add normally.
    SmallVector<int> NewMask;
    inversePermutation(Indices: Order, Mask&: NewMask);
    add(V1, Mask: NewMask);
  }
  /// Gathers the scalars \p VL into a vector via BoUpSLP::gather(), routing
  /// any required shuffles through this builder. \p Root, if non-null, is an
  /// existing vector to combine with. (\p MaskVF is unused in this body.)
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    CreateShuffle: [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  /// Emits a freeze instruction for \p V (used to stop poison propagation from
  /// replaced undef lanes).
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// \param ExtMask external mask applied on top of the accumulated common
  /// mask as the very last step.
  /// \param SubVectors vectorized tree entries to be inserted into the result
  /// as subvectors at the given element offsets.
  /// \param SubVectorsMask blend mask between the inserted subvectors and the
  /// shuffled value (empty when subvectors can be inserted directly).
  /// \param VF expected vector length of the value passed to \p Action.
  /// \param Action the action (if any) to be performed before final applying of
  /// the \p ExtMask mask.
  Value *finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &,
                        function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
          Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Flush the accumulated inputs into a single vector before running the
      // caller-provided action.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
      if (VecVF < VF) {
        // Widen the vector to the requested length, padding with poison.
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
      }
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      // Flush pending inputs into a single vector, then insert the vectorized
      // subvector entries at their element offsets.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(E: *E);
          unsigned InsertionIndex = Idx * getNumElements(Ty: ScalarTy);
          // Use scalar version of the ScalarType to correctly handle shuffles
          // for revectorization. The revectorization mode operates by the
          // vectors, but here we need to operate on the scalars, because the
          // masks were already transformed for the vector elements and we don't
          // need doing this transformation again.
          Type *OrigScalarTy = ScalarTy;
          ScalarTy = ScalarTy->getScalarType();
          Vec = createInsertVector(
              Builder, Vec, V, Index: InsertionIndex,
              Generator: std::bind(f: &ShuffleInstructionBuilder::createShuffle, args: this, args: _1, args: _2,
                        args: _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            // Mark the inserted lanes as identity in the running mask.
            std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
                      last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
                      value: Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        // Subvectors must be blended with the shuffled value: build the
        // insert vector on poison, then shuffle it together with Vec.
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(Range&: SubVectorsMask, Out: SVMask.begin());
        for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(T: Vec->getType()), CommonMask);
        Vec = createShuffle(V1: InsertVec, V2: Vec, Mask: SVMask);
        transformMaskAfterShuffle(CommonMask, Mask: SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      // Compose the external mask with the accumulated common mask.
      if (CommonMask.empty()) {
        CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(RHS&: NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    // Emit the final shuffle of the remaining one or two inputs.
    if (InVectors.size() == 2)
      return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
    return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
  }
20613
  ~ShuffleInstructionBuilder() {
    // If a non-trivial mask was accumulated, finalize() must have been called
    // to actually emit the pending shuffles.
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
20618};
20619
20620Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
20621 return vectorizeTree(E: getOperandEntry(E, Idx: NodeIdx));
20622}
20623
/// Materializes a gather/buildvector node \p E through the builder type
/// \p BVTy (e.g. ShuffleInstructionBuilder for IR emission — see
/// createBuildVector), returning the builder's result type \p ResTy.
/// The algorithm tries, in order: reusing the vectors that the scalars were
/// extracted from, reusing already-vectorized tree entries (a shuffle of one
/// or two nodes, including a zero-cost "perfect diamond" match), and finally
/// explicitly building constant/non-constant vectors, combining all pieces
/// via the builder's shuffles. May instead register \p E in PostponedGathers
/// and return a delayed result when its sources are not ready yet.
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
         "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Do not process split vectorize node, marked to be gathers/buildvectors.
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  if (E->State == TreeEntry::SplitVectorize &&
      TransformedToGatherNodes.contains(Val: E)) {
    SubVectors.clear();
  } else {
    // Clear values, to be replaced by insertvector instructions.
    for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
      for_each(MutableArrayRef(GatheredScalars)
                   .slice(N: Idx, M: VectorizableTree[EIdx]->getVectorFactor()),
               [&](Value *&V) { V = PoisonValue::get(T: V->getType()); });
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
  }
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(Indices: E->ReorderIndices, Mask&: SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(Size: GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  // Keep the original (reordered) scalars; GatheredScalars gets lanes
  // progressively replaced by poison as they are covered by other sources.
  SmallVector<Value *> StoredGS(GatheredScalars);
  // Checks whether this gather is a splat that can reuse a lane of an existing
  // input vector; if so, rewrites the corresponding slice of \p Mask (identity
  // or broadcast of the reused lane) and returns true.
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      // Possibly-poisonous source: only reuse if the sibling operand of the
      // same user node covers the undef lanes with matching scalars.
      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(N: UserTE->Idx + 1),
                         [=](const std::unique_ptr<TreeEntry> &TE) {
                           return TE->UserTreeIndex.UserTE == UserTE &&
                                  TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                         });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(Scalars&: GS, Mask: ReorderMask);
      }
      if (!all_of(zip(t&: GatheredScalars, u&: GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(Val: V0) || isa<PoisonValue>(Val: V0) ||
                   (isa<UndefValue>(Val: V0) && !isa<PoisonValue>(Val: V0) &&
                    is_contained(Range: E->Scalars, Element: V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
      std::iota(
          first: std::next(x: Mask.begin(), n: I * SliceSize),
          last: std::next(x: Mask.begin(),
                     n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
          value: 0);
    } else {
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          first: std::next(x: Mask.begin(), n: I * SliceSize),
          last: std::next(x: Mask.begin(),
                     n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
          value: IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: GatheredScalars.size());
  if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                V: cast<ExtractElementInst>(Val: StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(in_start: TEs.begin(), in_end: TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(X: E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            // Pad the gathered scalars with poison up to the base vector
            // width so masks stay in sync with the wider source.
            Resized = true;
            GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
                                   Elt: PoisonValue::get(T: OrigScalarTy));
            NumParts =
                ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF), Limit: VF);
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(Range: E->Scalars, P: IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(Val: V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(VL: E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(X: E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(VL: E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars, E->Idx) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(N: E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
        } else {
          for (auto [I, V] : enumerate(First: E->Scalars)) {
            if (isa<PoisonValue>(Val: V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        // Reset the builder(s) to correctly handle perfect diamond matched
        // nodes.
        ShuffleBuilder.resetForSameNode();
        // Full matched entry found, no need to insert subvectors.
        if (equal(LRange: E->Scalars, RRange: FrontTE->Scalars) &&
            equal(LRange: E->ReorderIndices, RRange: FrontTE->ReorderIndices) &&
            equal(LRange: E->ReuseShuffleIndices, RRange: FrontTE->ReuseShuffleIndices)) {
          Mask.resize(N: FrontTE->getVectorFactor());
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
          ShuffleBuilder.add(*FrontTE, Mask);
          Res = ShuffleBuilder.finalize({}, {}, {});
        } else {
          ShuffleBuilder.add(*FrontTE, Mask);
          Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        }
        return Res;
      }
      if (!Resized) {
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
                                 Elt: PoisonValue::get(T: OrigScalarTy));
      }
      // Remove shuffled elements from list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
      }
    }
  }
  // Packs the remaining scalars into \p Scalars / \p ReuseMask: deduplicates
  // repeated values, recognizes splats (broadcast instead of gather) and
  // records (via NeedFreeze) when a freeze is needed because undef lanes were
  // replaced by a possibly-poisonous broadcast value.
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(First&: Scalars)) {
      if (isa<UndefValue>(Val: V)) {
        if (!isa<PoisonValue>(Val: V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(Elt: I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(T: OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
        std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(T: OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(Val: V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   // Check if the value already used in the same operation in
                   // one of the nodes already.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(Range&: E->UserTreeIndex.UserTE->Scalars,
                                       Element: U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison, in the mask it is replaced by
          // non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(T: OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Val: Scalars[I]))
            Scalars[I] = PoisonValue::get(T: OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    // Some (or all) lanes come from extract sources and/or other vectorized
    // tree entries: feed those vectors into the shuffle builder first, then
    // fill the remaining lanes with explicitly built vectors.
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(Val: StoredGS[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(Val: StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1, AC) &&
                         isGuaranteedNotToBePoison(V: Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(V: Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      unsigned SliceSize =
          getPartNumElems(Size: E->Scalars.size(),
                          NumParts: ::getNumberOfParts(TTI: *TTI, VecTy, Limit: E->Scalars.size()));
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(First&: Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
        auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
        VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out best way to combine values: build a shuffle and insert
    // elements or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constant to build final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
    }
    // Generate constants for final shuffle and build a mask for them.
    if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(Val: V) ||
                 (IsSingleShuffle && ((IsIdentityShuffle &&
                 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(VL: NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            // Compares the cost of broadcast + two-source blend against
            // insertelement(s) + optional single-source shuffle for
            // materializing a splatted non-constant value.
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splatting + shuffle and compare with
              // insert + shuffle.
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(Val: V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Opcode: Instruction::InsertElement, Val: VecTy, CostKind, /*Index=*/0,
                  Op0: PoisonValue::get(T: VecTy), Op1: V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy,
                                            Mask: NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
                  Index: *find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem)), Op0: Vec, Op1: V);
              // Shuffle required?
              if (count(Range&: BVMask, Element: PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(First&: BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
                                           Tp: VecTy, Mask: NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = I;
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // Emit the splat as insert into lane 0 plus a broadcast
              // shuffle, then blend it into the current vector.
              Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
              SmallVector<Value *> Values(NonConstants.size(),
                                          PoisonValue::get(T: ScalarTy));
              Values[0] = V;
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (!ShuffleVectorInst::isIdentityMask(Mask: SplatMask, NumSrcElts: VF))
                BV = CreateShuffle(BV, nullptr, SplatMask);
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
              for (auto [Idx, I] : enumerate(First&: Mask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = Idx;
            }
          });
  } else if (!allConstant(VL: GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(First&: GatheredScalars)) {
      if (!isa<PoisonValue>(Val: V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  // Freeze the result if undef lanes were replaced by a possibly-poisonous
  // broadcast value (see TryPackScalars).
  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
21186
21187Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
21188 // Do not do this for split vectorize node, marked to be gathers/buildvectors.
21189 if (E->State != TreeEntry::SplitVectorize ||
21190 !TransformedToGatherNodes.contains(Val: E)) {
21191 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
21192 (void)vectorizeTree(E: VectorizableTree[EIdx].get());
21193 }
21194 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
21195 Params&: Builder, Params&: *this);
21196}
21197
21198/// \returns \p I after propagating metadata from \p VL only for instructions in
21199/// \p VL.
21200static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
21201 SmallVector<Value *> Insts;
21202 for (Value *V : VL)
21203 if (isa<Instruction>(Val: V))
21204 Insts.push_back(Elt: V);
21205 return llvm::propagateMetadata(I: Inst, VL: Insts);
21206}
21207
21208static DebugLoc getDebugLocFromPHI(PHINode &PN) {
21209 if (DebugLoc DL = PN.getDebugLoc())
21210 return DL;
21211 return DebugLoc::getUnknown();
21212}
21213
21214Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
21215 IRBuilderBase::InsertPointGuard Guard(Builder);
21216
21217 Value *V = E->Scalars.front();
21218 Type *ScalarTy = V->getType();
21219 if (!isa<CmpInst>(Val: V))
21220 ScalarTy = getValueType(V);
21221 auto It = MinBWs.find(Val: E);
21222 if (It != MinBWs.end()) {
21223 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
21224 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
21225 if (VecTy)
21226 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
21227 }
21228 if (E->VectorizedValue)
21229 return E->VectorizedValue;
21230 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
21231 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
21232 // Set insert point for non-reduction initial nodes.
21233 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
21234 setInsertPointAfterBundle(E);
21235 Value *Vec = createBuildVector(E, ScalarTy);
21236 E->VectorizedValue = Vec;
21237 return Vec;
21238 }
21239 if (E->State == TreeEntry::SplitVectorize) {
21240 assert(E->CombinedEntriesWithIndices.size() == 2 &&
21241 "Expected exactly 2 combined entries.");
21242 setInsertPointAfterBundle(E);
21243 TreeEntry &OpTE1 =
21244 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
21245 assert(OpTE1.isSame(
21246 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
21247 "Expected same first part of scalars.");
21248 Value *Op1 = vectorizeTree(E: &OpTE1);
21249 TreeEntry &OpTE2 =
21250 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
21251 assert(
21252 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
21253 "Expected same second part of scalars.");
21254 Value *Op2 = vectorizeTree(E: &OpTE2);
21255 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
21256 bool IsSigned = false;
21257 auto It = MinBWs.find(Val: OpE);
21258 if (It != MinBWs.end())
21259 IsSigned = It->second.second;
21260 else
21261 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
21262 if (isa<PoisonValue>(Val: V))
21263 return false;
21264 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
21265 });
21266 return IsSigned;
21267 };
21268 if (cast<VectorType>(Val: Op1->getType())->getElementType() !=
21269 ScalarTy->getScalarType()) {
21270 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
21271 Op1 = Builder.CreateIntCast(
21272 V: Op1,
21273 DestTy: getWidenedType(
21274 ScalarTy,
21275 VF: cast<FixedVectorType>(Val: Op1->getType())->getNumElements()),
21276 isSigned: GetOperandSignedness(&OpTE1));
21277 }
21278 if (cast<VectorType>(Val: Op2->getType())->getElementType() !=
21279 ScalarTy->getScalarType()) {
21280 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
21281 Op2 = Builder.CreateIntCast(
21282 V: Op2,
21283 DestTy: getWidenedType(
21284 ScalarTy,
21285 VF: cast<FixedVectorType>(Val: Op2->getType())->getNumElements()),
21286 isSigned: GetOperandSignedness(&OpTE2));
21287 }
21288 if (E->ReorderIndices.empty()) {
21289 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
21290 std::iota(
21291 first: Mask.begin(),
21292 last: std::next(x: Mask.begin(), n: E->CombinedEntriesWithIndices.back().second),
21293 value: 0);
21294 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
21295 if (ScalarTyNumElements != 1) {
21296 assert(SLPReVec && "Only supported by REVEC.");
21297 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask);
21298 }
21299 Value *Vec = Builder.CreateShuffleVector(V: Op1, Mask);
21300 Vec = createInsertVector(Builder, Vec, V: Op2,
21301 Index: E->CombinedEntriesWithIndices.back().second *
21302 ScalarTyNumElements);
21303 E->VectorizedValue = Vec;
21304 return Vec;
21305 }
21306 unsigned CommonVF =
21307 std::max(a: OpTE1.getVectorFactor(), b: OpTE2.getVectorFactor());
21308 const unsigned Scale = getNumElements(Ty: ScalarTy);
21309 CommonVF *= Scale;
21310 if (getNumElements(Ty: Op1->getType()) != CommonVF) {
21311 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
21312 copy(Range: createReplicatedMask(ReplicationFactor: Scale, VF: OpTE1.getVectorFactor() * Scale),
21313 Out: Mask.begin());
21314 Op1 = Builder.CreateShuffleVector(V: Op1, Mask);
21315 }
21316 if (getNumElements(Ty: Op2->getType()) != CommonVF) {
21317 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
21318 copy(Range: createReplicatedMask(ReplicationFactor: Scale, VF: OpTE2.getVectorFactor() * Scale),
21319 Out: Mask.begin());
21320 Op2 = Builder.CreateShuffleVector(V: Op2, Mask);
21321 }
21322 Value *Vec = Builder.CreateShuffleVector(V1: Op1, V2: Op2, Mask: E->getSplitMask());
21323 E->VectorizedValue = Vec;
21324 return Vec;
21325 }
21326
21327 bool IsReverseOrder =
21328 !E->ReorderIndices.empty() && isReverseOrder(Order: E->ReorderIndices);
21329 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
21330 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
21331 if (E->getOpcode() == Instruction::Store &&
21332 E->State == TreeEntry::Vectorize) {
21333 ArrayRef<int> Mask =
21334 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
21335 E->ReorderIndices.size());
21336 ShuffleBuilder.add(V1: V, Mask);
21337 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
21338 E->State == TreeEntry::CompressVectorize) {
21339 ShuffleBuilder.addOrdered(V1: V, Order: {});
21340 } else {
21341 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
21342 }
21343 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
21344 E->CombinedEntriesWithIndices.size());
21345 transform(
21346 Range: E->CombinedEntriesWithIndices, d_first: SubVectors.begin(), F: [&](const auto &P) {
21347 return std::make_pair(VectorizableTree[P.first].get(), P.second);
21348 });
21349 assert(
21350 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
21351 "Expected either combined subnodes or reordering");
21352 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices, SubVectors, SubVectorsMask: {});
21353 };
21354
21355 assert(!E->isGather() && "Unhandled state");
21356 unsigned ShuffleOrOp =
21357 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
21358 if (!E->isAltShuffle()) {
21359 switch (E->CombinedOp) {
21360 case TreeEntry::ReducedBitcast:
21361 case TreeEntry::ReducedBitcastBSwap:
21362 case TreeEntry::ReducedBitcastLoads:
21363 case TreeEntry::ReducedBitcastBSwapLoads:
21364 case TreeEntry::ReducedCmpBitcast:
21365 ShuffleOrOp = E->CombinedOp;
21366 break;
21367 default:
21368 break;
21369 }
21370 }
21371 Instruction *VL0 = E->getMainOp();
21372 auto GetOperandSignedness = [&](unsigned Idx) {
21373 const TreeEntry *OpE = getOperandEntry(E, Idx);
21374 bool IsSigned = false;
21375 auto It = MinBWs.find(Val: OpE);
21376 if (It != MinBWs.end())
21377 IsSigned = It->second.second;
21378 else
21379 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
21380 if (isa<PoisonValue>(Val: V))
21381 return false;
21382 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
21383 });
21384 return IsSigned;
21385 };
21386 switch (ShuffleOrOp) {
21387 case Instruction::PHI: {
21388 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
21389 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
21390 "PHI reordering is free.");
21391 auto *PH = cast<PHINode>(Val: VL0);
21392 Builder.SetInsertPoint(TheBB: PH->getParent(),
21393 IP: PH->getParent()->getFirstNonPHIIt());
21394 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
21395 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
21396 Value *V = NewPhi;
21397
21398 // Adjust insertion point once all PHI's have been generated.
21399 Builder.SetInsertPoint(TheBB: PH->getParent(),
21400 IP: PH->getParent()->getFirstInsertionPt());
21401 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
21402
21403 V = FinalShuffle(V, E);
21404
21405 E->VectorizedValue = V;
21406 // If phi node is fully emitted - exit.
21407 if (NewPhi->getNumIncomingValues() != 0)
21408 return NewPhi;
21409
21410 // PHINodes may have multiple entries from the same block. We want to
21411 // visit every block once.
21412 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
21413 for (unsigned I : seq<unsigned>(Size: PH->getNumIncomingValues())) {
21414 BasicBlock *IBB = PH->getIncomingBlock(i: I);
21415
21416 // Stop emission if all incoming values are generated.
21417 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
21418 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
21419 return NewPhi;
21420 }
21421
21422 auto Res = VisitedBBs.try_emplace(Key: IBB, Args&: I);
21423 if (!Res.second) {
21424 TreeEntry *OpTE = getOperandEntry(E, Idx: I);
21425 if (OpTE->isGather() || DeletedNodes.contains(Ptr: OpTE) ||
21426 TransformedToGatherNodes.contains(Val: OpTE)) {
21427 Value *VecOp = NewPhi->getIncomingValue(i: Res.first->getSecond());
21428 NewPhi->addIncoming(V: VecOp, BB: IBB);
21429 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
21430 OpTE->VectorizedValue = VecOp;
21431 continue;
21432 }
21433 }
21434
21435 Builder.SetInsertPoint(IBB->getTerminator());
21436 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
21437 Value *Vec = vectorizeOperand(E, NodeIdx: I);
21438 if (VecTy != Vec->getType()) {
21439 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
21440 MinBWs.contains(getOperandEntry(E, I))) &&
21441 "Expected item in MinBWs.");
21442 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
21443 }
21444 NewPhi->addIncoming(V: Vec, BB: IBB);
21445 }
21446
21447 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
21448 "Invalid number of incoming values");
21449 assert(E->VectorizedValue && "Expected vectorized value.");
21450 return E->VectorizedValue;
21451 }
21452
21453 case Instruction::ExtractElement: {
21454 Value *V = E->getSingleOperand(OpIdx: 0);
21455 setInsertPointAfterBundle(E);
21456 V = FinalShuffle(V, E);
21457 E->VectorizedValue = V;
21458 return V;
21459 }
21460 case Instruction::ExtractValue: {
21461 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
21462 Builder.SetInsertPoint(LI);
21463 Value *Ptr = LI->getPointerOperand();
21464 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
21465 Value *NewV = ::propagateMetadata(Inst: V, VL: E->Scalars);
21466 NewV = FinalShuffle(NewV, E);
21467 E->VectorizedValue = NewV;
21468 return NewV;
21469 }
21470 case Instruction::InsertElement: {
21471 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
21472 if (const TreeEntry *OpE = getOperandEntry(E, Idx: 1);
21473 OpE && !OpE->isGather() && OpE->hasState() &&
21474 !OpE->hasCopyableElements())
21475 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
21476 else
21477 setInsertPointAfterBundle(E);
21478 Value *V = vectorizeOperand(E, NodeIdx: 1);
21479 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
21480 Type *ScalarTy = Op.front()->getType();
21481 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
21482 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
21483 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
21484 assert(Res.first > 0 && "Expected item in MinBWs.");
21485 V = Builder.CreateIntCast(
21486 V,
21487 DestTy: getWidenedType(
21488 ScalarTy,
21489 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
21490 isSigned: Res.second);
21491 }
21492
21493 // Create InsertVector shuffle if necessary
21494 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
21495 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
21496 }));
21497 const unsigned NumElts =
21498 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
21499 const unsigned NumScalars = E->Scalars.size();
21500
21501 unsigned Offset = *getElementIndex(Inst: VL0);
21502 assert(Offset < NumElts && "Failed to find vector index offset");
21503
21504 // Create shuffle to resize vector
21505 SmallVector<int> Mask;
21506 if (!E->ReorderIndices.empty()) {
21507 inversePermutation(Indices: E->ReorderIndices, Mask);
21508 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
21509 } else {
21510 Mask.assign(NumElts, Elt: PoisonMaskElem);
21511 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
21512 }
21513 // Create InsertVector shuffle if necessary
21514 bool IsIdentity = true;
21515 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
21516 Mask.swap(RHS&: PrevMask);
21517 for (unsigned I = 0; I < NumScalars; ++I) {
21518 Value *Scalar = E->Scalars[PrevMask[I]];
21519 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
21520 IsIdentity &= InsertIdx - Offset == I;
21521 Mask[InsertIdx - Offset] = I;
21522 }
21523 if (!IsIdentity || NumElts != NumScalars) {
21524 Value *V2 = nullptr;
21525 bool IsVNonPoisonous =
21526 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
21527 SmallVector<int> InsertMask(Mask);
21528 if (NumElts != NumScalars && Offset == 0) {
21529 // Follow all insert element instructions from the current buildvector
21530 // sequence.
21531 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
21532 do {
21533 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
21534 if (!InsertIdx)
21535 break;
21536 if (InsertMask[*InsertIdx] == PoisonMaskElem)
21537 InsertMask[*InsertIdx] = *InsertIdx;
21538 if (!Ins->hasOneUse())
21539 break;
21540 Ins = dyn_cast_or_null<InsertElementInst>(
21541 Val: Ins->getUniqueUndroppableUser());
21542 } while (Ins);
21543 SmallBitVector UseMask =
21544 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
21545 SmallBitVector IsFirstPoison =
21546 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
21547 SmallBitVector IsFirstUndef =
21548 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
21549 if (!IsFirstPoison.all()) {
21550 unsigned Idx = 0;
21551 for (unsigned I = 0; I < NumElts; I++) {
21552 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
21553 IsFirstUndef.test(Idx: I)) {
21554 if (IsVNonPoisonous) {
21555 InsertMask[I] = I < NumScalars ? I : 0;
21556 continue;
21557 }
21558 if (!V2)
21559 V2 = UndefValue::get(T: V->getType());
21560 if (Idx >= NumScalars)
21561 Idx = NumScalars - 1;
21562 InsertMask[I] = NumScalars + Idx;
21563 ++Idx;
21564 } else if (InsertMask[I] != PoisonMaskElem &&
21565 Mask[I] == PoisonMaskElem) {
21566 InsertMask[I] = PoisonMaskElem;
21567 }
21568 }
21569 } else {
21570 InsertMask = Mask;
21571 }
21572 }
21573 if (!V2)
21574 V2 = PoisonValue::get(T: V->getType());
21575 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
21576 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21577 GatherShuffleExtractSeq.insert(X: I);
21578 CSEBlocks.insert(V: I->getParent());
21579 }
21580 }
21581
21582 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
21583 for (unsigned I = 0; I < NumElts; I++) {
21584 if (Mask[I] != PoisonMaskElem)
21585 InsertMask[Offset + I] = I;
21586 }
21587 SmallBitVector UseMask =
21588 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
21589 SmallBitVector IsFirstUndef =
21590 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
21591 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
21592 NumElts != NumScalars) {
21593 if (IsFirstUndef.all()) {
21594 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
21595 SmallBitVector IsFirstPoison =
21596 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
21597 if (!IsFirstPoison.all()) {
21598 for (unsigned I = 0; I < NumElts; I++) {
21599 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
21600 InsertMask[I] = I + NumElts;
21601 }
21602 }
21603 V = Builder.CreateShuffleVector(
21604 V1: V,
21605 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
21606 : FirstInsert->getOperand(i: 0),
21607 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
21608 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21609 GatherShuffleExtractSeq.insert(X: I);
21610 CSEBlocks.insert(V: I->getParent());
21611 }
21612 }
21613 } else {
21614 SmallBitVector IsFirstPoison =
21615 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
21616 for (unsigned I = 0; I < NumElts; I++) {
21617 if (InsertMask[I] == PoisonMaskElem)
21618 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
21619 else
21620 InsertMask[I] += NumElts;
21621 }
21622 V = Builder.CreateShuffleVector(
21623 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
21624 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
21625 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21626 GatherShuffleExtractSeq.insert(X: I);
21627 CSEBlocks.insert(V: I->getParent());
21628 }
21629 }
21630 }
21631
21632 ++NumVectorInstructions;
21633 E->VectorizedValue = V;
21634 return V;
21635 }
21636 case Instruction::ZExt:
21637 case Instruction::SExt:
21638 case Instruction::FPToUI:
21639 case Instruction::FPToSI:
21640 case Instruction::FPExt:
21641 case Instruction::PtrToInt:
21642 case Instruction::IntToPtr:
21643 case Instruction::SIToFP:
21644 case Instruction::UIToFP:
21645 case Instruction::Trunc:
21646 case Instruction::FPTrunc:
21647 case Instruction::BitCast: {
21648 setInsertPointAfterBundle(E);
21649
21650 Value *InVec = vectorizeOperand(E, NodeIdx: 0);
21651
21652 auto *CI = cast<CastInst>(Val: VL0);
21653 Instruction::CastOps VecOpcode = CI->getOpcode();
21654 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
21655 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
21656 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
21657 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
21658 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType()->getScalarType())) {
21659 // Check if the values are candidates to demote.
21660 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
21661 if (SrcIt != MinBWs.end())
21662 SrcBWSz = SrcIt->second.first;
21663 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
21664 if (BWSz == SrcBWSz) {
21665 VecOpcode = Instruction::BitCast;
21666 } else if (BWSz < SrcBWSz) {
21667 VecOpcode = Instruction::Trunc;
21668 } else if (It != MinBWs.end()) {
21669 assert(BWSz > SrcBWSz && "Invalid cast!");
21670 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
21671 } else if (SrcIt != MinBWs.end()) {
21672 assert(BWSz > SrcBWSz && "Invalid cast!");
21673 VecOpcode =
21674 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
21675 }
21676 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
21677 !SrcIt->second.second) {
21678 VecOpcode = Instruction::UIToFP;
21679 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
21680 ScalarTy->isFPOrFPVectorTy()) {
21681 Type *OrigSrcScalarTy = CI->getSrcTy();
21682 auto *OrigSrcVectorTy =
21683 getWidenedType(ScalarTy: OrigSrcScalarTy, VF: E->Scalars.size());
21684 InVec =
21685 Builder.CreateIntCast(V: InVec, DestTy: OrigSrcVectorTy, isSigned: SrcIt->second.second);
21686 }
21687 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
21688 ? InVec
21689 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
21690 V = FinalShuffle(V, E);
21691
21692 E->VectorizedValue = V;
21693 ++NumVectorInstructions;
21694 return V;
21695 }
21696 case Instruction::FCmp:
21697 case Instruction::ICmp: {
21698 setInsertPointAfterBundle(E);
21699
21700 Value *L = vectorizeOperand(E, NodeIdx: 0);
21701 Value *R = vectorizeOperand(E, NodeIdx: 1);
21702 if (L->getType() != R->getType()) {
21703 assert((getOperandEntry(E, 0)->isGather() ||
21704 getOperandEntry(E, 1)->isGather() ||
21705 MinBWs.contains(getOperandEntry(E, 0)) ||
21706 MinBWs.contains(getOperandEntry(E, 1))) &&
21707 "Expected item in MinBWs.");
21708 const unsigned LBW = cast<VectorType>(Val: L->getType())
21709 ->getElementType()
21710 ->getIntegerBitWidth();
21711 const unsigned RBW = cast<VectorType>(Val: R->getType())
21712 ->getElementType()
21713 ->getIntegerBitWidth();
21714 if ((LBW < RBW && (!allConstant(VL: E->getOperand(OpIdx: 1)) ||
21715 any_of(
21716 Range&: E->getOperand(OpIdx: 1),
21717 P: [&](Value *V) {
21718 auto *CI = dyn_cast<ConstantInt>(Val: V);
21719 return !CI ||
21720 CI->getValue().getActiveBits() > LBW;
21721 }))) ||
21722 (LBW > RBW && allConstant(VL: E->getOperand(OpIdx: 0)) &&
21723 all_of(Range&: E->getOperand(OpIdx: 1), P: [&](Value *V) {
21724 auto *CI = dyn_cast<ConstantInt>(Val: V);
21725 return CI && CI->getValue().getActiveBits() <= RBW;
21726 }))) {
21727 Type *CastTy = R->getType();
21728 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
21729 } else {
21730 Type *CastTy = L->getType();
21731 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
21732 }
21733 }
21734
21735 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
21736 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
21737 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21738 if (auto *ICmp = dyn_cast<ICmpInst>(Val: V); ICmp && It == MinBWs.end())
21739 ICmp->setSameSign(/*B=*/false);
21740 // Do not cast for cmps.
21741 VecTy = cast<FixedVectorType>(Val: V->getType());
21742 V = FinalShuffle(V, E);
21743
21744 E->VectorizedValue = V;
21745 ++NumVectorInstructions;
21746 return V;
21747 }
21748 case Instruction::Select: {
21749 setInsertPointAfterBundle(E);
21750
21751 Value *Cond = vectorizeOperand(E, NodeIdx: 0);
21752 Value *True = vectorizeOperand(E, NodeIdx: 1);
21753 Value *False = vectorizeOperand(E, NodeIdx: 2);
21754 if (True->getType() != VecTy || False->getType() != VecTy) {
21755 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
21756 getOperandEntry(E, 2)->isGather() ||
21757 MinBWs.contains(getOperandEntry(E, 1)) ||
21758 MinBWs.contains(getOperandEntry(E, 2))) &&
21759 "Expected item in MinBWs.");
21760 if (True->getType() != VecTy)
21761 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
21762 if (False->getType() != VecTy)
21763 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
21764 }
21765
21766 unsigned CondNumElements = getNumElements(Ty: Cond->getType());
21767 unsigned TrueNumElements = getNumElements(Ty: True->getType());
21768 assert(TrueNumElements >= CondNumElements &&
21769 TrueNumElements % CondNumElements == 0 &&
21770 "Cannot vectorize Instruction::Select");
21771 assert(TrueNumElements == getNumElements(False->getType()) &&
21772 "Cannot vectorize Instruction::Select");
21773 if (CondNumElements != TrueNumElements) {
21774 // When the return type is i1 but the source is fixed vector type, we
21775 // need to duplicate the condition value.
21776 Cond = Builder.CreateShuffleVector(
21777 V: Cond, Mask: createReplicatedMask(ReplicationFactor: TrueNumElements / CondNumElements,
21778 VF: CondNumElements));
21779 }
21780 assert(getNumElements(Cond->getType()) == TrueNumElements &&
21781 "Cannot vectorize Instruction::Select");
21782 Value *V =
21783 Builder.CreateSelectWithUnknownProfile(C: Cond, True, False, DEBUG_TYPE);
21784 V = FinalShuffle(V, E);
21785
21786 E->VectorizedValue = V;
21787 ++NumVectorInstructions;
21788 return V;
21789 }
21790 case Instruction::FNeg: {
21791 setInsertPointAfterBundle(E);
21792
21793 Value *Op = vectorizeOperand(E, NodeIdx: 0);
21794
21795 Value *V = Builder.CreateUnOp(
21796 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
21797 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21798 if (auto *I = dyn_cast<Instruction>(Val: V))
21799 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21800
21801 V = FinalShuffle(V, E);
21802
21803 E->VectorizedValue = V;
21804 ++NumVectorInstructions;
21805
21806 return V;
21807 }
21808 case Instruction::Freeze: {
21809 setInsertPointAfterBundle(E);
21810
21811 Value *Op = vectorizeOperand(E, NodeIdx: 0);
21812
21813 if (Op->getType() != VecTy) {
21814 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21815 MinBWs.contains(getOperandEntry(E, 0))) &&
21816 "Expected item in MinBWs.");
21817 Op = Builder.CreateIntCast(V: Op, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21818 }
21819 Value *V = Builder.CreateFreeze(V: Op);
21820 V = FinalShuffle(V, E);
21821
21822 E->VectorizedValue = V;
21823 ++NumVectorInstructions;
21824
21825 return V;
21826 }
21827 case Instruction::Add:
21828 case Instruction::FAdd:
21829 case Instruction::Sub:
21830 case Instruction::FSub:
21831 case Instruction::Mul:
21832 case Instruction::FMul:
21833 case Instruction::UDiv:
21834 case Instruction::SDiv:
21835 case Instruction::FDiv:
21836 case Instruction::URem:
21837 case Instruction::SRem:
21838 case Instruction::FRem:
21839 case Instruction::Shl:
21840 case Instruction::LShr:
21841 case Instruction::AShr:
21842 case Instruction::And:
21843 case Instruction::Or:
21844 case Instruction::Xor: {
21845 setInsertPointAfterBundle(E);
21846
21847 Value *LHS = vectorizeOperand(E, NodeIdx: 0);
21848 Value *RHS = vectorizeOperand(E, NodeIdx: 1);
21849 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
21850 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
21851 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
21852 if (all_of(Range&: Ops, P: [&](Value *Op) {
21853 auto *CI = dyn_cast<ConstantInt>(Val: Op);
21854 return CI && CI->getValue().countr_one() >= It->second.first;
21855 })) {
21856 V = FinalShuffle(I == 0 ? RHS : LHS, E);
21857 E->VectorizedValue = V;
21858 ++NumVectorInstructions;
21859 return V;
21860 }
21861 }
21862 }
21863 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
21864 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21865 getOperandEntry(E, 1)->isGather() ||
21866 MinBWs.contains(getOperandEntry(E, 0)) ||
21867 MinBWs.contains(getOperandEntry(E, 1))) &&
21868 "Expected item in MinBWs.");
21869 if (LHS->getType() != VecTy)
21870 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21871 if (RHS->getType() != VecTy)
21872 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
21873 }
21874
21875 Value *V = Builder.CreateBinOp(
21876 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
21877 RHS);
21878 propagateIRFlags(I: V, VL: E->Scalars, OpValue: nullptr, IncludeWrapFlags: It == MinBWs.end());
21879 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21880 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21881 // Drop nuw flags for abs(sub(commutative), true).
21882 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
21883 any_of(Range&: E->Scalars, P: [E](Value *V) {
21884 return isa<PoisonValue>(Val: V) ||
21885 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
21886 isCommutative(I: cast<Instruction>(Val: V));
21887 }))
21888 I->setHasNoUnsignedWrap(/*b=*/false);
21889 }
21890
21891 V = FinalShuffle(V, E);
21892
21893 E->VectorizedValue = V;
21894 ++NumVectorInstructions;
21895
21896 return V;
21897 }
21898 case Instruction::Load: {
21899 // Loads are inserted at the head of the tree because we don't want to
21900 // sink them all the way down past store instructions.
21901 setInsertPointAfterBundle(E);
21902
21903 LoadInst *LI = cast<LoadInst>(Val: VL0);
21904 Instruction *NewLI;
21905 FixedVectorType *StridedLoadTy = nullptr;
21906 Value *PO = LI->getPointerOperand();
21907 if (E->State == TreeEntry::Vectorize) {
21908 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
21909 } else if (E->State == TreeEntry::CompressVectorize) {
21910 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
21911 CompressEntryToData.at(Val: E);
21912 Align CommonAlignment = LI->getAlign();
21913 if (IsMasked) {
21914 unsigned VF = getNumElements(Ty: LoadVecTy);
21915 SmallVector<Constant *> MaskValues(
21916 VF / getNumElements(Ty: LI->getType()),
21917 ConstantInt::getFalse(Context&: VecTy->getContext()));
21918 for (int I : CompressMask)
21919 MaskValues[I] = ConstantInt::getTrue(Context&: VecTy->getContext());
21920 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
21921 assert(SLPReVec && "Only supported by REVEC.");
21922 MaskValues = replicateMask(Val: MaskValues, VF: VecTy->getNumElements());
21923 }
21924 Constant *MaskValue = ConstantVector::get(V: MaskValues);
21925 NewLI = Builder.CreateMaskedLoad(Ty: LoadVecTy, Ptr: PO, Alignment: CommonAlignment,
21926 Mask: MaskValue);
21927 } else {
21928 NewLI = Builder.CreateAlignedLoad(Ty: LoadVecTy, Ptr: PO, Align: CommonAlignment);
21929 }
21930 NewLI = ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
21931 // TODO: include this cost into CommonCost.
21932 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
21933 assert(SLPReVec && "FixedVectorType is not expected.");
21934 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(),
21935 Mask&: CompressMask);
21936 }
21937 NewLI =
21938 cast<Instruction>(Val: Builder.CreateShuffleVector(V: NewLI, Mask: CompressMask));
21939 } else if (E->State == TreeEntry::StridedVectorize) {
21940 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
21941 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
21942 PO = IsReverseOrder ? PtrN : Ptr0;
21943 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
21944 Value *StrideVal;
21945 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
21946 StridedLoadTy = SPtrInfo.Ty;
21947 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
21948 unsigned StridedLoadEC =
21949 StridedLoadTy->getElementCount().getKnownMinValue();
21950
21951 Value *Stride = SPtrInfo.StrideVal;
21952 if (!Stride) {
21953 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
21954 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
21955 SCEVExpander Expander(*SE, "strided-load-vec");
21956 Stride = Expander.expandCodeFor(SH: StrideSCEV, Ty: StrideSCEV->getType(),
21957 I: &*Builder.GetInsertPoint());
21958 }
21959 Value *NewStride =
21960 Builder.CreateIntCast(V: Stride, DestTy: StrideTy, /*isSigned=*/true);
21961 StrideVal = Builder.CreateMul(
21962 LHS: NewStride, RHS: ConstantInt::getSigned(
21963 Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) *
21964 static_cast<int>(
21965 DL->getTypeAllocSize(Ty: ScalarTy))));
21966 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
21967 auto *Inst = Builder.CreateIntrinsic(
21968 ID: Intrinsic::experimental_vp_strided_load,
21969 Types: {StridedLoadTy, PO->getType(), StrideTy},
21970 Args: {PO, StrideVal,
21971 Builder.getAllOnesMask(NumElts: ElementCount::getFixed(MinVal: StridedLoadEC)),
21972 Builder.getInt32(C: StridedLoadEC)});
21973 Inst->addParamAttr(
21974 /*ArgNo=*/0,
21975 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
21976 NewLI = Inst;
21977 } else {
21978 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
21979 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0);
21980 if (isa<FixedVectorType>(Val: ScalarTy)) {
21981 assert(SLPReVec && "FixedVectorType is not expected.");
21982 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
21983 // to expand VecPtr if ScalarTy is a vector type.
21984 unsigned ScalarTyNumElements =
21985 cast<FixedVectorType>(Val: ScalarTy)->getNumElements();
21986 unsigned VecTyNumElements =
21987 cast<FixedVectorType>(Val: VecTy)->getNumElements();
21988 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
21989 "Cannot expand getelementptr.");
21990 unsigned VF = VecTyNumElements / ScalarTyNumElements;
21991 SmallVector<Constant *> Indices(VecTyNumElements);
21992 transform(Range: seq(Size: VecTyNumElements), d_first: Indices.begin(), F: [=](unsigned I) {
21993 return Builder.getInt64(C: I % ScalarTyNumElements);
21994 });
21995 VecPtr = Builder.CreateGEP(
21996 Ty: VecTy->getElementType(),
21997 Ptr: Builder.CreateShuffleVector(
21998 V: VecPtr, Mask: createReplicatedMask(ReplicationFactor: ScalarTyNumElements, VF)),
21999 IdxList: ConstantVector::get(V: Indices));
22000 }
22001 // Use the minimum alignment of the gathered loads.
22002 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
22003 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
22004 }
22005 Value *V = E->State == TreeEntry::CompressVectorize
22006 ? NewLI
22007 : ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
22008
22009 if (StridedLoadTy != VecTy)
22010 V = Builder.CreateBitOrPointerCast(V, DestTy: VecTy);
22011 V = FinalShuffle(V, E);
22012 E->VectorizedValue = V;
22013 ++NumVectorInstructions;
22014 return V;
22015 }
22016 case Instruction::Store: {
22017 auto *SI = cast<StoreInst>(Val: VL0);
22018
22019 setInsertPointAfterBundle(E);
22020
22021 Value *VecValue = vectorizeOperand(E, NodeIdx: 0);
22022 if (VecValue->getType() != VecTy)
22023 VecValue =
22024 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
22025 VecValue = FinalShuffle(VecValue, E);
22026
22027 Value *Ptr = SI->getPointerOperand();
22028 Instruction *ST;
22029 if (E->State == TreeEntry::Vectorize) {
22030 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
22031 } else {
22032 assert(E->State == TreeEntry::StridedVectorize &&
22033 "Expected either strided or consecutive stores.");
22034 if (!E->ReorderIndices.empty()) {
22035 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
22036 Ptr = SI->getPointerOperand();
22037 }
22038 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
22039 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
22040 auto *Inst = Builder.CreateIntrinsic(
22041 ID: Intrinsic::experimental_vp_strided_store,
22042 Types: {VecTy, Ptr->getType(), StrideTy},
22043 Args: {VecValue, Ptr,
22044 ConstantInt::getSigned(
22045 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
22046 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
22047 Builder.getInt32(C: E->Scalars.size())});
22048 Inst->addParamAttr(
22049 /*ArgNo=*/1,
22050 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
22051 ST = Inst;
22052 }
22053
22054 Value *V = ::propagateMetadata(Inst: ST, VL: E->Scalars);
22055
22056 E->VectorizedValue = V;
22057 ++NumVectorInstructions;
22058 return V;
22059 }
22060 case Instruction::GetElementPtr: {
22061 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
22062 setInsertPointAfterBundle(E);
22063
22064 Value *Op0 = vectorizeOperand(E, NodeIdx: 0);
22065
22066 SmallVector<Value *> OpVecs;
22067 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
22068 Value *OpVec = vectorizeOperand(E, NodeIdx: J);
22069 OpVecs.push_back(Elt: OpVec);
22070 }
22071
22072 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
22073 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
22074 SmallVector<Value *> GEPs;
22075 for (Value *V : E->Scalars) {
22076 if (isa<GetElementPtrInst>(Val: V))
22077 GEPs.push_back(Elt: V);
22078 }
22079 V = ::propagateMetadata(Inst: I, VL: GEPs);
22080 }
22081
22082 V = FinalShuffle(V, E);
22083
22084 E->VectorizedValue = V;
22085 ++NumVectorInstructions;
22086
22087 return V;
22088 }
22089 case Instruction::Call: {
22090 CallInst *CI = cast<CallInst>(Val: VL0);
22091 setInsertPointAfterBundle(E);
22092
22093 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
22094
22095 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
22096 CI, ID, VF: VecTy->getNumElements(),
22097 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
22098 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
22099 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
22100 VecCallCosts.first <= VecCallCosts.second;
22101
22102 Value *ScalarArg = nullptr;
22103 SmallVector<Value *> OpVecs;
22104 SmallVector<Type *, 2> TysForDecl;
22105 // Add return type if intrinsic is overloaded on it.
22106 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1, TTI))
22107 TysForDecl.push_back(Elt: VecTy);
22108 auto *CEI = cast<CallInst>(Val: VL0);
22109 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
22110 // Some intrinsics have scalar arguments. This argument should not be
22111 // vectorized.
22112 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) {
22113 ScalarArg = CEI->getArgOperand(i: I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
22116 if (ID == Intrinsic::abs && It != MinBWs.end() &&
22117 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
22118 ScalarArg = Builder.getFalse();
22119 OpVecs.push_back(Elt: ScalarArg);
22120 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
22121 TysForDecl.push_back(Elt: ScalarArg->getType());
22122 continue;
22123 }
22124
22125 Value *OpVec = vectorizeOperand(E, NodeIdx: I);
22126 ScalarArg = CEI->getArgOperand(i: I);
22127 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
22128 ScalarArg->getType()->getScalarType() &&
22129 It == MinBWs.end()) {
22130 auto *CastTy =
22131 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
22132 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
22133 } else if (It != MinBWs.end()) {
22134 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
22135 }
22136 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
22137 OpVecs.push_back(Elt: OpVec);
22138 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
22139 TysForDecl.push_back(Elt: OpVec->getType());
22140 }
22141
22142 Function *CF;
22143 if (!UseIntrinsic) {
22144 VFShape Shape =
22145 VFShape::get(FTy: CI->getFunctionType(),
22146 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
22147 HasGlobalPred: false /*HasGlobalPred*/);
22148 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
22149 } else {
22150 CF = Intrinsic::getOrInsertDeclaration(M: F->getParent(), id: ID, OverloadTys: TysForDecl);
22151 }
22152
22153 SmallVector<OperandBundleDef, 1> OpBundles;
22154 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
22155 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
22156
22157 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
22158 cast<CallInst>(Val: V)->setCallingConv(CF->getCallingConv());
22159 V = FinalShuffle(V, E);
22160
22161 E->VectorizedValue = V;
22162 ++NumVectorInstructions;
22163 return V;
22164 }
22165 case Instruction::ShuffleVector: {
22166 Value *V;
22167 if (SLPReVec && !E->isAltShuffle()) {
22168 setInsertPointAfterBundle(E);
22169 Value *Src = vectorizeOperand(E, NodeIdx: 0);
22170 SmallVector<int> ThisMask(calculateShufflevectorMask(VL: E->Scalars));
22171 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Val: Src)) {
22172 SmallVector<int> NewMask(ThisMask.size());
22173 transform(Range&: ThisMask, d_first: NewMask.begin(), F: [&SVSrc](int Mask) {
22174 return SVSrc->getShuffleMask()[Mask];
22175 });
22176 V = Builder.CreateShuffleVector(V1: SVSrc->getOperand(i_nocapture: 0),
22177 V2: SVSrc->getOperand(i_nocapture: 1), Mask: NewMask);
22178 } else {
22179 V = Builder.CreateShuffleVector(V: Src, Mask: ThisMask);
22180 }
22181 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
22182 if (auto *I = dyn_cast<Instruction>(Val: V))
22183 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
22184 V = FinalShuffle(V, E);
22185 } else {
22186 assert(E->isAltShuffle() &&
22187 ((Instruction::isBinaryOp(E->getOpcode()) &&
22188 Instruction::isBinaryOp(E->getAltOpcode())) ||
22189 (Instruction::isCast(E->getOpcode()) &&
22190 Instruction::isCast(E->getAltOpcode())) ||
22191 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
22192 "Invalid Shuffle Vector Operand");
22193
22194 Value *LHS = nullptr, *RHS = nullptr;
22195 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
22196 setInsertPointAfterBundle(E);
22197 LHS = vectorizeOperand(E, NodeIdx: 0);
22198 RHS = vectorizeOperand(E, NodeIdx: 1);
22199 } else {
22200 setInsertPointAfterBundle(E);
22201 LHS = vectorizeOperand(E, NodeIdx: 0);
22202 }
22203 if (LHS && RHS &&
22204 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
22205 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
22206 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
22207 assert((It != MinBWs.end() ||
22208 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
22209 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
22210 MinBWs.contains(getOperandEntry(E, 0)) ||
22211 MinBWs.contains(getOperandEntry(E, 1))) &&
22212 "Expected item in MinBWs.");
22213 Type *CastTy = VecTy;
22214 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
22215 if (cast<VectorType>(Val: LHS->getType())
22216 ->getElementType()
22217 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
22218 ->getElementType()
22219 ->getIntegerBitWidth())
22220 CastTy = RHS->getType();
22221 else
22222 CastTy = LHS->getType();
22223 }
22224 if (LHS->getType() != CastTy)
22225 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
22226 if (RHS->getType() != CastTy)
22227 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
22228 }
22229
22230 Value *V0, *V1;
22231 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
22232 V0 = Builder.CreateBinOp(
22233 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
22234 V1 = Builder.CreateBinOp(
22235 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
22236 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
22237 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
22238 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
22239 CmpInst::Predicate AltPred = AltCI->getPredicate();
22240 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
22241 } else {
22242 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
22243 unsigned SrcBWSz = DL->getTypeSizeInBits(
22244 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
22245 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
22246 if (BWSz <= SrcBWSz) {
22247 if (BWSz < SrcBWSz)
22248 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
22249 assert(LHS->getType() == VecTy &&
22250 "Expected same type as operand.");
22251 if (auto *I = dyn_cast<Instruction>(Val: LHS))
22252 LHS = ::propagateMetadata(Inst: I, VL: E->Scalars);
22253 LHS = FinalShuffle(LHS, E);
22254 E->VectorizedValue = LHS;
22255 ++NumVectorInstructions;
22256 return LHS;
22257 }
22258 }
22259 V0 = Builder.CreateCast(
22260 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
22261 V1 = Builder.CreateCast(
22262 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
22263 }
22264 // Add V0 and V1 to later analysis to try to find and remove matching
22265 // instruction, if any.
22266 for (Value *V : {V0, V1}) {
22267 if (auto *I = dyn_cast<Instruction>(Val: V)) {
22268 GatherShuffleExtractSeq.insert(X: I);
22269 CSEBlocks.insert(V: I->getParent());
22270 }
22271 }
22272
22273 // Create shuffle to take alternate operations from the vector.
22274 // Also, gather up main and alt scalar ops to propagate IR flags to
22275 // each vector operation.
22276 ValueList OpScalars, AltScalars;
22277 SmallVector<int> Mask;
22278 E->buildAltOpShuffleMask(
22279 IsAltOp: [E, this](Instruction *I) {
22280 assert(E->getMatchingMainOpOrAltOp(I) &&
22281 "Unexpected main/alternate opcode");
22282 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
22283 TLI: *TLI);
22284 },
22285 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
22286
22287 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
22288 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
22289 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
22290 // Drop nuw flags for abs(sub(commutative), true).
22291 if (auto *I = dyn_cast<Instruction>(Val: Vec);
22292 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
22293 any_of(Range&: E->Scalars, P: [E](Value *V) {
22294 if (isa<PoisonValue>(Val: V))
22295 return false;
22296 if (E->hasCopyableElements() && E->isCopyableElement(V))
22297 return false;
22298 auto *IV = cast<Instruction>(Val: V);
22299 return IV->getOpcode() == Instruction::Sub && isCommutative(I: IV);
22300 }))
22301 I->setHasNoUnsignedWrap(/*b=*/false);
22302 };
22303 DropNuwFlag(V0, E->getOpcode());
22304 DropNuwFlag(V1, E->getAltOpcode());
22305
22306 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
22307 assert(SLPReVec && "FixedVectorType is not expected.");
22308 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), Mask);
22309 }
22310 V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
22311 if (auto *I = dyn_cast<Instruction>(Val: V)) {
22312 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
22313 GatherShuffleExtractSeq.insert(X: I);
22314 CSEBlocks.insert(V: I->getParent());
22315 }
22316 }
22317
22318 E->VectorizedValue = V;
22319 ++NumVectorInstructions;
22320
22321 return V;
22322 }
22323 case TreeEntry::ReducedBitcast:
22324 case TreeEntry::ReducedBitcastBSwap: {
22325 assert(UserIgnoreList && "Expected reduction operations only.");
22326 setInsertPointAfterBundle(E);
22327 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
22328 ZExt->VectorizedValue = PoisonValue::get(T: getWidenedType(
22329 ScalarTy: ZExt->getMainOp()->getType(), VF: ZExt->getVectorFactor()));
22330 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
22331 Const->VectorizedValue = PoisonValue::get(T: getWidenedType(
22332 ScalarTy: Const->Scalars.front()->getType(), VF: Const->getVectorFactor()));
22333 Value *Op = vectorizeOperand(E: ZExt, NodeIdx: 0);
22334 auto *SrcType = IntegerType::get(
22335 C&: Op->getContext(),
22336 NumBits: DL->getTypeSizeInBits(Ty: cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy()) *
22337 E->getVectorFactor());
22338 auto *OrigScalarTy = ScalarTy;
22339 // Set the scalar type properly to avoid casting to the extending type.
22340 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
22341 Op = FinalShuffle(Op, E);
22342 auto *V = Builder.CreateBitCast(V: Op, DestTy: SrcType);
22343 ++NumVectorInstructions;
22344 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
22345 V = Builder.CreateUnaryIntrinsic(ID: Intrinsic::bswap, V);
22346 ++NumVectorInstructions;
22347 }
22348 if (SrcType != OrigScalarTy) {
22349 V = Builder.CreateIntCast(V, DestTy: OrigScalarTy, /*isSigned=*/false);
22350 ++NumVectorInstructions;
22351 }
22352 E->VectorizedValue = V;
22353 return V;
22354 }
22355 case TreeEntry::ReducedBitcastLoads:
22356 case TreeEntry::ReducedBitcastBSwapLoads: {
22357 assert(UserIgnoreList && "Expected reduction operations only.");
22358 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
22359 TreeEntry *Load = getOperandEntry(E: ZExt, /*Idx=*/0);
22360 setInsertPointAfterBundle(Load);
22361 ZExt->VectorizedValue = PoisonValue::get(T: getWidenedType(
22362 ScalarTy: ZExt->getMainOp()->getType(), VF: ZExt->getVectorFactor()));
22363 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
22364 Const->VectorizedValue = PoisonValue::get(T: getWidenedType(
22365 ScalarTy: Const->Scalars.front()->getType(), VF: Const->getVectorFactor()));
22366 Load->VectorizedValue = PoisonValue::get(T: getWidenedType(
22367 ScalarTy: Load->getMainOp()->getType(), VF: Load->getVectorFactor()));
22368 LoadInst *LI = cast<LoadInst>(Val: Load->getMainOp());
22369 Value *PO = LI->getPointerOperand();
22370 auto *SrcTy = IntegerType::get(
22371 C&: ScalarTy->getContext(),
22372 NumBits: DL->getTypeSizeInBits(Ty: cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy()) *
22373 E->getVectorFactor());
22374 auto *OrigScalarTy = ScalarTy;
22375 ScalarTy = ZExt->getMainOp()->getType();
22376 Value *V = Builder.CreateAlignedLoad(Ty: SrcTy, Ptr: PO, Align: LI->getAlign());
22377 ++NumVectorInstructions;
22378 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
22379 V = Builder.CreateUnaryIntrinsic(ID: Intrinsic::bswap, V);
22380 ++NumVectorInstructions;
22381 }
22382 if (SrcTy != OrigScalarTy) {
22383 V = Builder.CreateIntCast(V, DestTy: OrigScalarTy, /*isSigned=*/false);
22384 ++NumVectorInstructions;
22385 }
22386 E->VectorizedValue = V;
22387 return V;
22388 }
22389 case TreeEntry::ReducedCmpBitcast: {
22390 assert(UserIgnoreList && "Expected reduction operations only.");
22391 setInsertPointAfterBundle(E);
22392 TreeEntry *Op1TE = getOperandEntry(E, /*Idx=*/1);
22393 TreeEntry *Op2TE = getOperandEntry(E, /*Idx=*/2);
22394 Op1TE->VectorizedValue =
22395 PoisonValue::get(T: getWidenedType(ScalarTy, VF: Op1TE->getVectorFactor()));
22396 Op2TE->VectorizedValue =
22397 PoisonValue::get(T: getWidenedType(ScalarTy, VF: Op2TE->getVectorFactor()));
22398 Value *Cmp = vectorizeOperand(E, /*NodeIdx=*/0);
22399 // Set the scalar type properly to avoid casting to the extending type.
22400 auto *DstTy =
22401 IntegerType::getIntNTy(C&: ScalarTy->getContext(), N: E->getVectorFactor());
22402 auto *V = Builder.CreateBitCast(V: Cmp, DestTy: DstTy);
22403 ++NumVectorInstructions;
22404 if (DstTy != ScalarTy) {
22405 V = Builder.CreateIntCast(V, DestTy: ScalarTy, /*isSigned=*/false);
22406 ++NumVectorInstructions;
22407 }
22408 E->VectorizedValue = V;
22409 return V;
22410 }
22411 default:
22412 llvm_unreachable("unknown inst");
22413 }
22414 return nullptr;
22415}
22416
22417Value *BoUpSLP::vectorizeTree() {
22418 ExtraValueToDebugLocsMap ExternallyUsedValues;
22419 return vectorizeTree(ExternallyUsedValues);
22420}
22421
22422Value *BoUpSLP::vectorizeTree(
22423 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
22424 Instruction *ReductionRoot,
22425 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
22426 VectorValuesAndScales) {
22427 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
22428 // need to rebuild it.
22429 EntryToLastInstruction.clear();
22430 // All blocks must be scheduled before any instructions are inserted.
22431 for (auto &BSIter : BlocksSchedules)
22432 scheduleBlock(R: *this, BS: BSIter.second.get());
22433 // Cache last instructions for the nodes to avoid side effects, which may
22434 // appear during vectorization, like extra uses, etc.
22435 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22436 // Need to generate insertion point for loads nodes of the bitcast/bswap
22437 // ops.
22438 if (TE->isGather() || DeletedNodes.contains(Ptr: TE.get()) ||
22439 (TE->State == TreeEntry::CombinedVectorize &&
22440 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
22441 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
22442 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
22443 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
22444 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
22445 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
22446 continue;
22447 (void)getLastInstructionInBundle(E: TE.get());
22448 }
22449
22450 if (ReductionRoot)
22451 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
22452 IP: ReductionRoot->getIterator());
22453 else
22454 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
22455
22456 // Vectorize gather operands of the nodes with the external uses only.
22457 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
22458 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22459 if (DeletedNodes.contains(Ptr: TE.get()))
22460 continue;
22461 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
22462 TE->UserTreeIndex.UserTE->hasState() &&
22463 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
22464 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
22465 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
22466 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
22467 all_of(Range&: TE->UserTreeIndex.UserTE->Scalars,
22468 P: [](Value *V) { return isUsedOutsideBlock(V); })) {
22469 Instruction &LastInst =
22470 getLastInstructionInBundle(E: TE->UserTreeIndex.UserTE);
22471 GatherEntries.emplace_back(Args: TE.get(), Args: &LastInst);
22472 }
22473 }
22474 for (auto &Entry : GatherEntries) {
22475 IRBuilderBase::InsertPointGuard Guard(Builder);
22476 Builder.SetInsertPoint(Entry.second);
22477 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
22478 (void)vectorizeTree(E: Entry.first);
22479 }
22480 // Emit gathered loads first to emit better code for the users of those
22481 // gathered loads.
22482 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
22483 if (DeletedNodes.contains(Ptr: TE.get()))
22484 continue;
22485 if (GatheredLoadsEntriesFirst.has_value() &&
22486 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
22487 (!TE->isGather() || TE->UserTreeIndex)) {
22488 assert((TE->UserTreeIndex ||
22489 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
22490 "Expected gathered load node.");
22491 (void)vectorizeTree(E: TE.get());
22492 }
22493 }
22494 (void)vectorizeTree(E: VectorizableTree[0].get());
22495 // Run through the list of postponed gathers and emit them, replacing the temp
22496 // emitted allocas with actual vector instructions.
22497 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
22498 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
22499 for (const TreeEntry *E : PostponedNodes) {
22500 auto *TE = const_cast<TreeEntry *>(E);
22501 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
22502 TE->VectorizedValue = nullptr;
22503 auto *UserI = cast<Instruction>(Val&: TE->UserTreeIndex.UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of these dependencies turns out to be an operand of another PHI
    // coming from this same block, the position of the stub instruction
    // becomes invalid, because the source vector that is supposed to feed this
    // gather node was inserted at the end of the block [after the stub
    // instruction]. So we need to adjust the insertion point again to the end
    // of the block.
22512 if (isa<PHINode>(Val: UserI) ||
22513 (TE->UserTreeIndex.UserTE->hasState() &&
22514 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22515 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
22516 // Insert before all users.
22517 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
22518 for (User *U : PrevVec->users()) {
22519 if (U == UserI)
22520 continue;
22521 auto *UI = dyn_cast<Instruction>(Val: U);
22522 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
22523 continue;
22524 if (UI->comesBefore(Other: InsertPt))
22525 InsertPt = UI;
22526 }
22527 Builder.SetInsertPoint(InsertPt);
22528 } else {
22529 Builder.SetInsertPoint(PrevVec);
22530 }
22531 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
22532 Value *Vec = vectorizeTree(E: TE);
22533 if (auto *VecI = dyn_cast<Instruction>(Val: Vec);
22534 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
22535 Builder.GetInsertPoint()->comesBefore(Other: VecI))
22536 VecI->moveBeforePreserving(BB&: *Builder.GetInsertBlock(),
22537 I: Builder.GetInsertPoint());
22538 if (Vec->getType() != PrevVec->getType()) {
22539 assert(Vec->getType()->isIntOrIntVectorTy() &&
22540 PrevVec->getType()->isIntOrIntVectorTy() &&
22541 "Expected integer vector types only.");
22542 std::optional<bool> IsSigned;
22543 for (Value *V : TE->Scalars) {
22544 if (isVectorized(V)) {
22545 for (const TreeEntry *MNTE : getTreeEntries(V)) {
22546 auto It = MinBWs.find(Val: MNTE);
22547 if (It != MinBWs.end()) {
22548 IsSigned = IsSigned.value_or(u: false) || It->second.second;
22549 if (*IsSigned)
22550 break;
22551 }
22552 }
22553 if (IsSigned.value_or(u: false))
22554 break;
22555 // Scan through gather nodes.
22556 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
22557 auto It = MinBWs.find(Val: BVE);
22558 if (It != MinBWs.end()) {
22559 IsSigned = IsSigned.value_or(u: false) || It->second.second;
22560 if (*IsSigned)
22561 break;
22562 }
22563 }
22564 if (IsSigned.value_or(u: false))
22565 break;
22566 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
22567 IsSigned =
22568 IsSigned.value_or(u: false) ||
22569 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
22570 continue;
22571 }
22572 if (IsSigned.value_or(u: false))
22573 break;
22574 }
22575 }
22576 if (IsSigned.value_or(u: false)) {
22577 // Final attempt - check user node.
22578 auto It = MinBWs.find(Val: TE->UserTreeIndex.UserTE);
22579 if (It != MinBWs.end())
22580 IsSigned = It->second.second;
22581 }
22582 assert(IsSigned &&
22583 "Expected user node or perfect diamond match in MinBWs.");
22584 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
22585 }
22586 PrevVec->replaceAllUsesWith(V: Vec);
22587 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
22588 // Replace the stub vector node, if it was used before for one of the
22589 // buildvector nodes already.
22590 auto It = PostponedValues.find(Val: PrevVec);
22591 if (It != PostponedValues.end()) {
22592 for (TreeEntry *VTE : It->getSecond())
22593 VTE->VectorizedValue = Vec;
22594 }
22595 eraseInstruction(I: PrevVec);
22596 }
22597
22598 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
22599 << " values .\n");
22600
22601 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
22602 // Maps vector instruction to original insertelement instruction
22603 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
22604 // Maps extract Scalar to the corresponding extractelement instruction in the
22605 // basic block. Only one extractelement per block should be emitted.
22606 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
22607 ScalarToEEs;
22608 SmallDenseSet<Value *, 4> UsedInserts;
22609 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
22610 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
22611 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
22612 // Extract all of the elements with the external uses.
22613 for (const auto &ExternalUse : ExternalUses) {
22614 Value *Scalar = ExternalUse.Scalar;
22615 llvm::User *User = ExternalUse.User;
22616
22617 // Skip users that we already RAUW. This happens when one instruction
22618 // has multiple uses of the same value.
22619 if (User && !is_contained(Range: Scalar->users(), Element: User))
22620 continue;
22621 const TreeEntry *E = &ExternalUse.E;
22622 assert(E && "Invalid scalar");
22623 assert(!E->isGather() && "Extracting from a gather list");
22624 // Non-instruction pointers are not deleted, just skip them.
22625 if (E->getOpcode() == Instruction::GetElementPtr &&
22626 !isa<GetElementPtrInst>(Val: Scalar))
22627 continue;
22628
22629 Value *Vec = E->VectorizedValue;
22630 assert(Vec && "Can't find vectorizable value");
22631
22632 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
22633 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
22634 if (Scalar->getType() != Vec->getType()) {
22635 Value *Ex = nullptr;
22636 Value *ExV = nullptr;
22637 auto *Inst = dyn_cast<Instruction>(Val: Scalar);
22638 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Ptr: Inst);
22639 auto It = ScalarToEEs.find(Val: Scalar);
22640 if (It != ScalarToEEs.end()) {
22641 // No need to emit many extracts, just move the only one in the
22642 // current block.
22643 auto EEIt = It->second.find(Val: ReplaceInst ? Inst->getParent()
22644 : Builder.GetInsertBlock());
22645 if (EEIt != It->second.end()) {
22646 Value *PrevV = EEIt->second.first;
22647 if (auto *I = dyn_cast<Instruction>(Val: PrevV);
22648 I && !ReplaceInst &&
22649 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
22650 Builder.GetInsertPoint()->comesBefore(Other: I)) {
22651 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
22652 I: Builder.GetInsertPoint());
22653 if (auto *CI = dyn_cast<Instruction>(Val: EEIt->second.second))
22654 CI->moveAfter(MovePos: I);
22655 }
22656 Ex = PrevV;
22657 ExV = EEIt->second.second ? EEIt->second.second : Ex;
22658 }
22659 }
22660 if (!Ex) {
22661 // "Reuse" the existing extract to improve final codegen.
22662 if (ReplaceInst) {
22663 // Leave the instruction as is, if it cheaper extracts and all
22664 // operands are scalar.
22665 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) {
22666 IgnoredExtracts.insert(V: EE);
22667 Ex = EE;
22668 } else {
22669 auto *CloneInst = Inst->clone();
22670 CloneInst->insertBefore(InsertPos: Inst->getIterator());
22671 if (Inst->hasName())
22672 CloneInst->takeName(V: Inst);
22673 Ex = CloneInst;
22674 }
22675 } else if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar);
22676 ES && isa<Instruction>(Val: Vec)) {
22677 Value *V = ES->getVectorOperand();
22678 auto *IVec = cast<Instruction>(Val: Vec);
22679 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
22680 V = ETEs.front()->VectorizedValue;
22681 if (auto *IV = dyn_cast<Instruction>(Val: V);
22682 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
22683 IV->comesBefore(Other: IVec))
22684 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
22685 else
22686 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
22687 } else if (auto *VecTy =
22688 dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
22689 assert(SLPReVec && "FixedVectorType is not expected.");
22690 unsigned VecTyNumElements = VecTy->getNumElements();
22691 // When REVEC is enabled, we need to extract a vector.
22692 // Note: The element size of Scalar may be different from the
22693 // element size of Vec.
22694 Ex = createExtractVector(Builder, Vec, SubVecVF: VecTyNumElements,
22695 Index: ExternalUse.Lane * VecTyNumElements);
22696 } else {
22697 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
22698 }
22699 // If necessary, sign-extend or zero-extend ScalarRoot
22700 // to the larger type.
22701 ExV = Ex;
22702 if (Scalar->getType() != Ex->getType())
22703 ExV = Builder.CreateIntCast(
22704 V: Ex, DestTy: Scalar->getType(),
22705 isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
22706 auto *I = dyn_cast<Instruction>(Val: Ex);
22707 ScalarToEEs[Scalar].try_emplace(Key: I ? I->getParent()
22708 : &F->getEntryBlock(),
22709 Args: std::make_pair(x&: Ex, y&: ExV));
22710 }
22711 // The then branch of the previous if may produce constants, since 0
22712 // operand might be a constant.
22713 if (auto *ExI = dyn_cast<Instruction>(Val: Ex);
22714 ExI && !isa<PHINode>(Val: ExI) && !mayHaveNonDefUseDependency(I: *ExI)) {
22715 GatherShuffleExtractSeq.insert(X: ExI);
22716 CSEBlocks.insert(V: ExI->getParent());
22717 }
22718 return ExV;
22719 }
22720 assert(isa<FixedVectorType>(Scalar->getType()) &&
22721 isa<InsertElementInst>(Scalar) &&
22722 "In-tree scalar of vector type is not insertelement?");
22723 auto *IE = cast<InsertElementInst>(Val: Scalar);
22724 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
22725 return Vec;
22726 };
22727 // If User == nullptr, the Scalar remains as scalar in vectorized
22728 // instructions or is used as extra arg. Generate ExtractElement instruction
22729 // and update the record for this scalar in ExternallyUsedValues.
22730 if (!User) {
22731 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
22732 continue;
22733 assert(
22734 (ExternallyUsedValues.count(Scalar) ||
22735 ExternalUsesWithNonUsers.count(Scalar) ||
22736 ExternalUsesAsOriginalScalar.contains(Scalar) ||
22737 any_of(
22738 Scalar->users(),
22739 [&, TTI = TTI](llvm::User *U) {
22740 if (ExternalUsesAsOriginalScalar.contains(U))
22741 return true;
22742 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
22743 return !UseEntries.empty() &&
22744 (E->State == TreeEntry::Vectorize ||
22745 E->State == TreeEntry::StridedVectorize ||
22746 E->State == TreeEntry::CompressVectorize) &&
22747 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
22748 return (UseEntry->State == TreeEntry::Vectorize ||
22749 UseEntry->State ==
22750 TreeEntry::StridedVectorize ||
22751 UseEntry->State ==
22752 TreeEntry::CompressVectorize) &&
22753 doesInTreeUserNeedToExtract(
22754 Scalar, getRootEntryInstruction(*UseEntry),
22755 TLI, TTI);
22756 });
22757 })) &&
22758 "Scalar with nullptr User must be registered in "
22759 "ExternallyUsedValues map or remain as scalar in vectorized "
22760 "instructions");
22761 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
22762 if (auto *PHI = dyn_cast<PHINode>(Val: VecI)) {
22763 if (PHI->getParent()->isLandingPad())
22764 Builder.SetInsertPoint(
22765 TheBB: PHI->getParent(),
22766 IP: std::next(
22767 x: PHI->getParent()->getLandingPadInst()->getIterator()));
22768 else
22769 Builder.SetInsertPoint(TheBB: PHI->getParent(),
22770 IP: PHI->getParent()->getFirstNonPHIIt());
22771 } else {
22772 Builder.SetInsertPoint(TheBB: VecI->getParent(),
22773 IP: std::next(x: VecI->getIterator()));
22774 }
22775 } else {
22776 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
22777 }
22778 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22779 // Required to update internally referenced instructions.
22780 if (Scalar != NewInst) {
22781 assert((!isa<ExtractElementInst>(Scalar) ||
22782 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
22783 "Extractelements should not be replaced.");
22784 Scalar->replaceAllUsesWith(V: NewInst);
22785 }
22786 continue;
22787 }
22788
22789 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
22790 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
22791 // Skip if the scalar is another vector op or Vec is not an instruction.
22792 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
22793 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
22794 if (!UsedInserts.insert(V: VU).second)
22795 continue;
22796 // Need to use original vector, if the root is truncated.
22797 auto BWIt = MinBWs.find(Val: E);
22798 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
22799 auto *ScalarTy = FTy->getElementType();
22800 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
22801 auto VecIt = VectorCasts.find(Val: Key);
22802 if (VecIt == VectorCasts.end()) {
22803 IRBuilderBase::InsertPointGuard Guard(Builder);
22804 if (auto *IVec = dyn_cast<PHINode>(Val: Vec)) {
22805 if (IVec->getParent()->isLandingPad())
22806 Builder.SetInsertPoint(TheBB: IVec->getParent(),
22807 IP: std::next(x: IVec->getParent()
22808 ->getLandingPadInst()
22809 ->getIterator()));
22810 else
22811 Builder.SetInsertPoint(
22812 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
22813 } else if (auto *IVec = dyn_cast<Instruction>(Val: Vec)) {
22814 Builder.SetInsertPoint(IVec->getNextNode());
22815 }
22816 Vec = Builder.CreateIntCast(
22817 V: Vec,
22818 DestTy: getWidenedType(
22819 ScalarTy,
22820 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
22821 isSigned: BWIt->second.second);
22822 VectorCasts.try_emplace(Key, Args&: Vec);
22823 } else {
22824 Vec = VecIt->second;
22825 }
22826 }
22827
22828 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
22829 if (InsertIdx) {
22830 auto *It = find_if(
22831 Range&: ShuffledInserts, P: [VU](const ShuffledInsertData<Value *> &Data) {
22832 // Checks if 2 insertelements are from the same buildvector.
22833 InsertElementInst *VecInsert = Data.InsertElements.front();
22834 return areTwoInsertFromSameBuildVector(
22835 VU, V: VecInsert,
22836 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
22837 });
22838 unsigned Idx = *InsertIdx;
22839 if (It == ShuffledInserts.end()) {
22840 (void)ShuffledInserts.emplace_back();
22841 It = std::next(x: ShuffledInserts.begin(),
22842 n: ShuffledInserts.size() - 1);
22843 }
22844 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
22845 if (Mask.empty())
22846 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
22847 Mask[Idx] = ExternalUse.Lane;
22848 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
22849 continue;
22850 }
22851 }
22852 }
22853 }
22854
22855 // Generate extracts for out-of-tree users.
22856 // Find the insertion point for the extractelement lane.
22857 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
22858 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
22859 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
22860 if (PH->getIncomingValue(i: I) == Scalar) {
22861 Instruction *IncomingTerminator =
22862 PH->getIncomingBlock(i: I)->getTerminator();
22863 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
22864 Builder.SetInsertPoint(TheBB: VecI->getParent(),
22865 IP: std::next(x: VecI->getIterator()));
22866 } else {
22867 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
22868 }
22869 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22870 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
22871 }
22872 }
22873 } else {
22874 Builder.SetInsertPoint(cast<Instruction>(Val: User));
22875 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22876 User->replaceUsesOfWith(From: Scalar, To: NewInst);
22877 }
22878 } else {
22879 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
22880 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22881 User->replaceUsesOfWith(From: Scalar, To: NewInst);
22882 }
22883
22884 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
22885 }
22886
22887 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
22888 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
22889 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
22890 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
22891 for (int I = 0, E = Mask.size(); I < E; ++I) {
22892 if (Mask[I] < VF)
22893 CombinedMask1[I] = Mask[I];
22894 else
22895 CombinedMask2[I] = Mask[I] - VF;
22896 }
22897 ShuffleInstructionBuilder ShuffleBuilder(
22898 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
22899 ShuffleBuilder.add(V1, Mask: CombinedMask1);
22900 if (V2)
22901 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
22902 return ShuffleBuilder.finalize(ExtMask: {}, SubVectors: {}, SubVectorsMask: {});
22903 };
22904
22905 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
22906 bool ForSingleMask) {
22907 unsigned VF = Mask.size();
22908 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
22909 if (VF != VecVF) {
22910 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
22911 Vec = CreateShuffle(Vec, nullptr, Mask);
22912 return std::make_pair(x&: Vec, y: true);
22913 }
22914 if (!ForSingleMask) {
22915 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
22916 for (unsigned I = 0; I < VF; ++I) {
22917 if (Mask[I] != PoisonMaskElem)
22918 ResizeMask[Mask[I]] = Mask[I];
22919 }
22920 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
22921 }
22922 }
22923
22924 return std::make_pair(x&: Vec, y: false);
22925 };
22926 // Perform shuffling of the vectorize tree entries for better handling of
22927 // external extracts.
22928 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
22929 // Find the first and the last instruction in the list of insertelements.
22930 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
22931 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
22932 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
22933 Builder.SetInsertPoint(LastInsert);
22934 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
22935 Value *NewInst = performExtractsShuffleAction<Value>(
22936 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
22937 Base: FirstInsert->getOperand(i_nocapture: 0),
22938 GetVF: [](Value *Vec) {
22939 return cast<VectorType>(Val: Vec->getType())
22940 ->getElementCount()
22941 .getKnownMinValue();
22942 },
22943 ResizeAction: ResizeToVF,
22944 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
22945 ArrayRef<Value *> Vals) {
22946 assert((Vals.size() == 1 || Vals.size() == 2) &&
22947 "Expected exactly 1 or 2 input values.");
22948 if (Vals.size() == 1) {
22949 // Do not create shuffle if the mask is a simple identity
22950 // non-resizing mask.
22951 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
22952 ->getNumElements() ||
22953 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
22954 return CreateShuffle(Vals.front(), nullptr, Mask);
22955 return Vals.front();
22956 }
22957 return CreateShuffle(Vals.front() ? Vals.front()
22958 : FirstInsert->getOperand(i_nocapture: 0),
22959 Vals.back(), Mask);
22960 });
22961 auto It = ShuffledInserts[I].InsertElements.rbegin();
22962 // Rebuild buildvector chain.
22963 InsertElementInst *II = nullptr;
22964 if (It != ShuffledInserts[I].InsertElements.rend())
22965 II = *It;
22966 SmallVector<Instruction *> Inserts;
22967 while (It != ShuffledInserts[I].InsertElements.rend()) {
22968 assert(II && "Must be an insertelement instruction.");
22969 if (*It == II)
22970 ++It;
22971 else
22972 Inserts.push_back(Elt: cast<Instruction>(Val: II));
22973 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
22974 }
22975 for (Instruction *II : reverse(C&: Inserts)) {
22976 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
22977 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
22978 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
22979 II->moveAfter(MovePos: NewI);
22980 NewInst = II;
22981 }
22982 LastInsert->replaceAllUsesWith(V: NewInst);
22983 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
22984 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
22985 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
22986 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
22987 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
22988 eraseInstruction(I: IE);
22989 }
22990 CSEBlocks.insert(V: LastInsert->getParent());
22991 }
22992
22993 SmallVector<Instruction *> RemovedInsts;
22994 // For each vectorized value:
22995 for (auto &TEPtr : VectorizableTree) {
22996 TreeEntry *Entry = TEPtr.get();
22997
22998 // No need to handle users of gathered values.
22999 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
23000 DeletedNodes.contains(Ptr: Entry) ||
23001 TransformedToGatherNodes.contains(Val: Entry))
23002 continue;
23003
23004 if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
23005 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
23006 Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
23007 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
23008 Entry->CombinedOp == TreeEntry::ReducedCmpBitcast) {
23009 // Skip constant node
23010 if (!Entry->hasState()) {
23011 assert(allConstant(Entry->Scalars) && "Expected constants only.");
23012 continue;
23013 }
23014 for (Value *Scalar : Entry->Scalars) {
23015 auto *I = dyn_cast<Instruction>(Val: Scalar);
23016
23017 if (!I || Entry->isCopyableElement(V: I))
23018 continue;
23019 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *I << ".\n");
23020 RemovedInsts.push_back(Elt: I);
23021 }
23022 continue;
23023 }
23024
23025 assert(Entry->VectorizedValue && "Can't find vectorizable value");
23026
23027 // For each lane:
23028 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
23029 Value *Scalar = Entry->Scalars[Lane];
23030
23031 if (Entry->getOpcode() == Instruction::GetElementPtr &&
23032 !isa<GetElementPtrInst>(Val: Scalar))
23033 continue;
23034 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Scalar);
23035 EE && IgnoredExtracts.contains(V: EE))
23036 continue;
23037 if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
23038 continue;
23039#ifndef NDEBUG
23040 Type *Ty = Scalar->getType();
23041 if (!Ty->isVoidTy()) {
23042 for (User *U : Scalar->users()) {
23043 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
23044
23045 // It is legal to delete users in the ignorelist.
23046 assert((isVectorized(U) ||
23047 (UserIgnoreList && UserIgnoreList->contains(U)) ||
23048 (isa_and_nonnull<Instruction>(U) &&
23049 isDeleted(cast<Instruction>(U)))) &&
23050 "Deleting out-of-tree value");
23051 }
23052 }
23053#endif
23054 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
23055 auto *I = cast<Instruction>(Val: Scalar);
23056 RemovedInsts.push_back(Elt: I);
23057 }
23058 }
23059
23060 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
23061 // new vector instruction.
23062 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
23063 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
23064
23065 // Clear up reduction references, if any.
23066 if (UserIgnoreList) {
23067 for (Instruction *I : RemovedInsts) {
23068 const TreeEntry *IE = getTreeEntries(V: I).front();
23069 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(V: I);
23070 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
23071 IE = SplitEntries.front();
23072 if (IE->Idx != 0 &&
23073 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
23074 (ValueToGatherNodes.lookup(Val: I).contains(
23075 key: VectorizableTree.front().get()) ||
23076 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
23077 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
23078 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
23079 IE->UserTreeIndex &&
23080 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
23081 !(GatheredLoadsEntriesFirst.has_value() &&
23082 IE->Idx >= *GatheredLoadsEntriesFirst &&
23083 VectorizableTree.front()->isGather() &&
23084 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
23085 !(!VectorizableTree.front()->isGather() &&
23086 VectorizableTree.front()->isCopyableElement(V: I)))
23087 continue;
23088 SmallVector<SelectInst *> LogicalOpSelects;
23089 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
23090 // Do not replace condition of the logical op in form select <cond>.
23091 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
23092 (match(V: U.getUser(), P: m_LogicalAnd()) ||
23093 match(V: U.getUser(), P: m_LogicalOr())) &&
23094 U.getOperandNo() == 0;
23095 if (IsPoisoningLogicalOp) {
23096 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
23097 return false;
23098 }
23099 return UserIgnoreList->contains(V: U.getUser());
23100 });
23101 // Replace conditions of the poisoning logical ops with the non-poison
23102 // constant value.
23103 for (SelectInst *SI : LogicalOpSelects)
23104 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
23105 }
23106 }
23107 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
23108 // cache correctness.
23109 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
23110 // - instructions are not deleted until later.
23111 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts), VectorValuesAndScales);
23112
23113 Builder.ClearInsertionPoint();
23114 InstrElementSize.clear();
23115
23116 const TreeEntry &RootTE = *VectorizableTree.front();
23117 Value *Vec = RootTE.VectorizedValue;
23118 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
23119 It != MinBWs.end() &&
23120 ReductionBitWidth != It->second.first) {
23121 IRBuilder<>::InsertPointGuard Guard(Builder);
23122 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
23123 IP: ReductionRoot->getIterator());
23124 if (isReducedBitcastRoot() || isReducedCmpBitcastRoot()) {
23125 Vec = Builder.CreateIntCast(V: Vec, DestTy: Builder.getIntNTy(N: ReductionBitWidth),
23126 isSigned: It->second.second);
23127
23128 } else {
23129 Vec = Builder.CreateIntCast(
23130 V: Vec,
23131 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
23132 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
23133 isSigned: It->second.second);
23134 }
23135 }
23136 return Vec;
23137}
23138
// Hoist loop-invariant gather/shuffle/extract sequences into loop preheaders
// and then CSE identical (or less-defined) sequences across dominated blocks.
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(BB: I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are
    // instructions that are defined inside the loop then we can't
    // hoist this instruction.
    if (any_of(Range: I->operands(), P: [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(Val: V);
          return OpI && L->contains(Inst: OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(InsertPos: PreHeader->getTerminator()->getIterator());
    CSEBlocks.insert(V: PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(N: CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(Elt: N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector operands
  // and its mask indices are the same as in the first one or undefs. E.g.
  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
  // poison, <0, 0, 0, 0>.
  // On success NewMask holds the combined (more defined) mask that should be
  // installed on the surviving shuffle.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I: I2);
    if (SI1->isIdenticalTo(I: SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      // Two different defined elements at the same position: the shuffles
      // cannot be merged.
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           ::getNumberOfParts(TTI: *TTI, VecTy: SI1->getType()) ==
               ::getNumberOfParts(
                   TTI: *TTI, VecTy: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
                                     VF: SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
      if (isDeleted(I: &In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
          !GatherShuffleExtractSeq.contains(key: &In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        // Case 1: In is identical to (or less defined than) a previously
        // visited instruction V that dominates it - reuse V and drop In,
        // installing the combined mask on V if it got more defined.
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(A: V->getParent(), B: In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(I: &In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        // Case 2: the visited shuffle V is less defined than In and In's
        // block dominates V's - keep In instead: move In right after V so
        // all of V's users remain dominated, then drop V.
        if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
            GatherShuffleExtractSeq.contains(key: V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(A: In.getParent(), B: V->getParent())) {
          In.moveAfter(MovePos: V);
          V->replaceAllUsesWith(V: &In);
          eraseInstruction(I: V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(Elt: &In);
      }
    }
  }
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
23288
23289BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
23290 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
23291 auto &BundlePtr =
23292 ScheduledBundlesList.emplace_back(Args: std::make_unique<ScheduleBundle>());
23293 for (Value *V : VL) {
23294 if (S.isNonSchedulable(V))
23295 continue;
23296 auto *I = cast<Instruction>(Val: V);
23297 if (S.isCopyableElement(V)) {
23298 // Add a copyable element model.
23299 ScheduleCopyableData &SD =
23300 addScheduleCopyableData(EI, I, SchedulingRegionID, Bundle&: *BundlePtr);
23301 // Group the instructions to a bundle.
23302 BundlePtr->add(SD: &SD);
23303 continue;
23304 }
23305 ScheduleData *BundleMember = getScheduleData(V);
23306 assert(BundleMember && "no ScheduleData for bundle member "
23307 "(maybe not in same basic block)");
23308 // Group the instructions to a bundle.
23309 BundlePtr->add(SD: BundleMember);
23310 ScheduledBundles.try_emplace(Key: I).first->getSecond().push_back(
23311 Elt: BundlePtr.get());
23312 }
23313 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
23314 return *BundlePtr;
23315}
23316
// Groups the instructions into a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
23319std::optional<BoUpSLP::ScheduleBundle *>
23320BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
23321 const InstructionsState &S,
23322 const EdgeInfo &EI) {
23323 // No need to schedule PHIs, insertelement, extractelement and extractvalue
23324 // instructions.
23325 if (isa<PHINode>(Val: S.getMainOp()) ||
23326 isVectorLikeInstWithConstOps(V: S.getMainOp()))
23327 return nullptr;
23328 // If the parent node is non-schedulable and the current node is copyable, and
23329 // any of parent instructions are used outside several basic blocks or in
23330 // bin-op node - cancel scheduling, it may cause wrong def-use deps in
23331 // analysis, leading to a crash.
23332 // Non-scheduled nodes may not have related ScheduleData model, which may lead
23333 // to a skipped dep analysis.
23334 bool HasCopyables = S.areInstructionsWithCopyableElements();
23335 bool DoesNotRequireScheduling =
23336 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
23337 all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); });
23338 if (!DoesNotRequireScheduling && S.areInstructionsWithCopyableElements() &&
23339 EI && EI.UserTE->hasState() && EI.UserTE->doesNotNeedToSchedule() &&
23340 EI.UserTE->getOpcode() != Instruction::PHI &&
23341 EI.UserTE->getOpcode() != Instruction::InsertElement &&
23342 any_of(Range&: EI.UserTE->Scalars, P: [](Value *V) {
23343 auto *I = dyn_cast<Instruction>(Val: V);
23344 if (!I)
23345 return false;
23346 for (User *U : I->users()) {
23347 auto *UI = cast<Instruction>(Val: U);
23348 if (isa<BinaryOperator>(Val: UI))
23349 return true;
23350 }
23351 return false;
23352 }))
23353 return std::nullopt;
23354 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
23355 EI.UserTE->hasCopyableElements() &&
23356 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
23357 all_of(Range&: VL, P: [&](Value *V) {
23358 if (S.isCopyableElement(V))
23359 return true;
23360 return isUsedOutsideBlock(V);
23361 }))
23362 return std::nullopt;
23363 // If any instruction is used outside block only and its operand is placed
23364 // immediately before it, do not schedule, it may cause wrong def-use chain.
23365 if (S.areInstructionsWithCopyableElements() && any_of(Range&: VL, P: [&](Value *V) {
23366 if (isa<PoisonValue>(Val: V) || S.isCopyableElement(V))
23367 return false;
23368 if (isUsedOutsideBlock(V)) {
23369 for (Value *Op : cast<Instruction>(Val: V)->operands()) {
23370 auto *I = dyn_cast<Instruction>(Val: Op);
23371 if (!I)
23372 continue;
23373 return SLP->isVectorized(V: I) && I->getNextNode() == V;
23374 }
23375 }
23376 return false;
23377 }))
23378 return std::nullopt;
23379 if (S.areInstructionsWithCopyableElements() && EI) {
23380 bool IsNonSchedulableWithParentPhiNode =
23381 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
23382 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
23383 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
23384 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
23385 if (IsNonSchedulableWithParentPhiNode) {
23386 SmallSet<std::pair<Value *, Value *>, 4> Values;
23387 for (const auto [Idx, V] :
23388 enumerate(First&: EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
23389 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
23390 OpIdx: EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
23391 auto *I = dyn_cast<Instruction>(Val: Op);
23392 if (!I || !isCommutative(I))
23393 continue;
23394 if (!Values.insert(V: std::make_pair(x&: V, y&: Op)).second)
23395 return std::nullopt;
23396 }
23397 } else {
23398 // If any of the parent requires scheduling - exit, complex dep between
23399 // schedulable/non-schedulable parents.
23400 if (any_of(Range&: EI.UserTE->Scalars, P: [&](Value *V) {
23401 if (EI.UserTE->hasCopyableElements() &&
23402 EI.UserTE->isCopyableElement(V))
23403 return false;
23404 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
23405 return any_of(Range&: Entries, P: [](const TreeEntry *TE) {
23406 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
23407 TE->UserTreeIndex.UserTE->hasState() &&
23408 TE->UserTreeIndex.UserTE->State !=
23409 TreeEntry::SplitVectorize &&
23410 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
23411 });
23412 }))
23413 return std::nullopt;
23414 }
23415 }
23416 if (DoesNotRequireScheduling) {
23417 // If all operands were replaced by copyables, the operands of this node
23418 // might be not, so need to recalculate dependencies for schedule data,
23419 // replaced by copyable schedule data.
23420 for (Value *V : VL) {
23421 auto *I = dyn_cast<Instruction>(Val: V);
23422 if (!I || (HasCopyables && S.isCopyableElement(V)))
23423 continue;
23424 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
23425 for (const Use &U : I->operands()) {
23426 unsigned &NumOps =
23427 UserOpToNumOps.try_emplace(Key: std::make_pair(x&: I, y: U.get()), Args: 0)
23428 .first->getSecond();
23429 ++NumOps;
23430 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
23431 Op && areAllOperandsReplacedByCopyableData(User: I, Op, SLP&: *SLP, NumOps)) {
23432 if (ScheduleData *OpSD = getScheduleData(I: Op);
23433 OpSD && OpSD->hasValidDependencies())
23434 // TODO: investigate how to improve it instead of early exiting.
23435 return std::nullopt;
23436 }
23437 }
23438 }
23439 return nullptr;
23440 }
23441
23442 // Any schedulable copyable with split vectorize parent - skip, not supported
23443 // currently.
23444 // TODO: investigate fix for this early exit.
23445 if (S.areInstructionsWithCopyableElements() && EI.UserTE &&
23446 EI.UserTE->State == TreeEntry::SplitVectorize &&
23447 any_of(Range&: VL, P: [&](Value *V) {
23448 return !S.isNonSchedulable(V) && S.isCopyableElement(V);
23449 }))
23450 return std::nullopt;
23451
23452 // Initialize the instruction bundle.
23453 Instruction *OldScheduleEnd = ScheduleEnd;
23454 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
23455
23456 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
23457 // Clear deps or recalculate the region, if the memory instruction is a
23458 // copyable. It may have memory deps, which must be recalculated.
23459 SmallVector<ScheduleData *> ControlDependentMembers;
23460 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
23461 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
23462 for (ScheduleEntity *SE : Bundle.getBundle()) {
23463 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
23464 if (ScheduleData *BundleMember = getScheduleData(I: SD->getInst());
23465 BundleMember && BundleMember->hasValidDependencies()) {
23466 BundleMember->clearDirectDependencies();
23467 if (RegionHasStackSave ||
23468 !isGuaranteedToTransferExecutionToSuccessor(
23469 I: BundleMember->getInst()))
23470 ControlDependentMembers.push_back(Elt: BundleMember);
23471 }
23472 continue;
23473 }
23474 auto *SD = cast<ScheduleData>(Val: SE);
23475 if (SD->hasValidDependencies() &&
23476 (!S.areInstructionsWithCopyableElements() ||
23477 !S.isCopyableElement(V: SD->getInst())) &&
23478 !getScheduleCopyableData(I: SD->getInst()).empty() && EI.UserTE &&
23479 EI.UserTE->hasState() &&
23480 (!EI.UserTE->hasCopyableElements() ||
23481 !EI.UserTE->isCopyableElement(V: SD->getInst())))
23482 SD->clearDirectDependencies();
23483 for (const Use &U : SD->getInst()->operands()) {
23484 unsigned &NumOps =
23485 UserOpToNumOps
23486 .try_emplace(Key: std::make_pair(x: SD->getInst(), y: U.get()), Args: 0)
23487 .first->getSecond();
23488 ++NumOps;
23489 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
23490 Op && areAllOperandsReplacedByCopyableData(User: SD->getInst(), Op,
23491 SLP&: *SLP, NumOps)) {
23492 if (ScheduleData *OpSD = getScheduleData(I: Op);
23493 OpSD && OpSD->hasValidDependencies()) {
23494 OpSD->clearDirectDependencies();
23495 if (RegionHasStackSave ||
23496 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
23497 ControlDependentMembers.push_back(Elt: OpSD);
23498 }
23499 }
23500 }
23501 }
23502 };
23503 // The scheduling region got new instructions at the lower end (or it is a
23504 // new region for the first bundle). This makes it necessary to
23505 // recalculate all dependencies.
23506 // It is seldom that this needs to be done a second time after adding the
23507 // initial bundle to the region.
23508 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
23509 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
23510 if (BB != P.first->getParent())
23511 return;
23512 ScheduleData *SD = P.second;
23513 if (isInSchedulingRegion(SD: *SD))
23514 SD->clearDependencies();
23515 });
23516 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
23517 for_each(P.second, [&](ScheduleCopyableData *SD) {
23518 if (isInSchedulingRegion(SD: *SD))
23519 SD->clearDependencies();
23520 });
23521 });
23522 ReSchedule = true;
23523 }
23524 // Check if the bundle data has deps for copyable elements already. In
23525 // this case need to reset deps and recalculate it.
23526 if (Bundle && !Bundle.getBundle().empty()) {
23527 if (S.areInstructionsWithCopyableElements() ||
23528 !ScheduleCopyableDataMap.empty())
23529 CheckIfNeedToClearDeps(Bundle);
23530 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
23531 << BB->getName() << "\n");
23532 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
23533 ControlDeps: ControlDependentMembers);
23534 } else if (!ControlDependentMembers.empty()) {
23535 ScheduleBundle Invalid = ScheduleBundle::invalid();
23536 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
23537 ControlDeps: ControlDependentMembers);
23538 }
23539
23540 if (ReSchedule) {
23541 resetSchedule();
23542 initialFillReadyList(ReadyList&: ReadyInsts);
23543 }
23544
23545 // Now try to schedule the new bundle or (if no bundle) just calculate
23546 // dependencies. As soon as the bundle is "ready" it means that there are no
23547 // cyclic dependencies and we can schedule it. Note that's important that we
23548 // don't "schedule" the bundle yet.
23549 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
23550 !ReadyInsts.empty()) {
23551 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
23552 assert(Picked->isReady() && "must be ready to schedule");
23553 schedule(R: *SLP, S, EI, Data: Picked, ReadyList&: ReadyInsts);
23554 if (Picked == &Bundle)
23555 break;
23556 }
23557 };
23558
23559 // Make sure that the scheduling region contains all
23560 // instructions of the bundle.
23561 for (Value *V : VL) {
23562 if (S.isNonSchedulable(V))
23563 continue;
23564 if (!extendSchedulingRegion(V, S)) {
23565 // If the scheduling region got new instructions at the lower end (or it
23566 // is a new region for the first bundle). This makes it necessary to
23567 // recalculate all dependencies.
23568 // Otherwise the compiler may crash trying to incorrectly calculate
23569 // dependencies and emit instruction in the wrong order at the actual
23570 // scheduling.
23571 ScheduleBundle Invalid = ScheduleBundle::invalid();
23572 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
23573 return std::nullopt;
23574 }
23575 }
23576
23577 bool ReSchedule = false;
23578 for (Value *V : VL) {
23579 if (S.isNonSchedulable(V))
23580 continue;
23581 SmallVector<ScheduleCopyableData *> CopyableData =
23582 getScheduleCopyableData(I: cast<Instruction>(Val: V));
23583 if (!CopyableData.empty()) {
23584 for (ScheduleCopyableData *SD : CopyableData)
23585 ReadyInsts.remove(X: SD);
23586 }
23587 ScheduleData *BundleMember = getScheduleData(V);
23588 assert((BundleMember || S.isCopyableElement(V)) &&
23589 "no ScheduleData for bundle member (maybe not in same basic block)");
23590 if (!BundleMember)
23591 continue;
23592
23593 // Make sure we don't leave the pieces of the bundle in the ready list when
23594 // whole bundle might not be ready.
23595 ReadyInsts.remove(X: BundleMember);
23596 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
23597 !Bundles.empty()) {
23598 for (ScheduleBundle *B : Bundles)
23599 ReadyInsts.remove(X: B);
23600 }
23601
23602 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
23603 continue;
23604 // A bundle member was scheduled as single instruction before and now
23605 // needs to be scheduled as part of the bundle. We just get rid of the
23606 // existing schedule.
23607 // A bundle member has deps calculated before it was copyable element - need
23608 // to reschedule.
23609 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
23610 << " was already scheduled\n");
23611 ReSchedule = true;
23612 }
23613
23614 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
23615 TryScheduleBundleImpl(ReSchedule, Bundle);
23616 if (!Bundle.isReady()) {
23617 for (ScheduleEntity *BD : Bundle.getBundle()) {
23618 // Copyable data scheduling is just removed.
23619 if (isa<ScheduleCopyableData>(Val: BD))
23620 continue;
23621 if (BD->isReady()) {
23622 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: BD->getInst());
23623 if (Bundles.empty()) {
23624 ReadyInsts.insert(X: BD);
23625 continue;
23626 }
23627 for (ScheduleBundle *B : Bundles)
23628 if (B->isReady())
23629 ReadyInsts.insert(X: B);
23630 }
23631 }
23632 ScheduledBundlesList.pop_back();
23633 SmallVector<ScheduleData *> ControlDependentMembers;
23634 for (Value *V : VL) {
23635 if (S.isNonSchedulable(V))
23636 continue;
23637 auto *I = cast<Instruction>(Val: V);
23638 if (S.isCopyableElement(V: I)) {
23639 // Remove the copyable data from the scheduling region and restore
23640 // previous mappings.
23641 auto KV = std::make_pair(x: EI, y&: I);
23642 assert(ScheduleCopyableDataMap.contains(KV) &&
23643 "no ScheduleCopyableData for copyable element");
23644 ScheduleCopyableData *SD =
23645 ScheduleCopyableDataMapByInst.find(Val: I)->getSecond().pop_back_val();
23646 ScheduleCopyableDataMapByUsers[I].remove(X: SD);
23647 if (EI.UserTE) {
23648 ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
23649 const auto *It = find(Range&: Op, Val: I);
23650 assert(It != Op.end() && "Lane not set");
23651 SmallPtrSet<Instruction *, 4> Visited;
23652 do {
23653 int Lane = std::distance(first: Op.begin(), last: It);
23654 assert(Lane >= 0 && "Lane not set");
23655 if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
23656 !EI.UserTE->ReorderIndices.empty())
23657 Lane = EI.UserTE->ReorderIndices[Lane];
23658 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
23659 "Couldn't find extract lane");
23660 auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
23661 if (!Visited.insert(Ptr: In).second) {
23662 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
23663 break;
23664 }
23665 ScheduleCopyableDataMapByInstUser
23666 [std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I)]
23667 .pop_back();
23668 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
23669 } while (It != Op.end());
23670 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
23671 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(EI: UserEI, V: I))
23672 ScheduleCopyableDataMapByUsers[I].insert(X: UserCD);
23673 }
23674 if (ScheduleCopyableDataMapByUsers[I].empty())
23675 ScheduleCopyableDataMapByUsers.erase(Val: I);
23676 ScheduleCopyableDataMap.erase(Val: KV);
23677 // Need to recalculate dependencies for the actual schedule data.
23678 if (ScheduleData *OpSD = getScheduleData(I);
23679 OpSD && OpSD->hasValidDependencies()) {
23680 OpSD->clearDirectDependencies();
23681 if (RegionHasStackSave ||
23682 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
23683 ControlDependentMembers.push_back(Elt: OpSD);
23684 }
23685 continue;
23686 }
23687 ScheduledBundles.find(Val: I)->getSecond().pop_back();
23688 }
23689 if (!ControlDependentMembers.empty()) {
23690 ScheduleBundle Invalid = ScheduleBundle::invalid();
23691 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/false, SLP,
23692 ControlDeps: ControlDependentMembers);
23693 }
23694 return std::nullopt;
23695 }
23696 return &Bundle;
23697}
23698
23699BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
23700 // Allocate a new ScheduleData for the instruction.
23701 if (ChunkPos >= ChunkSize) {
23702 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
23703 ChunkPos = 0;
23704 }
23705 return &(ScheduleDataChunks.back()[ChunkPos++]);
23706}
23707
/// Tries to grow the scheduling region [ScheduleStart, ScheduleEnd) so that it
/// contains the instruction behind \p V. Returns true once the instruction is
/// inside the region (possibly without any change); returns false if growing
/// the region would exceed ScheduleRegionSizeLimit, in which case the caller
/// must give up on scheduling this bundle.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(Val: V);
  assert(I && "bundle member must be an instruction");
  // Already covered by the current region - nothing to do.
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
  // against the budget. Otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
  DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
  // Step one instruction in each direction per iteration; each counted step
  // consumes budget from ScheduleRegionSizeLimit.
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
    DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
  }
  // The instruction was found above the region (or the downward scan hit the
  // end of the block): extend the region upwards so I becomes the new start.
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  // Otherwise the instruction is below the region: extend downwards so the
  // region end points right past I.
  initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
                   NextLoadStore: nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
23773
/// Creates (or reuses) ScheduleData for every instruction in the half-open
/// range [FromI, ToI) and marks it as belonging to the current scheduling
/// region. Memory-accessing instructions are threaded into the region's
/// singly-linked load/store chain, splicing between \p PrevLoadStore and
/// \p NextLoadStore (either may be null at the region boundaries).
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (isa<PHINode>(Val: I))
      continue;
    // Reuse a previously allocated ScheduleData (from an older region) if one
    // exists for this instruction; otherwise take a fresh slot.
    ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(BlockSchedulingRegionID: SchedulingRegionID, I);

    auto CanIgnoreLoad = [](const Instruction *I) {
      const auto *LI = dyn_cast<LoadInst>(Val: I);
      // If there is a simple load marked as invariant, we can ignore it.
      // But, in the (unlikely) case of non-simple invariant load,
      // we should not ignore it.
      return LI && LI->isSimple() &&
             LI->getMetadata(KindID: LLVMContext::MD_invariant_load);
    };

    // Side-effect-free pseudo intrinsics (sideeffect/pseudoprobe) and simple
    // invariant loads are kept out of the memory chain entirely.
    if (I->mayReadOrWriteMemory() &&
        // Simple InvariantLoad does not depend on other memory accesses.
        !CanIgnoreLoad(I) &&
        (!isa<IntrinsicInst>(Val: I) ||
         (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    // Remember whether the region contains stacksave/stackrestore; later
    // dependency calculation uses this to add extra control dependencies.
    if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
        match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  // Re-connect the tail of the newly initialized range to the rest of the
  // chain (when extending upwards) or record the new chain tail.
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
23828
/// (Re)computes dependencies for all schedule entities reachable from
/// \p Bundle plus the extra \p ControlDeps members, using a worklist.
/// For each node this covers: def-use dependencies, extra dependencies for
/// copyable elements, control dependencies for instructions that are not
/// guaranteed to transfer execution, stacksave/stackrestore ordering, and
/// memory dependencies along the region's load/store chain. When
/// \p InsertInReadyList is set, entities that become ready are pushed into
/// the ready list as they are discovered.
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  // Computes the dependencies of a single entity; newly touched entities with
  // invalid (or newly-ready) dependency state are appended to WorkList.
  auto ProcessNode = [&](ScheduleEntity *SE) {
    // Copyable elements: dependencies come from the lanes of the user tree
    // entry operand where this instruction appears, not from IR def-use.
    if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
      if (CD->hasValidDependencies())
        return;
      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
        const auto *It = find(Range&: Op, Val: CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        // Visit every lane of the operand that holds this instruction and
        // add a dependency on the corresponding user scalar.
        do {
          int Lane = std::distance(first: Op.begin(), last: It);
          assert(Lane >= 0 && "Lane not set");
          if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(V: In)) {
            // We may have not have related copyable scheduling data, if the
            // instruction is non-schedulable.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI: EI.UserTE->UserTreeIndex, V: In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(Incr: 1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(Elt: UseSD);
            }
          } else if (Visited.insert(Ptr: In).second) {
            if (ScheduleData *UseSD = getScheduleData(I: In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(Incr: 1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(Elt: UseSD);
            }
          }
          It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: CD->getInst());
        } while (It != Op.end());
        if (CD->isReady() && CD->getDependencies() == 0 &&
            (EI.UserTE->hasState() &&
             (EI.UserTE->getMainOp()->getParent() !=
                  CD->getInst()->getParent() ||
              (isa<PHINode>(Val: EI.UserTE->getMainOp()) &&
               (EI.UserTE->getMainOp()->hasNUsesOrMore(N: UsesLimit) ||
                any_of(Range: EI.UserTE->getMainOp()->users(), P: [&](User *U) {
                  auto *IU = dyn_cast<Instruction>(Val: U);
                  if (!IU)
                    return true;
                  return IU->getParent() == EI.UserTE->getMainOp()->getParent();
                })))))) {
          // If no uses in the block - mark as having pseudo-use, which cannot
          // be scheduled.
          // Prevents incorrect def-use tracking between external user and
          // actual instruction.
          CD->incDependencies();
          CD->incrementUnscheduledDeps(Incr: 1);
        }
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(Val: SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (isa<PHINode>(Val: U))
        continue;
      if (ScheduleData *UseSD = getScheduleData(V: U)) {
        // The operand is a copyable element - skip.
        unsigned &NumOps = UserToNumOps.try_emplace(Key: U, Args: 0).first->getSecond();
        ++NumOps;
        if (areAllOperandsReplacedByCopyableData(
                User: cast<Instruction>(Val: U), Op: BundleMember->getInst(), SLP&: *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(Incr: 1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(Elt: UseSD);
      }
    }
    // Copyable data that "uses" this instruction also constitutes a
    // dependency, even though it is not an IR user.
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(User: BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(Incr: 1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(Elt: UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    auto MakeControlDependent = [&](Instruction *I) {
      // Do not mark control dependent twice.
      if (!Visited.insert(Ptr: I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(Dep: BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(Incr: 1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(Elt: DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control depend on any early exit or non-willreturn call
    // which proceeds it.
    if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
          continue;

        // Add the dependency
        MakeControlDependent(I);

        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          // Everything past here must be control dependent on I.
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloc alloca instruction, it needs to be scheduled
      // after any preceeding stacksave. We also need to prevent any alloca
      // from reordering above a preceeding stackrestore.
      if (match(V: BundleMember->getInst(), P: m_Intrinsic<Intrinsic::stacksave>()) ||
          match(V: BundleMember->getInst(),
                P: m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
              match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependend on BundleMember->Inst.
            break;

          if (!isa<AllocaInst>(Val: I))
            continue;

          // Add the dependency
          MakeControlDependent(I);
        }
      }

      // In addition to the cases handle just above, we need to prevent
      // allocas and loads/stores from moving below a stacksave or a
      // stackrestore. Avoiding moving allocas below stackrestore is currently
      // thought to be conservatism. Moving loads/stores below a stackrestore
      // can lead to incorrect code.
      if (isa<AllocaInst>(Val: BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            continue;

          // Add the dependency
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(I: SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(I: SrcInst);

    // Walk the region's load/store chain downstream of SrcInst and add memory
    // dependencies for (potentially) aliasing accesses.
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");

      // We have two limits to reduce the complexity:
      // 1) AliasedCheckLimit: It's a small limit to reduce calls to
      //    SLP->isAliased (which is the expensive part in this loop).
      // 2) MaxMemDepDistance: It's for very large blocks and it aborts
      //    the whole loop (even if the loop is fast, it's quadratic).
      //    It's important for the loop break condition (see below) to
      //    check this limit even between two read-only instructions.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->getInst())))) {

        // We increment the counter only if the locations are aliased
        // (instead of counting all alias checks). This gives a better
        // balance between reduced runtime and accurate dependencies.
        NumAliased++;

        DepDest->addMemoryDependency(Dep: BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(Incr: 1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(Elt: DepDest);
      }

      // Example, explaining the loop break condition: Let's assume our
      // starting instruction is i0 and MaxMemDepDistance = 3.
      //
      //                      +--------v--v--v
      //             i0,i1,i2,i3,i4,i5,i6,i7,i8
      //             +--------^--^--^
      //
      // MaxMemDepDistance let us stop alias-checking at i3 and we add
      // dependencies from i0 to i3,i4,.. (even if they are not aliased).
      // Previously we already added dependencies from i3 to i6,i7,i8
      // (because of MaxMemDepDistance). As we added a dependency from
      // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
      // and we can abort this loop at i6.
      if (DistToSrc >= 2 * MaxMemDepDistance)
        break;
      DistToSrc++;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  if (Bundle)
    WorkList.push_back(Elt: Bundle.getBundle().front());
  WorkList.append(in_start: ControlDeps.begin(), in_end: ControlDeps.end());
  // Drain the worklist: process each entity (or all members of the bundles it
  // belongs to), inserting entities/bundles that became ready if requested.
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SD)) {
      CopyableBundle.push_back(Elt: &CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(V: SD->getInst());
    }
    if (Bundles.empty()) {
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(X: SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Ptr: Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Range: Bundle->getBundle(), F: ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(X: Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
                          << "\n");
      }
    }
  }
}
24121
24122void BoUpSLP::BlockScheduling::resetSchedule() {
24123 assert(ScheduleStart &&
24124 "tried to reset schedule on block which has not been scheduled");
24125 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
24126 if (BB != P.first->getParent())
24127 return;
24128 ScheduleData *SD = P.second;
24129 if (isInSchedulingRegion(SD: *SD)) {
24130 SD->setScheduled(/*Scheduled=*/false);
24131 SD->resetUnscheduledDeps();
24132 }
24133 });
24134 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
24135 for_each(P.second, [&](ScheduleCopyableData *SD) {
24136 if (isInSchedulingRegion(SD: *SD)) {
24137 SD->setScheduled(/*Scheduled=*/false);
24138 SD->resetUnscheduledDeps();
24139 }
24140 });
24141 });
24142 for_each(Range&: ScheduledBundles, F: [&](auto &P) {
24143 for_each(P.second, [&](ScheduleBundle *Bundle) {
24144 if (isInSchedulingRegion(SD: *Bundle))
24145 Bundle->setScheduled(/*Scheduled=*/false);
24146 });
24147 });
24148 // Reset schedule data for copyable elements.
24149 for (auto &P : ScheduleCopyableDataMap) {
24150 if (isInSchedulingRegion(SD: *P.second)) {
24151 P.second->setScheduled(/*Scheduled=*/false);
24152 P.second->resetUnscheduledDeps();
24153 }
24154 }
24155 ReadyInsts.clear();
24156}
24157
/// Performs the "real" scheduling for the block: assigns every schedule
/// entity a priority that reflects the original instruction order, recomputes
/// any not-yet-valid dependencies, and then repeatedly picks the ready entity
/// with the lowest priority, moving its instruction(s) into their final
/// position inside the region. Finally marks the block as scheduled by
/// clearing ScheduleStart.
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(V: I);
    if (!Bundles.empty()) {
      // Instruction belongs to one or more vector bundles: prioritize the
      // bundles (and any copyable data bundles) in program order.
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(Bundle&: *Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(C&: SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(User: I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(V: I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              doesNotNeedToBeScheduled(I)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!CopyableData.empty() ||
          any_of(Range: R.ValueToGatherNodes.lookup(Val: I), P: [&](const TreeEntry *TE) {
            assert(TE->isGather() && "expected gather node");
            return TE->hasState() && TE->hasCopyableElements() &&
                   TE->isCopyableElement(V: I);
          })) {
        SD->clearDirectDependencies();
        // Need to calculate deps for these nodes to correctly handle copyable
        // dependencies, even if they were cancelled.
        // If copyables bundle was cancelled, the deps are cleared and need to
        // recalculate them.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(C&: CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
    }
  }
  BS->initialFillReadyList(ReadyList&: ReadyInsts);

  // Instructions are moved to sit immediately before LastScheduledInst, which
  // starts at the region end and walks upwards as entities are scheduled.
  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(position: ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Val: Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // If copyable must be schedule as part of something else, skip it.
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(V: PickedInst);
        if ((IsCopyable && BS->getScheduleData(I: PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(Ptr: PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
        LastScheduledInst = PickedInst;
      }
      EntryToLastInstruction.try_emplace(Key: Bundle->getTreeEntry(),
                                         Args&: LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Val: Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    BS->schedule(R, S: Invalid, EI: EdgeInfo(), Data: Picked, ReadyList&: ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
24295
// Returns the scalar element width (in bits) to assume when vectorizing the
// expression rooted at \p V. Where possible the width is derived from the
// memory operations (loads/extracts) feeding \p V rather than from V's own
// type, since memory width is the better basis for choosing a vector factor.
// Results for all visited instructions are memoized in InstrElementSize.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(Val: V))
    return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());

  // For an insertelement, the interesting width is that of the inserted
  // scalar (operand 1), not of the vector being built.
  if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
    return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));

  // Memoized result from a previous traversal?
  auto E = InstrElementSize.find(Val: V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  // Worklist entries carry (instruction, its block, recursion depth).
  SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(Val: V)) {
    Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
    Visited.insert(Ptr: I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  // First non-i1 value seen; used as a fallback width source when V itself
  // is i1 and no memory operation was found.
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent, Level] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Val: Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    // Bound the traversal depth; deeper operands are simply not explored.
    if (Level > RecursionMaxDepth)
      continue;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
      Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(Val: I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(Val: U.get()))
          if (Visited.insert(Ptr: J).second &&
              (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
            Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
            continue;
          }
        // Operand not traversed: still remember it as a non-i1 fallback.
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      // Unhandled instruction kind: abandon the traversal entirely.
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(Ty: V->getType());
  }

  // Cache the answer for every instruction we visited, so repeated queries
  // over the same expression tree are O(1).
  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
24378
// Determines whether all scalars of tree entry \p E can be demoted from their
// original integer width to \p BitWidth bits without changing the computed
// values. On success the entry's index is appended to \p ToDemote (operand
// entries are recorded by the recursive calls); \p BitWidth may be widened
// in-place to the smallest width proven safe. \p MaxDepthLevel reports how
// deep the successful demotion chain reaches, \p Visited guards against
// re-analyzing entries, and \p NodesToKeepBWs lists entries that must keep
// their original width. Returns true if \p E is demotable.
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType()->getScalarType());
  // Already at the target width - nothing to prove.
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original bitwidth.
  if (NodesToKeepBWs.contains(V: E.Idx))
    return false;

  // If the value is not a vectorized instruction in the expression and not used
  // by the insertelement instruction and not used in multiple vector nodes, it
  // cannot be demoted.
  // IsSignedNode: true if any scalar may be negative, i.e. the node cannot be
  // treated as uniformly non-negative.
  bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
    if (isa<PoisonValue>(Val: R))
      return false;
    return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
  });
  // Checks whether a single scalar V can be narrowed to BitWidth bits. May
  // widen BitWidth (by reference) to the minimum width proven safe for V via
  // sign-bit and demanded-bits analysis.
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(Val: V))
      return true;
    // Values shared by multiple vector nodes must keep their width.
    if (getTreeEntries(V).size() > 1)
      return false;
    // For lat shuffle of sext/zext with many uses need to check the extra bit
    // for unsigned values, otherwise may have incorrect casting for reused
    // scalars.
    bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
      // All bits above BitWidth are known zero - truncation is lossless.
      if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
        return true;
    }
    // BitWidth1: width needed judging by the number of known sign bits (plus
    // one extra bit for possibly-negative nodes to preserve the sign).
    unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(Val: V)) {
      // BitWidth2: width needed judging by which bits users actually demand;
      // for unsigned nodes, grow it until the dropped high bits are known
      // zero.
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
    }
    BitWidth = std::max(a: BitWidth, b: BitWidth1);
    // Only worthwhile if we at least halve the width.
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  // Fallback analysis used when structural demotion fails: checks every
  // scalar directly with IsPotentiallyTruncated, and additionally handles
  // demotable gather nodes (via a matching vectorized entry, or via
  // extractelement-base / part-count heuristics).
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        // A vectorized entry with the same values proves the gather safe too.
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(V: E.getMainOp(), VL: E.Scalars))
          if (collectValuesToDemote(E: *SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(Elt: E.Idx);
            return true;
          }
      }
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(Val: V);
        if (!EE)
          continue;
        UniqueBases.insert(Ptr: EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      // Demote if there are few extract bases, or narrowing does not reduce
      // the number of registers (parts) anyway.
      if (UniqueBases.size() <= 2 ||
          ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF)) >=
              ::getNumberOfParts(
                  TTI: *TTI,
                  VecTy: getWidenedType(
                      ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth),
                      VF))) {
        ToDemote.push_back(Elt: E.Idx);
        return true;
      }
    }
    return Res;
  };
  // Gathers, already-visited entries, and entries only feeding non-vectorized
  // insertelements go straight to the fallback analysis.
  if (E.isGather() || !Visited.insert(V: &E).second ||
      any_of(Range: E.Scalars, P: [&](Value *V) {
        return !isa<Constant>(Val: V) && all_of(Range: V->users(), P: [&](User *U) {
          return isa<InsertElementInst>(Val: U) && !isVectorized(V: U);
        });
      }))
    return FinalAnalysis();

  // A scalar with uses escaping the vectorized expression (and not provably
  // truncatable) blocks demotion of the whole entry.
  if (any_of(Range: E.Scalars, P: [&](Value *V) {
        return !isa<Constant>(Val: V) && !all_of(Range: V->users(), P: [=](User *U) {
          return isVectorized(V: U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(V: U)) ||
                 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  // Recursively analyzes each operand entry. Returns false to abort; sets
  // NeedToExit when recursion failed but FinalAnalysis still accepts the
  // current entry.
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      // Each operand starts from the same base level; the result is the max.
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel&: Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
    }
    return true;
  };
  // Probes doubling bitwidths until \p Checker accepts one; remembers the
  // narrowest width at which FinalAnalysis succeeded as a fallback. Sets
  // NeedToExit when the fallback width is used.
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidth < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BitWidth >= OrigBitWidth) {
          if (BestFailBitwidth == 0) {
            // No width worked at all - restore the original width and fail.
            BitWidth = OrigBitWidth;
            return false;
          }
          MaxDepthLevel = 1;
          BitWidth = BestFailBitwidth;
          NeedToExit = true;
          return true;
        }
        return false;
      };
  // Common driver for all demotable opcodes: validates the scalars (and, when
  // given, the opcode-specific Checker and the operand entries), then records
  // E.Idx in ToDemote.
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          // Leaf case (e.g. trunc/ext): no operand entries to recurse into.
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (any_of(Range: E.Scalars, P: [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(Elt: E.Idx);
        return IsProfitableToDemote;
      };

  // Split-vectorize nodes: analyze both combined sub-entries as operands.
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  if (E.isAltShuffle()) {
    // Combining these opcodes may lead to incorrect analysis, skip for now.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
      return false;
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
  }

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    // Do not demote an extension whose user is a bitcast to floating point:
    // the FP bit pattern depends on the full-width integer value.
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(E: &E, Idx: 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        // Shift amount must be provably < the narrowed width.
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
        // Copyable elements have no shift operands; check the value itself.
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask: ShiftedBits, SQ: SimplifyQuery(*DL));
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
               MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
                                 SQ: SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        // The operand must have more sign bits than we drop by narrowing.
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
        auto *I = cast<Instruction>(Val: V);
        // Both dividend and divisor must have zero high bits.
        return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)) &&
               MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
              F: [&](unsigned Idx) { return getOperandEntry(E: &E, Idx); });

    return TryProcessInstruction(BitWidth, Ops);
  }

  // Calls: only abs/smin/smax/umin/umax intrinsics are demotable, with
  // per-intrinsic legality checks and a cost-driven choice of bitwidth.
  case Instruction::Call: {
    auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    // Legality for two-operand min/max: unsigned variants need zero high
    // bits; signed variants need enough sign bits on both operands.
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        auto *I = cast<Instruction>(Val: V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
          return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
                                   SQ: SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
                                  SQ: SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)));
      });
    };
    // Legality for abs: only the single operand's sign bits matter.
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        auto *I = cast<Instruction>(Val: V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    // Note: this Checker always returns false so AttemptCheckBitwidth scans
    // every candidate width; the cheapest one is captured in BestBitWidth.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(A: BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
24818
24819static RecurKind getRdxKind(Value *V);
24820
24821void BoUpSLP::computeMinimumValueSizes() {
24822 // We only attempt to truncate integer expressions.
24823 bool IsStoreOrInsertElt =
24824 VectorizableTree.front()->hasState() &&
24825 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
24826 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
24827 if ((IsStoreOrInsertElt || UserIgnoreList) &&
24828 ExtraBitWidthNodes.size() <= 1 &&
24829 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
24830 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
24831 return;
24832
24833 unsigned NodeIdx = 0;
24834 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
24835 NodeIdx = 1;
24836
24837 // Ensure the roots of the vectorizable tree don't form a cycle.
24838 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
24839 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
24840 "Unexpected tree is graph.");
24841
24842 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
24843 // resize to the final type.
24844 bool IsTruncRoot = false;
24845 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
24846 SmallVector<unsigned> RootDemotes;
24847 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
24848 if (NodeIdx != 0 &&
24849 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24850 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24851 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
24852 IsTruncRoot = true;
24853 RootDemotes.push_back(Elt: NodeIdx);
24854 IsProfitableToDemoteRoot = true;
24855 ++NodeIdx;
24856 }
24857
24858 // Analyzed the reduction already and not profitable - exit.
24859 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
24860 return;
24861
24862 SmallVector<unsigned> ToDemote;
24863 auto ComputeMaxBitWidth =
24864 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
24865 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
24866 ToDemote.clear();
24867 // Check if the root is trunc and the next node is gather/buildvector, then
24868 // keep trunc in scalars, which is free in most cases.
24869 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
24870 !NodesToKeepBWs.contains(V: E.Idx) &&
24871 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
24872 all_of(Range: E.Scalars, P: [&](Value *V) {
24873 return V->hasOneUse() || isa<Constant>(Val: V) ||
24874 (!V->hasNUsesOrMore(N: UsesLimit) &&
24875 none_of(Range: V->users(), P: [&](User *U) {
24876 ArrayRef<TreeEntry *> TEs = getTreeEntries(V: U);
24877 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24878 if (TEs.empty() || is_contained(Range&: TEs, Element: UserTE))
24879 return false;
24880 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24881 SelectInst>(Val: U) ||
24882 isa<SIToFPInst, UIToFPInst>(Val: U) ||
24883 (UserTE->hasState() &&
24884 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24885 SelectInst>(Val: UserTE->getMainOp()) ||
24886 isa<SIToFPInst, UIToFPInst>(Val: UserTE->getMainOp()))))
24887 return true;
24888 unsigned UserTESz = DL->getTypeSizeInBits(
24889 Ty: UserTE->Scalars.front()->getType());
24890 if (all_of(Range&: TEs, P: [&](const TreeEntry *TE) {
24891 auto It = MinBWs.find(Val: TE);
24892 return It != MinBWs.end() &&
24893 It->second.first > UserTESz;
24894 }))
24895 return true;
24896 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
24897 }));
24898 })) {
24899 ToDemote.push_back(Elt: E.Idx);
24900 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24901 auto It = MinBWs.find(Val: UserTE);
24902 if (It != MinBWs.end())
24903 return It->second.first;
24904 unsigned MaxBitWidth =
24905 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
24906 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
24907 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24908 MaxBitWidth = 8;
24909 return MaxBitWidth;
24910 }
24911
24912 if (!E.hasState())
24913 return 0u;
24914
24915 unsigned VF = E.getVectorFactor();
24916 Type *ScalarTy = E.Scalars.front()->getType();
24917 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
24918 auto *TreeRootIT = dyn_cast<IntegerType>(Val: ScalarTy->getScalarType());
24919 if (!TreeRootIT)
24920 return 0u;
24921
24922 if (any_of(Range: E.Scalars,
24923 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
24924 return 0u;
24925
24926 unsigned NumParts = ::getNumberOfParts(
24927 TTI: *TTI, VecTy: getWidenedType(ScalarTy: TreeRootIT, VF: VF * ScalarTyNumElements));
24928
24929 // The maximum bit width required to represent all the values that can be
24930 // demoted without loss of precision. It would be safe to truncate the roots
24931 // of the expression to this width.
24932 unsigned MaxBitWidth = 1u;
24933
24934 // True if the roots can be zero-extended back to their original type,
24935 // rather than sign-extended. We know that if the leading bits are not
24936 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
24937 // True.
24938 // Determine if the sign bit of all the roots is known to be zero. If not,
24939 // IsKnownPositive is set to False.
24940 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
24941 if (isa<PoisonValue>(Val: R))
24942 return true;
24943 KnownBits Known = computeKnownBits(V: R, DL: *DL);
24944 return Known.isNonNegative();
24945 });
24946
24947 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
24948 E.UserTreeIndex.UserTE->hasState() &&
24949 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
24950 MaxBitWidth =
24951 std::min(a: DL->getTypeSizeInBits(
24952 Ty: E.UserTreeIndex.UserTE->Scalars.front()->getType()),
24953 b: DL->getTypeSizeInBits(Ty: ScalarTy));
24954
24955 // We first check if all the bits of the roots are demanded. If they're not,
24956 // we can truncate the roots to this narrower type.
24957 for (Value *Root : E.Scalars) {
24958 if (isa<PoisonValue>(Val: Root))
24959 continue;
24960 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, AC, CxtI: nullptr, DT);
24961 TypeSize NumTypeBits =
24962 DL->getTypeSizeInBits(Ty: Root->getType()->getScalarType());
24963 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24964 // If we can't prove that the sign bit is zero, we must add one to the
24965 // maximum bit width to account for the unknown sign bit. This preserves
24966 // the existing sign bit so we can safely sign-extend the root back to the
24967 // original type. Otherwise, if we know the sign bit is zero, we will
24968 // zero-extend the root instead.
24969 //
24970 // FIXME: This is somewhat suboptimal, as there will be cases where adding
24971 // one to the maximum bit width will yield a larger-than-necessary
24972 // type. In general, we need to add an extra bit only if we can't
24973 // prove that the upper bit of the original type is equal to the
24974 // upper bit of the proposed smaller type. If these two bits are
24975 // the same (either zero or one) we know that sign-extending from
24976 // the smaller type will result in the same value. Here, since we
24977 // can't yet prove this, we are just making the proposed smaller
24978 // type larger to ensure correctness.
24979 if (!IsKnownPositive)
24980 ++BitWidth1;
24981
24982 auto *I = dyn_cast<Instruction>(Val: Root);
24983 if (!I) {
24984 MaxBitWidth = std::max(a: BitWidth1, b: MaxBitWidth);
24985 continue;
24986 }
24987 APInt Mask = DB->getDemandedBits(I);
24988 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24989 MaxBitWidth =
24990 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
24991 }
24992
24993 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24994 MaxBitWidth = 8;
24995
24996 // If the original type is large, but reduced type does not improve the reg
24997 // use - ignore it.
24998 if (NumParts > 1 &&
24999 NumParts ==
25000 ::getNumberOfParts(
25001 TTI: *TTI, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: F->getContext(),
25002 NumBits: bit_ceil(Value: MaxBitWidth)),
25003 VF)))
25004 return 0u;
25005
25006 unsigned Opcode = E.getOpcode();
25007 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
25008 Opcode == Instruction::SExt ||
25009 Opcode == Instruction::ZExt || NumParts > 1;
25010 // Conservatively determine if we can actually truncate the roots of the
25011 // expression. Collect the values that can be demoted in ToDemote and
25012 // additional roots that require investigating in Roots.
25013 DenseSet<const TreeEntry *> Visited;
25014 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
25015 bool NeedToDemote = IsProfitableToDemote;
25016
25017 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
25018 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
25019 IsProfitableToDemote&: NeedToDemote, IsTruncRoot) ||
25020 (MaxDepthLevel <= Limit &&
25021 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
25022 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
25023 DL->getTypeSizeInBits(Ty: TreeRootIT) /
25024 DL->getTypeSizeInBits(
25025 Ty: E.getMainOp()->getOperand(i: 0)->getType()) >
25026 2)))))
25027 return 0u;
25028 // Round MaxBitWidth up to the next power-of-two.
25029 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
25030
25031 return MaxBitWidth;
25032 };
25033
25034 // If we can truncate the root, we must collect additional values that might
25035 // be demoted as a result. That is, those seeded by truncations we will
25036 // modify.
25037 // Add reduction ops sizes, if any.
25038 if (UserIgnoreList &&
25039 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
25040 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
25041 // x i1> to in)).
25042 if (all_of(Range: *UserIgnoreList,
25043 P: [](Value *V) {
25044 return isa<PoisonValue>(Val: V) ||
25045 cast<Instruction>(Val: V)->getOpcode() == Instruction::Add;
25046 }) &&
25047 VectorizableTree.front()->State == TreeEntry::Vectorize &&
25048 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
25049 cast<CastInst>(Val: VectorizableTree.front()->getMainOp())->getSrcTy() ==
25050 Builder.getInt1Ty()) {
25051 ReductionBitWidth = 1;
25052 } else {
25053 for (Value *V : *UserIgnoreList) {
25054 if (isa<PoisonValue>(Val: V))
25055 continue;
25056 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
25057 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
25058 unsigned BitWidth1 = NumTypeBits - NumSignBits;
25059 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
25060 ++BitWidth1;
25061 unsigned BitWidth2 = BitWidth1;
25062 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
25063 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
25064 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
25065 }
25066 ReductionBitWidth =
25067 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
25068 }
25069 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
25070 ReductionBitWidth = 8;
25071
25072 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
25073 }
25074 }
25075 bool IsTopRoot = NodeIdx == 0;
25076 while (NodeIdx < VectorizableTree.size() &&
25077 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
25078 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
25079 RootDemotes.push_back(Elt: NodeIdx);
25080 ++NodeIdx;
25081 IsTruncRoot = true;
25082 }
25083 bool IsSignedCmp = false;
25084 if (UserIgnoreList &&
25085 all_of(Range: *UserIgnoreList,
25086 P: match_fn(P: m_CombineOr(L: m_SMin(L: m_Value(), R: m_Value()),
25087 R: m_SMax(L: m_Value(), R: m_Value())))))
25088 IsSignedCmp = true;
25089 while (NodeIdx < VectorizableTree.size()) {
25090 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
25091 unsigned Limit = 2;
25092 if (IsTopRoot &&
25093 ReductionBitWidth ==
25094 DL->getTypeSizeInBits(
25095 Ty: VectorizableTree.front()->Scalars.front()->getType()))
25096 Limit = 3;
25097 unsigned MaxBitWidth = ComputeMaxBitWidth(
25098 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
25099 IsTruncRoot, IsSignedCmp);
25100 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
25101 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
25102 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
25103 else if (MaxBitWidth == 0)
25104 ReductionBitWidth = 0;
25105 }
25106
25107 for (unsigned Idx : RootDemotes) {
25108 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
25109 uint32_t OrigBitWidth =
25110 DL->getTypeSizeInBits(Ty: V->getType()->getScalarType());
25111 if (OrigBitWidth > MaxBitWidth) {
25112 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
25113 return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
25114 }
25115 return false;
25116 }))
25117 ToDemote.push_back(Elt: Idx);
25118 }
25119 RootDemotes.clear();
25120 IsTopRoot = false;
25121 IsProfitableToDemoteRoot = true;
25122
25123 if (ExtraBitWidthNodes.empty()) {
25124 NodeIdx = VectorizableTree.size();
25125 } else {
25126 unsigned NewIdx = 0;
25127 do {
25128 NewIdx = *ExtraBitWidthNodes.begin();
25129 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
25130 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
25131 NodeIdx = NewIdx;
25132 IsTruncRoot =
25133 NodeIdx < VectorizableTree.size() &&
25134 VectorizableTree[NodeIdx]->UserTreeIndex &&
25135 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
25136 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
25137 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
25138 Instruction::Trunc &&
25139 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
25140 IsSignedCmp =
25141 NodeIdx < VectorizableTree.size() &&
25142 VectorizableTree[NodeIdx]->UserTreeIndex &&
25143 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
25144 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
25145 Instruction::ICmp &&
25146 any_of(
25147 Range&: VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
25148 P: [&](Value *V) {
25149 auto *IC = dyn_cast<ICmpInst>(Val: V);
25150 return IC && (IC->isSigned() ||
25151 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
25152 SQ: SimplifyQuery(*DL)) ||
25153 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
25154 SQ: SimplifyQuery(*DL)));
25155 });
25156 }
25157
25158 // If the maximum bit width we compute is less than the width of the roots'
25159 // type, we can proceed with the narrowing. Otherwise, do nothing.
25160 if (MaxBitWidth == 0 ||
25161 MaxBitWidth >=
25162 cast<IntegerType>(Val: TreeRoot.front()->getType()->getScalarType())
25163 ->getBitWidth()) {
25164 if (UserIgnoreList)
25165 AnalyzedMinBWVals.insert_range(R&: TreeRoot);
25166 NodesToKeepBWs.insert_range(R&: ToDemote);
25167 continue;
25168 }
25169
25170 // Finally, map the values we can demote to the maximum bit with we
25171 // computed.
25172 for (unsigned Idx : ToDemote) {
25173 TreeEntry *TE = VectorizableTree[Idx].get();
25174 if (MinBWs.contains(Val: TE))
25175 continue;
25176 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
25177 if (isa<PoisonValue>(Val: R))
25178 return false;
25179 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
25180 });
25181 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
25182 }
25183 }
25184}
25185
25186PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
25187 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
25188 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
25189 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
25190 auto *AA = &AM.getResult<AAManager>(IR&: F);
25191 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
25192 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
25193 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
25194 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
25195 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
25196
25197 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
25198 if (!Changed)
25199 return PreservedAnalyses::all();
25200
25201 PreservedAnalyses PA;
25202 PA.preserveSet<CFGAnalyses>();
25203 return PA;
25204}
25205
25206bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
25207 TargetTransformInfo *TTI_,
25208 TargetLibraryInfo *TLI_, AAResults *AA_,
25209 LoopInfo *LI_, DominatorTree *DT_,
25210 AssumptionCache *AC_, DemandedBits *DB_,
25211 OptimizationRemarkEmitter *ORE_) {
25212 if (!RunSLPVectorization)
25213 return false;
25214 SE = SE_;
25215 TTI = TTI_;
25216 TLI = TLI_;
25217 AA = AA_;
25218 LI = LI_;
25219 DT = DT_;
25220 AC = AC_;
25221 DB = DB_;
25222 DL = &F.getDataLayout();
25223
25224 Stores.clear();
25225 GEPs.clear();
25226 bool Changed = false;
25227
25228 // If the target claims to have no vector registers don't attempt
25229 // vectorization.
25230 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
25231 LLVM_DEBUG(
25232 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
25233 return false;
25234 }
25235
25236 // Don't vectorize when the attribute NoImplicitFloat is used.
25237 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
25238 return false;
25239
25240 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
25241
25242 // Use the bottom up slp vectorizer to construct chains that start with
25243 // store instructions.
25244 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
25245
25246 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
25247 // delete instructions.
25248
25249 // Update DFS numbers now so that we can use them for ordering.
25250 DT->updateDFSNumbers();
25251
25252 // Scan the blocks in the function in post order.
25253 for (auto *BB : post_order(G: &F.getEntryBlock())) {
25254 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()))
25255 continue;
25256
25257 // Start new block - clear the list of reduction roots.
25258 R.clearReductionData();
25259 collectSeedInstructions(BB);
25260
25261 // Vectorize trees that end at stores.
25262 if (!Stores.empty()) {
25263 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
25264 << " underlying objects.\n");
25265 Changed |= vectorizeStoreChains(R);
25266 }
25267
25268 // Vectorize trees that end at reductions.
25269 Changed |= vectorizeChainsInBlock(BB, R);
25270
25271 // Vectorize the index computations of getelementptr instructions. This
25272 // is primarily intended to catch gather-like idioms ending at
25273 // non-consecutive loads.
25274 if (!GEPs.empty()) {
25275 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
25276 << " underlying objects.\n");
25277 Changed |= vectorizeGEPIndices(BB, R);
25278 }
25279 }
25280
25281 if (Changed) {
25282 R.optimizeGatherSequence();
25283 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
25284 }
25285 return Changed;
25286}
25287
/// Attempt to vectorize \p Chain, a group of related stores, as one wide
/// vector store of VF == Chain.size().
/// \param Idx Offset of the chain within the parent store group (used for
///        debug output only).
/// \param MinVF Smallest vectorization factor worth attempting here.
/// \param Size [out] Canonical graph size of the analyzed tree, used by the
///        caller's heuristics; 0 if no tree was analyzed.
/// \returns true if the chain was vectorized, false if it was rejected, and
///          std::nullopt if the roots could not be scheduled/were gathered.
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
  unsigned VF = Chain.size();

  // Reject element sizes / VFs that cannot form a legal (full or
  // power-of-2) vector for the target.
  if (!has_single_bit(Value: Sz) ||
      !hasFullVectorsOrPowerOf2(
          TTI: *TTI, Ty: cast<StoreInst>(Val: Chain.front())->getValueOperand()->getType(),
          Sz: VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
    // all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  // The tree is rooted at the stored values, not the stores themselves.
  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S =
      Analysis.buildInstructionsState(VL: ValOps.getArrayRef(), R);
  if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ValOps.front()->getType(),
                                 Sz: ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(Value: ValOps.size() + 1));
    // Heuristic early exit before building the full tree: bail out when the
    // size is disallowed and the (non-load) main op either cannot be removed
    // or has uses escaping the chain, or when too many of the operands did
    // not form a common state. Size is set to 1 when there was a common
    // state (but a bad size) and to 2 otherwise, as feedback to the caller.
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(Range: ValOps.getArrayRef(),
                 P: [&](Value *V) {
                   return !isa<ExtractElementInst>(Val: V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(Range: V->users(), P: [&](User *U) {
                             return !Stores.contains(V: U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  R.buildTree(Roots: Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    // Scheduling/gathering failure: signal the caller to handle this VF
    // specially via std::nullopt.
    if (R.isGathered(V: Chain.front()) ||
        R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.computeMinimumValueSizes();

  InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
  R.buildExternalUses();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost(TreeCost);

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
  // A sufficiently negative cost means vectorization is a win.
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "StoresVectorized",
                                        cast<StoreInst>(Val: Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
25383
25384namespace {
/// A group of related stores which we are in the process of vectorizing,
/// a subset of which may already be vectorized. Stores context information
/// about the group as a whole as well as information about what VFs need
/// to be attempted still.
class StoreChainContext {
public:
  /// Per-store pair of (index into RangeSizesByIdx, cached tree size).
  using SizePair = std::pair<unsigned, unsigned>;

  /// \p Ops and \p RangeSizes are copied into the context;
  /// \p RangeSizesByIdx is held by reference and is therefore shared with
  /// any other context built over the same store group.
  explicit StoreChainContext(ArrayRef<Value *> Ops,
                             ArrayRef<SizePair> RangeSizes,
                             SmallVector<unsigned> &RangeSizesByIdx)
      : Operands(Ops), RangeSizesStorage(RangeSizes),
        RangeSizesByIdx(RangeSizesByIdx) {}

  /// Set up initial values using the already set Operands
  bool initializeContext(BoUpSLP &R, const DataLayout &DL,
                         const TargetTransformInfo &TTI);
  /// Get the current VF
  std::optional<unsigned> getCurrentVF() const;
  /// Return the maximum VF for the context
  unsigned getMaxVF() const { return MaxVF; }
  /// Attempt to vectorize Operands for the given VF
  /// Returns false if no more attempts should be made for the context
  bool vectorizeOneVF(const TargetTransformInfo &TTI, unsigned VF,
                      BoUpSLP::ValueSet &VectorizedStores, bool &Changed,
                      llvm::function_ref<std::optional<bool>(
                          ArrayRef<Value *>, unsigned, unsigned, unsigned &)>
                          VectorizeStoreChain);

private:
  /// True if the element still awaits vectorization (has a live, non-zero
  /// range size).
  bool isNotVectorized(const SizePair &P) const {
    return P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] > 0;
  }

  /// True if the element needs no further attempts. Note: locally
  /// unvectorizable elements count as "vectorized" (done) here.
  bool isVectorized(const SizePair &P) const {
    return P.first == LocallyUnvectorizable || RangeSizesByIdx[P.first] == 0;
  }

  /// True if \p Size is at least the live range size recorded for \p P.
  bool isVFProfitable(unsigned Size, const SizePair &P) const {
    assert(P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] &&
           "Cannot check profitability of vectorized element");
    return Size >= RangeSizesByIdx[P.first];
  }

  /// True if \p Size equals the live range size recorded for \p P.
  bool firstSizeSame(unsigned Size, const SizePair &P) const {
    assert(P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] &&
           "Cannot check profitability of vectorized element");
    return Size == RangeSizesByIdx[P.first];
  }

  /// Return the index of the first unvectorized store after \p StartIdx
  unsigned getFirstUnvecStore(unsigned StartIdx = 0) const;
  /// Return the index of the first vectorized store after \p StartIdx
  unsigned getFirstVecStoreAfter(unsigned StartIdx) const;
  /// Return true if all stores have been vectorized
  bool allVectorized() const;
  /// Return true if all elements in the given range match \p TreeSize
  bool isFirstSizeSameRange(unsigned StartIdx, unsigned Length,
                            unsigned TreeSize) const;
  /// Return true if the \p TreeSize is profitable for all elements in the range
  bool allOfRangeProfitable(unsigned StartIdx, unsigned Length,
                            unsigned TreeSize) const;
  /// Update the live (first) range sizes from the cached values (second)
  void updateRangeSizesFromCache();
  /// Update the cached (second) range sizes with the given \p TreeSize
  void updateCachedRangeSizes(unsigned StartIdx, unsigned Length,
                              unsigned TreeSize);
  /// Update CandidateVFs for secondary iterations
  bool updateCandidateVFs(const TargetTransformInfo &TTI);
  /// Remove the current VF from the queue
  void incrementVF() {
    if (!CandidateVFs.empty())
      CandidateVFs.pop();
  }
  /// Record vectorization of the provided range
  void markRangeVectorized(unsigned StartIdx, unsigned Length,
                           unsigned &FirstUnvecStore, unsigned &MaxSliceEnd);
  /// Checks if the quadratic mean deviation is less than 90% of the mean size.
  bool checkTreeSizes(const unsigned SliceStartIdx, const unsigned VF) const;

  /// In RangeSizes, element has not been vectorized, but due to the elements
  /// around it being vectorized, it does not have enough neighboring elements
  /// to make a chain longer than MinVF as part of the current Context
  static constexpr unsigned LocallyUnvectorizable =
      std::numeric_limits<unsigned>::max();
  /// Maximum number of iterations through CandidateVFs
  static constexpr unsigned MaxAttempts = 4;

  /// For the StoreTy/Stride in the given group, what is the smallest VF
  /// that can be used
  unsigned MinVF = 0;
  /// Maximum number of instructions that can be vectorized, either
  /// constrained by register width or operands size.
  unsigned MaxVF = 0;
  /// MaxRegVF represents the number of instructions (scalar, or vector in
  /// case of revec) that can be vectorized to naturally fit in a vector
  /// register.
  unsigned MaxRegVF = 0;
  /// The largest VF checked in the current Repeat
  unsigned ProbeVF = 0;
  /// Type of the Stores in `Operands`
  Type *StoreTy = nullptr;
  /// Which VFs do we want to attempt for this chain
  std::queue<unsigned> CandidateVFs;
  /// Stores that compose this chain
  BoUpSLP::ValueList Operands;
  /// Track the TreeSizes of prior vectorization attempts using each element,
  /// to help us find early exit cases
  /// - first: contains pointer into RangeSizesByIdx to help us track
  ///   vectorization of elements that belong to multiple chains
  /// - second: contains cached TreeSize value for that element
  SmallVector<SizePair> RangeSizesStorage;
  /// Live view over RangeSizesStorage; established by initializeContext().
  MutableArrayRef<SizePair> RangeSizes;
  /// RangeSize information for all elements in any chain
  /// Needed since may be overlap between chains
  SmallVector<unsigned> &RangeSizesByIdx;
  /// What element index is the end of the to be vectorized Operands
  /// i.e. Operands.size() == 16, and 12-15 were vectorized, then End == 12
  unsigned End = 0;
  /// How many times has CandidateVFs been refilled, prevents excessive
  /// attempts at vectorizing large VFs
  unsigned Repeat = 1;
  /// Did any vectorization occur for the current iteration over CandidateVFs
  bool RepeatChanged = false;
  /// Store information about failed vectorization attempts due to scheduling
  SmallDenseMap<Value *, SizePair> NonSchedulable;
};
25512
// Record that Operands[StartIdx, StartIdx + Length) was vectorized: zero the
// live and cached sizes for the slice, and mark any neighboring run of
// unvectorized stores that became shorter than MinVF as LocallyUnvectorizable
// so it is never probed again. The caller's FirstUnvecStore / MaxSliceEnd
// (and possibly End) bounds are tightened accordingly.
void StoreChainContext::markRangeVectorized(unsigned StartIdx, unsigned Length,
                                            unsigned &FirstUnvecStore,
                                            unsigned &MaxSliceEnd) {
  // Size 0 in both the shared table and the cache marks "done".
  for (SizePair &P : RangeSizes.slice(N: StartIdx, M: Length))
    RangeSizesByIdx[P.first] = P.second = 0;
  // The gap [FirstUnvecStore, StartIdx) before the slice cannot reach MinVF:
  // give up on it and advance FirstUnvecStore past the slice.
  if (StartIdx < FirstUnvecStore + MinVF) {
    for (SizePair &P :
         RangeSizes.slice(N: FirstUnvecStore, M: StartIdx - FirstUnvecStore)) {
      P.first = LocallyUnvectorizable;
      P.second = 0;
    }
    FirstUnvecStore = StartIdx + Length;
  }
  // Same for a too-short gap [StartIdx + Length, MaxSliceEnd) after the
  // slice; shrink the end of the searchable range.
  if (StartIdx + Length > MaxSliceEnd - MinVF) {
    for (SizePair &P : RangeSizes.slice(N: StartIdx + Length,
                                        M: MaxSliceEnd - (StartIdx + Length))) {
      P.first = LocallyUnvectorizable;
      P.second = 0;
    }
    if (MaxSliceEnd == End)
      End = StartIdx;
    MaxSliceEnd = StartIdx;
  }
}
25537
// Compute MinVF / MaxVF / MaxRegVF for this group, seed the CandidateVFs
// queue (largest VF first, halving down to MinVF) and set up range tracking.
// Returns false when no VF in [MinVF, MaxVF] is feasible.
bool StoreChainContext::initializeContext(BoUpSLP &R, const DataLayout &DL,
                                          const TargetTransformInfo &TTI) {
  // Initialize range tracking in context.
  RangeSizes = MutableArrayRef(RangeSizesStorage);

  // Upper bound: how many elements fit in one vector register.
  unsigned MaxVecRegSize = R.getMaxVecRegSize();
  unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
  unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);

  MaxVF = std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
  auto *Store = cast<StoreInst>(Val: Operands[0]);
  StoreTy = Store->getValueOperand()->getType();
  Type *ValueTy = StoreTy;
  // If the stored value is a truncate, size the minimum VF by the wider
  // source type.
  if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
    ValueTy = Trunc->getSrcTy();
  // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
  // getStoreMinimumVF only support scalar type as arguments. As a result,
  // we need to use the element type of StoreTy and ValueTy to retrieve the
  // VF and then transform it back.
  // Remember: VF is defined as the number we want to vectorize, not the
  // number of elements in the final vector.
  Type *StoreScalarTy = StoreTy->getScalarType();
  MinVF = PowerOf2Ceil(A: TTI.getStoreMinimumVF(
      VF: R.getMinVF(Sz: DL.getTypeStoreSizeInBits(Ty: StoreScalarTy)), ScalarMemTy: StoreScalarTy,
      ScalarValTy: ValueTy->getScalarType(), Alignment: Store->getAlign(),
      AddrSpace: Store->getPointerAddressSpace()));
  MinVF /= getNumElements(Ty: StoreTy);
  MinVF = std::max<unsigned>(a: 2, b: MinVF);

  if (MaxVF < MinVF) {
    LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                      << ") < "
                      << "MinVF (" << MinVF << ")\n");
    return false;
  }

  unsigned NonPowerOf2VF = 0;
  if (VectorizeNonPowerOf2) {
    // First try vectorizing with a non-power-of-2 VF. At the moment, only
    // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
    // lanes are used.
    unsigned CandVF = std::clamp<unsigned>(val: Operands.size(), lo: MinVF, hi: MaxVF);
    if (has_single_bit(Value: CandVF + 1)) {
      NonPowerOf2VF = CandVF;
      assert(NonPowerOf2VF != MaxVF &&
             "Non-power-of-2 VF should not be equal to MaxVF");
    }
  }

  // Remember the register-sized VF before capping MaxVF by the number of
  // available operands.
  MaxRegVF = MaxVF;

  MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
  if (MaxVF < MinVF) {
    LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                      << ") < "
                      << "MinVF (" << MinVF << ")\n");
    return false;
  }

  // Queue the candidate VFs largest-first, halving down to MinVF.
  for (unsigned VF = std::max(a: MaxVF, b: NonPowerOf2VF); VF >= MinVF;
       VF = divideCeil(Numerator: VF, Denominator: 2))
    CandidateVFs.push(x: VF);

  End = Operands.size();
  ProbeVF = MaxVF;
  return true;
}
25605
25606// Return the index of the first unvectorized store after \p StartIdx
25607unsigned StoreChainContext::getFirstUnvecStore(unsigned StartIdx) const {
25608 return std::distance(
25609 first: RangeSizes.begin(),
25610 last: find_if(Range: RangeSizes.drop_front(N: StartIdx),
25611 P: [this](const SizePair &P) { return this->isNotVectorized(P); }));
25612}
25613
25614// Return the index of the first vectorized store after \p StartIdx
25615unsigned StoreChainContext::getFirstVecStoreAfter(unsigned StartIdx) const {
25616 return std::distance(
25617 first: RangeSizes.begin(),
25618 last: find_if(Range: RangeSizes.drop_front(N: StartIdx),
25619 P: [this](const SizePair &P) { return this->isVectorized(P); }));
25620}
25621
25622// Return true if all stores have been vectorized
25623bool StoreChainContext::allVectorized() const {
25624 return all_of(Range: RangeSizes,
25625 P: [this](const SizePair &P) { return this->isVectorized(P); });
25626}
25627
25628// Return true if all elements in the given range match \p TreeSize
25629bool StoreChainContext::isFirstSizeSameRange(unsigned StartIdx, unsigned Length,
25630 unsigned TreeSize) const {
25631 return all_of(Range: RangeSizes.slice(N: StartIdx, M: Length),
25632 P: [TreeSize, this](const SizePair &P) {
25633 return firstSizeSame(Size: TreeSize, P);
25634 });
25635}
25636
25637// Return true if the \p TreeSize is profitable for all elements in the range
25638bool StoreChainContext::allOfRangeProfitable(unsigned StartIdx, unsigned Length,
25639 unsigned TreeSize) const {
25640 return all_of(Range: RangeSizes.slice(N: StartIdx, M: Length),
25641 P: [TreeSize, this](const SizePair &P) {
25642 return isVFProfitable(Size: TreeSize, P);
25643 });
25644}
25645
25646// Update the live (first) range sizes from the cached values (second)
25647void StoreChainContext::updateRangeSizesFromCache() {
25648 for (SizePair &P : RangeSizes) {
25649 if (P.first != LocallyUnvectorizable && RangeSizesByIdx[P.first] != 0)
25650 RangeSizesByIdx[P.first] = std::max(a: P.second, b: RangeSizesByIdx[P.first]);
25651 }
25652}
25653
25654// Update the cached (second) range sizes with the given \p TreeSize
25655void StoreChainContext::updateCachedRangeSizes(unsigned StartIdx,
25656 unsigned Length,
25657 unsigned TreeSize) {
25658 for (SizePair &P : RangeSizes.slice(N: StartIdx, M: Length))
25659 P.second = std::max(a: P.second, b: TreeSize);
25660}
25661
25662bool StoreChainContext::updateCandidateVFs(const TargetTransformInfo &TTI) {
25663 assert(CandidateVFs.empty() && "Did not use all VFs before refilling");
25664 constexpr unsigned StoresLimit = 64;
25665 const unsigned MaxTotalNum = std::min<unsigned>(
25666 a: Operands.size(), b: static_cast<unsigned>(End - getFirstUnvecStore()));
25667 unsigned VF = bit_ceil(Value: ProbeVF) * 2;
25668 if (VF > MaxTotalNum || VF >= StoresLimit)
25669 return false;
25670 // Attempt again to vectorize even larger chains if all previous
25671 // attempts were unsuccessful because of the cost issues.
25672 unsigned Limit =
25673 getFloorFullVectorNumberOfElements(TTI, Ty: StoreTy, Sz: MaxTotalNum);
25674 if (bit_floor(Value: Limit) == VF && Limit != VF)
25675 CandidateVFs.push(x: Limit);
25676 CandidateVFs.push(x: VF);
25677 ProbeVF = CandidateVFs.front();
25678 ++Repeat;
25679 RepeatChanged = false;
25680 return true;
25681}
25682
25683// Get the current VF
25684std::optional<unsigned> StoreChainContext::getCurrentVF() const {
25685 if (CandidateVFs.empty())
25686 return std::nullopt;
25687 return CandidateVFs.front();
25688}
25689
25690bool StoreChainContext::checkTreeSizes(const unsigned SliceStartIdx,
25691 const unsigned VF) const {
25692 auto Sizes = RangeSizes.slice(N: SliceStartIdx, M: VF);
25693 unsigned Num = 0;
25694 uint64_t Sum = std::accumulate(
25695 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
25696 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
25697 unsigned Size = Val.first == StoreChainContext::LocallyUnvectorizable
25698 ? 0
25699 : RangeSizesByIdx[Val.first];
25700 if (Size == 1)
25701 return V;
25702 ++Num;
25703 return V + Size;
25704 });
25705 if (Num == 0)
25706 return true;
25707 uint64_t Mean = Sum / Num;
25708 if (Mean == 0)
25709 return true;
25710 uint64_t Dev = std::accumulate(
25711 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
25712 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
25713 unsigned P =
25714 Val.first == StoreChainContext::LocallyUnvectorizable
25715 ? 0
25716 : RangeSizesByIdx[Val.first];
25717 if (P == 1)
25718 return V;
25719 return V + (P - Mean) * (P - Mean);
25720 }) /
25721 Num;
25722 return Dev * 96 / (Mean * Mean) == 0;
25723}
25724
// Try to vectorize slices of size \p VF out of the not-yet-vectorized stores
// in Operands. Successful slices are recorded in \p VectorizedStores and
// \p Changed; per-slice tree sizes are cached to steer later attempts with
// smaller VFs. Returns false when this context is finished (everything
// vectorized, attempt budget exhausted, or no candidate VFs remain).
bool StoreChainContext::vectorizeOneVF(
    const TargetTransformInfo &TTI, unsigned VF,
    BoUpSLP::ValueSet &VectorizedStores, bool &Changed,
    llvm::function_ref<std::optional<bool>(ArrayRef<Value *>, unsigned,
                                           unsigned, unsigned &)>
        VectorizeStoreChain) {
  bool AnyProfitableGraph = false;
  unsigned FirstUnvecStore = getFirstUnvecStore();

  // Form slices of size VF starting from FirstUnvecStore and try to
  // vectorize them.
  while (FirstUnvecStore < End) {
    // Only probe within the contiguous unvectorized run that ends at the
    // next already-vectorized store.
    unsigned FirstVecStore = getFirstVecStoreAfter(StartIdx: FirstUnvecStore);
    unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
    for (unsigned SliceStartIdx = FirstUnvecStore;
         SliceStartIdx + VF <= MaxSliceEnd;) {
      // Skip slices whose previously recorded tree sizes vary too much.
      if (!checkTreeSizes(SliceStartIdx, VF)) {
        ++SliceStartIdx;
        continue;
      }
      ArrayRef<Value *> Slice = ArrayRef(Operands).slice(N: SliceStartIdx, M: VF);
      assert(all_of(Slice,
                    [&](Value *V) {
                      return cast<StoreInst>(V)->getValueOperand()->getType() ==
                             cast<StoreInst>(Slice.front())
                                 ->getValueOperand()
                                 ->getType();
                    }) &&
             "Expected all operands of same type.");
      // Skip VFs already known to fail scheduling for this slice start.
      if (!NonSchedulable.empty()) {
        auto [NonSchedSizeMax, NonSchedSizeMin] =
            NonSchedulable.lookup(Val: Slice.front());
        if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
          // VF is too ambitious. Try to vectorize another slice before
          // trying a smaller VF.
          SliceStartIdx += NonSchedSizeMax;
          continue;
        }
      }
      unsigned TreeSize;
      std::optional<bool> Res =
          VectorizeStoreChain(Slice, SliceStartIdx, MinVF, TreeSize);
      if (!Res) {
        // Update the range of non schedulable VFs for slices starting
        // at SliceStartIdx.
        NonSchedulable.try_emplace(Key: Slice.front(), Args: std::make_pair(x&: VF, y&: VF))
            .first->getSecond()
            .second = VF;
      } else if (*Res) {
        // Mark the vectorized stores so that we don't vectorize them
        // again.
        VectorizedStores.insert_range(R&: Slice);
        AnyProfitableGraph = RepeatChanged = Changed = true;
        // If we vectorized initial block, no need to try to vectorize
        // it again.
        markRangeVectorized(StartIdx: SliceStartIdx, Length: VF, FirstUnvecStore, MaxSliceEnd);
        SliceStartIdx += VF;
        continue;
      }
      // Not profitable (Res == false). If some element of the slice has a
      // recorded tree larger than this one, skip the whole slice instead of
      // sliding by one.
      if (VF > 2 && Res && !allOfRangeProfitable(StartIdx: SliceStartIdx, Length: VF, TreeSize)) {
        SliceStartIdx += VF;
        continue;
      }
      // Check for the very big VFs that we're not rebuilding same
      // trees, just with larger number of elements.
      if (VF > MaxRegVF && TreeSize > 1 &&
          isFirstSizeSameRange(StartIdx: SliceStartIdx, Length: VF, TreeSize)) {
        SliceStartIdx += VF;
        while (SliceStartIdx != MaxSliceEnd &&
               isFirstSizeSameRange(StartIdx: SliceStartIdx, Length: 1, TreeSize))
          ++SliceStartIdx;
        continue;
      }
      // Remember this attempt's tree size for future profitability checks.
      if (TreeSize > 1)
        updateCachedRangeSizes(StartIdx: SliceStartIdx, Length: VF, TreeSize);
      ++SliceStartIdx;
      AnyProfitableGraph = true;
    }
    if (FirstUnvecStore >= End)
      break;
    // A leftover run shorter than VF but at least MinVF long may still
    // succeed with a smaller VF later on.
    if (MaxSliceEnd - FirstUnvecStore < VF &&
        MaxSliceEnd - FirstUnvecStore >= MinVF)
      AnyProfitableGraph = true;
    FirstUnvecStore = getFirstUnvecStore(StartIdx: MaxSliceEnd);
  }
  // Nothing profitable at a power-of-2 VF at/above the register width:
  // drop all remaining (smaller) candidates.
  if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(Value: VF))
    while (!CandidateVFs.empty())
      CandidateVFs.pop();

  // For the MaxRegVF case, save RangeSizes to limit compile time
  if (VF == MaxRegVF)
    updateRangeSizesFromCache();

  incrementVF();
  if (!getCurrentVF()) {
    // All values vectorized - exit.
    if (allVectorized())
      return false;
    // Check if tried all attempts or no need for the last attempts at
    // all.
    if (Repeat >= MaxAttempts ||
        (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
      return false;

    if (!updateCandidateVFs(TTI))
      return false;
    // Avoid double update of cache sizes
    if (VF != MaxRegVF)
      updateRangeSizesFromCache();
  }
  return true;
}
25837
25838/// A group of stores that we'll try to bundle together using vector ops.
25839/// They are ordered using the signed distance of their address operand to the
25840/// address of this group's BaseInstr.
25841class RelatedStoreInsts {
25842public:
25843 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
25844 : AllStores(AllStores) {
25845 reset(NewBaseInstr: BaseInstrIdx);
25846 }
25847
25848 void reset(unsigned NewBaseInstr) {
25849 assert(NewBaseInstr < AllStores.size() &&
25850 "Instruction index out of bounds");
25851 BaseInstrIdx = NewBaseInstr;
25852 Instrs.clear();
25853 insertOrLookup(InstrIdx: NewBaseInstr, PtrDist: 0);
25854 }
25855
25856 /// Tries to insert \p InstrIdx as the store with a pointer distance of
25857 /// \p PtrDist.
25858 /// Does nothing if there is already a store with that \p PtrDist.
25859 /// \returns The previously associated Instruction index, or std::nullopt
25860 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
25861 auto [It, Inserted] = Instrs.emplace(args&: PtrDist, args&: InstrIdx);
25862 return Inserted ? std::nullopt : std::make_optional(t&: It->second);
25863 }
25864
25865 using DistToInstMap = std::map<int64_t, unsigned>;
25866 const DistToInstMap &getStores() const { return Instrs; }
25867
25868 /// If \p SI is related to this group of stores, return the distance of its
25869 /// pointer operand to the one the group's BaseInstr.
25870 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
25871 ScalarEvolution &SE) const {
25872 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
25873 return getPointersDiff(
25874 ElemTyA: BaseStore.getValueOperand()->getType(), PtrA: BaseStore.getPointerOperand(),
25875 ElemTyB: SI.getValueOperand()->getType(), PtrB: SI.getPointerOperand(), DL, SE,
25876 /*StrictCheck=*/true);
25877 }
25878
25879 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
25880 /// Stores whose index is less than \p MinSafeIdx will be dropped.
25881 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
25882 int64_t DistFromCurBase) {
25883 DistToInstMap PrevSet = std::move(Instrs);
25884 reset(NewBaseInstr: NewBaseInstIdx);
25885
25886 // Re-insert stores that come after MinSafeIdx to try and vectorize them
25887 // again. Their distance will be "rebased" to use NewBaseInstIdx as
25888 // reference.
25889 for (auto [Dist, InstIdx] : PrevSet) {
25890 if (InstIdx >= MinSafeIdx)
25891 insertOrLookup(InstrIdx: InstIdx, PtrDist: Dist - DistFromCurBase);
25892 }
25893 }
25894
25895 /// Remove all stores that have been vectorized from this group.
25896 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
25897 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
25898 Range: reverse(C&: Instrs), P: [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
25899 return VectorizedStores.contains(Ptr: AllStores[DistAndIdx.second]);
25900 });
25901
25902 // Get a forward iterator pointing after the last vectorized store and erase
25903 // all stores before it so we don't try to vectorize them again.
25904 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
25905 Instrs.erase(first: Instrs.begin(), last: VectorizedStoresEnd);
25906 }
25907
25908private:
25909 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
25910 unsigned BaseInstrIdx;
25911
25912 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
25913 DistToInstMap Instrs;
25914
25915 /// Reference to all the stores in the BB being analyzed.
25916 ArrayRef<StoreInst *> AllStores;
25917};
25918
25919} // end anonymous namespace
25920
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Given one group of related stores (sorted by pointer distance from the
  // group's base store), split it into runs of consecutive stores (distance
  // delta of exactly 1), build a StoreChainContext per not-yet-visited run,
  // and try to vectorize each context, iterating VFs from largest down.
  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    unsigned GlobalMaxVF = 0;
    SmallVector<unsigned> RangeSizesByIdx(StoreSeq.size(), 1);
    SmallVector<std::unique_ptr<StoreChainContext>> AllContexts;
    BoUpSLP::ValueList Operands;
    SmallVector<StoreChainContext::SizePair> RangeSizes;
    for (auto [Idx, Data] : enumerate(First: StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      // Extend the current run while the stores stay consecutive. On the last
      // element, fall through so the pending run is flushed below.
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Elt: Stores[InstIdx]);
        RangeSizes.emplace_back(Args&: Idx, Args: 1);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }

      // Flush the finished run: build a context for it unless the same run
      // (identified by its endpoints, stored values and length) was already
      // visited in a previous attempt.
      if (Operands.size() > 1 &&
          Visited
              .insert(V: {Operands.front(),
                       cast<StoreInst>(Val: Operands.front())->getValueOperand(),
                       Operands.back(),
                       cast<StoreInst>(Val: Operands.back())->getValueOperand(),
                       Operands.size()})
              .second) {
        AllContexts.emplace_back(Args: std::make_unique<StoreChainContext>(
            args&: Operands, args&: RangeSizes, args&: RangeSizesByIdx));
        // Drop contexts that fail initialization; otherwise track the largest
        // VF any context can attempt.
        if (!AllContexts.back()->initializeContext(R, DL: *DL, TTI: *TTI))
          AllContexts.pop_back();
        else
          GlobalMaxVF = std::max(a: GlobalMaxVF, b: AllContexts.back()->getMaxVF());
      }
      // Start a new run with the current store (unless we are at the end).
      Operands.clear();
      RangeSizes.clear();
      if (Idx != StoreSeq.size() - 1) {
        Operands.push_back(Elt: Stores[InstIdx]);
        RangeSizes.emplace_back(Args&: Idx, Args: 1);
        PrevDist = Dist;
      }
    }

    // Walk VF limits from the global maximum down by powers of two. Each
    // context advances through its own VF sequence until it drops below the
    // current limit; a context whose vectorization attempt fails is reset to
    // null and skipped from then on.
    for (unsigned LimitVF = GlobalMaxVF; LimitVF > 0;
         LimitVF = bit_ceil(Value: LimitVF) / 2) {
      for (auto &CtxPtr : AllContexts) {
        if (!CtxPtr)
          break;
        StoreChainContext &Context = *CtxPtr;
        for (std::optional<unsigned> VFUnval = Context.getCurrentVF();
             VFUnval && *VFUnval >= LimitVF; VFUnval = Context.getCurrentVF()) {
          unsigned VF = *VFUnval;
          if (!Context.vectorizeOneVF(
                  TTI: *TTI, VF, VectorizedStores, Changed,
                  VectorizeStoreChain: [this, &R](ArrayRef<Value *> Chain, unsigned Idx,
                             unsigned MinVF, unsigned &Size) {
                    return vectorizeStoreChain(Chain, R, Idx, MinVF, Size);
                  })) {
            CtxPtr.reset();
            break;
          }
        }
      }
    }
  };

  /// Groups of stores to vectorize
  SmallVector<RelatedStoreInsts> SortedStores;

  // Inserts the specified store SI with the given index Idx to the set of the
  // stores. If the store with the same distance is found already - stop
  // insertion, try to vectorize already found stores. If some stores from this
  // sequence were not vectorized - try to vectorize them with the new store
  // later. But this logic is applied only to the stores, that come before the
  // previous store with the same distance.
  // Example:
  // 1. store x, %p
  // 2. store y, %p+1
  // 3. store z, %p+2
  // 4. store a, %p
  // 5. store b, %p+3
  // - Scan this from the last to first store. The very first bunch of stores is
  // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
  // vector).
  // - The next store in the list - #1 - has the same distance from store #5 as
  // the store #4.
  // - Try to vectorize sequence of stores 4,2,3,5.
  // - If all these stores are vectorized - just drop them.
  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
  // - Start new stores sequence.
  // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in the reversed order, rather they are used in
  // the IR (Stores are reversed already, see vectorizeStoreChains() function).
  // Store #3 can be added -> comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - comes before store #4.
  // This logic allows to improve the compile time, we assume that the stores
  // after previous store with the same distance most likely have memory
  // dependencies and no need to waste compile time to try to vectorize them.
  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    // Find the first existing group whose base store is comparable with SI
    // (i.e. getPointerDiff can compute a constant distance).
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        Range&: SortedStores, P: [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(SI&: *SI, DL: *DL, SE&: *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Args&: Idx, Args&: Stores);
      return;
    }

    // If there is already a store in the group with the same PtrDiff, try to
    // vectorize the existing instructions before adding the current store.
    // Otherwise, insert this store and keep collecting.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(InstrIdx: Idx, PtrDist: *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  // Walk all stores, flushing and restarting the collected groups whenever the
  // stored value type changes (groups never mix value types).
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(First&: Stores)) {
    if (R.isDeleted(I: SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
26077
26078void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
26079 // Initialize the collections. We will make a single pass over the block.
26080 Stores.clear();
26081 GEPs.clear();
26082
26083 // Visit the store and getelementptr instructions in BB and organize them in
26084 // Stores and GEPs according to the underlying objects of their pointer
26085 // operands.
26086 for (Instruction &I : *BB) {
26087 // Ignore store instructions that are volatile or have a pointer operand
26088 // that doesn't point to a scalar type.
26089 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
26090 if (!SI->isSimple())
26091 continue;
26092 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
26093 continue;
26094 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
26095 }
26096
26097 // Ignore getelementptr instructions that have more than one index, a
26098 // constant index, or a pointer operand that doesn't point to a scalar
26099 // type.
26100 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
26101 if (GEP->getNumIndices() != 1)
26102 continue;
26103 Value *Idx = GEP->idx_begin()->get();
26104 if (isa<Constant>(Val: Idx))
26105 continue;
26106 if (!isValidElementType(Ty: Idx->getType()))
26107 continue;
26108 if (GEP->getType()->isVectorTy())
26109 continue;
26110 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
26111 }
26112 }
26113}
26114
/// Tries to vectorize the list of scalar values \p VL by sliding a window over
/// it with progressively smaller vectorization factors. \p MaxVFOnly restricts
/// the attempt to the maximal VF only. Returns true if any bundle in the list
/// was vectorized.
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, TLI: *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit(RemarkBuilder: [&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(O&: OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  // Compute the VF range [MinVF, MaxVF] from the element size and target
  // limits.
  Type *ScalarTy = getValueType(V: VL[0]);
  unsigned Sz = R.getVectorElementSize(V: I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(
      a: getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: ScalarTy, Sz: VL.size()), b: MinVF);
  MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  // NextInst is advanced past every bundle that gets vectorized so that
  // smaller VFs only re-examine the remaining tail of the list.
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: I0->getType(), Sz: VF - 1)) {
    // No actual vectorization should happen, if number of parts is the same as
    // provided vectorization factor (i.e. the scalar type is used for vector
    // code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(Tp: VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      // The window may be truncated by the end of the list.
      unsigned ActualVF = std::min(a: MaxInst - I, b: VF);

      if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ScalarTy, Sz: ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      // Collect up to ActualVF still-live values starting at position I.
      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(N: I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(Val: V);
            !Inst || !R.isDeleted(I: Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      R.buildTree(Roots: Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop(IgnoreReorder: !isa<InsertElementInst>(Val: Ops.front()));
      }
      R.transformNodes();
      R.computeMinimumValueSizes();
      InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
      R.buildExternalUses();

      InstructionCost Cost = R.getTreeCost(TreeCost);
      CandidateFound = true;
      MinCost = std::min(a: MinCost, b: Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      // Negative cost (below threshold) means the vector form is cheaper.
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "VectorizedList",
                                         cast<Instruction>(Val: Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  // Emit a remark explaining why nothing was vectorized, distinguishing
  // "possible but unprofitable" from "impossible".
  if (!Changed && CandidateFound) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
26259
26260namespace {
26261
26262/// Model horizontal reductions.
26263///
26264/// A horizontal reduction is a tree of reduction instructions that has values
26265/// that can be put into a vector as its leaves. For example:
26266///
26267/// mul mul mul mul
26268/// \ / \ /
26269/// + +
26270/// \ /
26271/// +
26272/// This tree has "mul" as its leaf values and "+" as its reduction
26273/// instructions. A reduction can feed into a store or a binary operation
26274/// feeding a phi.
26275/// ...
26276/// \ /
26277/// +
26278/// |
26279/// phi +=
26280///
26281/// Or:
26282/// ...
26283/// \ /
26284/// +
26285/// |
26286/// *p =
26287///
26288class HorizontalReduction {
26289 using ReductionOpsType = SmallVector<Value *, 16>;
26290 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
26291 ReductionOpsListType ReductionOps;
26292 /// List of possibly reduced values.
26293 SmallVector<SmallVector<Value *>> ReducedVals;
26294 /// Maps reduced value to the corresponding reduction operation.
26295 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
26296 WeakTrackingVH ReductionRoot;
26297 /// The type of reduction operation.
26298 RecurKind RdxKind;
26299 /// Checks if the optimization of original scalar identity operations on
26300 /// matched horizontal reductions is enabled and allowed.
26301 bool IsSupportedHorRdxIdentityOp = false;
26302 /// The minimum number of the reduced values.
26303 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
26304 /// Contains vector values for reduction including their scale factor and
26305 /// signedness. The last bool is true, if the value was reduced in-tree.
26306 SmallVector<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
26307 VectorValuesAndScales;
26308
26309 static bool isCmpSelMinMax(Instruction *I) {
26310 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
26311 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
26312 }
26313
26314 // And/or are potentially poison-safe logical patterns like:
26315 // select x, y, false
26316 // select x, true, y
26317 static bool isBoolLogicOp(Instruction *I) {
26318 return isa<SelectInst>(Val: I) &&
26319 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
26320 }
26321
  /// Checks if instruction is associative and can be vectorized.
  /// Classification of a candidate reduction: freely reassociable
  /// (Unordered), only evaluable in original order (Ordered), or not
  /// vectorizable at all (None).
  enum class ReductionOrdering { Unordered, Ordered, None };
  // Ordering classification of the reduction currently being analyzed.
  ReductionOrdering RK = ReductionOrdering::None;
  /// Classifies how a reduction over instruction \p I with recurrence kind
  /// \p Kind may be vectorized. \p TwoElementReduction skips the
  /// associativity checks, since reducing just two values needs no
  /// reassociation.
  static ReductionOrdering isVectorizable(RecurKind Kind, Instruction *I,
                                          bool TwoElementReduction = false) {
    if (Kind == RecurKind::None)
      return ReductionOrdering::None;

    // Integer ops that map to select instructions or intrinsics are fine.
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
        isBoolLogicOp(I))
      return ReductionOrdering::Unordered;

    // No need to check for associativity, if 2 reduced values.
    if (TwoElementReduction)
      return ReductionOrdering::Unordered;

    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
      // FP min/max are associative except for NaN and -0.0. We do not
      // have to rule out -0.0 here because the intrinsic semantics do not
      // specify a fixed result for it.
      return I->getFastMathFlags().noNaNs() ? ReductionOrdering::Unordered
                                            : ReductionOrdering::Ordered;
    }

    if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
      return ReductionOrdering::Unordered;

    if (I->isAssociative())
      return ReductionOrdering::Unordered;

    // Commutative-but-not-associative ops can still be reduced, but only in
    // the original evaluation order.
    return ::isCommutative(I) ? ReductionOrdering::Ordered
                              : ReductionOrdering::None;
  }
26356
26357 static Value *getRdxOperand(Instruction *I, unsigned Index) {
26358 // Poison-safe 'or' takes the form: select X, true, Y
26359 // To make that work with the normal operand processing, we skip the
26360 // true value operand.
26361 // TODO: Change the code and data structures to handle this without a hack.
26362 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
26363 return I->getOperand(i: 2);
26364 return I->getOperand(i: Index);
26365 }
26366
  /// Creates reduction operation with the current opcode.
  /// \param UseSelect If true, integer min/max and logical and/or are emitted
  ///        in select form (cmp+select, or select with a constant arm)
  ///        instead of binary operators / intrinsics.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    Type *OpTy = LHS->getType();
    assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
    switch (Kind) {
    case RecurKind::Or: {
      // Poison-safe form: select LHS, true, RHS. Only applicable when the
      // operand type is already a cmp-result (i1/vector-of-i1) type.
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
        return Builder.CreateSelectWithUnknownProfile(
            C: LHS, True: ConstantInt::getAllOnesValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
            False: RHS, DEBUG_TYPE, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::And: {
      // Poison-safe form: select LHS, RHS, false (same i1 restriction).
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
        return Builder.CreateSelectWithUnknownProfile(
            C: LHS, True: RHS,
            False: ConstantInt::getNullValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
            DEBUG_TYPE, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // Plain associative binops map directly to their opcode.
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      // Integer min/max: either icmp+select, or fall through to the
      // intrinsic form shared with the FP min/max kinds.
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(RK: Kind);
        Value *Cmp = Builder.CreateICmp(P: Pred, LHS, RHS, Name);
        return Builder.CreateSelectWithUnknownProfile(C: Cmp, True: LHS, False: RHS, DEBUG_TYPE,
                                                      Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RK: Kind);
      return Builder.CreateBinaryIntrinsic(ID: Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }
26425
  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    // Two op lists means cmp+select pairs; a single list that contains
    // selects means poison-safe logical and/or. Either way the select form of
    // the op must be emitted.
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
        // Propagate flags separately: cmps from list 0 onto the condition,
        // selects from list 1 onto the select. Wrap flags are intentionally
        // dropped.
        propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
26451
26452public:
  /// Determines the recurrence kind of \p V by pattern-matching its opcode
  /// (and, for selects, its cmp+select min/max idiom). Returns
  /// RecurKind::None when \p V is not a recognized reduction operation.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      return RecurKind::None;
    // Plain binary operators (including the poison-safe select forms of
    // and/or).
    if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
      return RecurKind::Add;
    if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
      return RecurKind::Mul;
    if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
        match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
      return RecurKind::And;
    if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
        match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
      return RecurKind::Or;
    if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
      return RecurKind::Xor;
    if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
      return RecurKind::FAdd;
    if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
      return RecurKind::FMul;

    // FP min/max intrinsics.
    if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMax;
    if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMin;

    if (match(V: I, P: m_FMaximum(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMaximum;
    if (match(V: I, P: m_FMinimum(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
      return RecurKind::SMax;
    if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
      return RecurKind::SMin;
    if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
      return RecurKind::UMax;
    if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1 = extractelement <2 x i32> %a, i32 0
      // %2 = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3 = extractelement <2 x i32> %a, i32 0
      // %4 = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpPredicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      // Each branch requires that the select arm not literally equal to the
      // cmp operand is an identical extractelement (same element re-read).
      if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
        if (!isa<ExtractElementInst>(Val: RHS) ||
            !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
          return RecurKind::None;
      } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
        if (!isa<ExtractElementInst>(Val: LHS) ||
            !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
          return RecurKind::None;
        if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
            !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
            !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
          return RecurKind::None;
      }

      // Map the compare predicate to the corresponding min/max kind.
      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
26553
26554 /// Get the index of the first operand.
26555 static unsigned getFirstOperandIndex(Instruction *I) {
26556 return isCmpSelMinMax(I) ? 1 : 0;
26557 }
26558
26559private:
26560 /// Total number of operands in the reduction operation.
26561 static unsigned getNumberOfOperands(Instruction *I) {
26562 return isCmpSelMinMax(I) ? 3 : 2;
26563 }
26564
26565 /// Checks if the instruction is in basic block \p BB.
26566 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
26567 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
26568 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
26569 auto *Sel = cast<SelectInst>(Val: I);
26570 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
26571 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
26572 }
26573 return I->getParent() == BB;
26574 }
26575
26576 /// Expected number of uses for reduction operations/reduced values.
26577 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
26578 if (IsCmpSelMinMax) {
26579 // SelectInst must be used twice while the condition op must have single
26580 // use only.
26581 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
26582 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
26583 return I->hasNUses(N: 2);
26584 }
26585
26586 // Arithmetic reduction operation must be used once only.
26587 return I->hasOneUse();
26588 }
26589
26590 /// Initializes the list of reduction operations.
26591 void initReductionOps(Instruction *I) {
26592 if (isCmpSelMinMax(I))
26593 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
26594 else
26595 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
26596 }
26597
26598 /// Add all reduction operations for the reduction instruction \p I.
26599 void addReductionOps(Instruction *I) {
26600 if (isCmpSelMinMax(I)) {
26601 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
26602 ReductionOps[1].emplace_back(Args&: I);
26603 } else {
26604 ReductionOps[0].emplace_back(Args&: I);
26605 }
26606 }
26607
26608 static bool isGoodForReduction(ArrayRef<Value *> Data) {
26609 int Sz = Data.size();
26610 auto *I = dyn_cast<Instruction>(Val: Data.front());
26611 return Sz > 1 || isConstant(V: Data.front()) ||
26612 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
26613 }
26614
  /// Optimizes original placement of the reduced values for the reduction tree.
  /// For example, if there is a zext i1 + selects, we can merge select
  /// into zext and improve emission of the reductions. May merge or reorder
  /// the groups in ReducedVals in place.
  void optimizeReducedVals(BoUpSLP &R, DominatorTree &DT, const DataLayout &DL,
                           const TargetTransformInfo &TTI,
                           const TargetLibraryInfo &TLI) {
    // Map each group's leading opcode to the index of the first group that
    // uses it (try_emplace keeps the earliest group on duplicates).
    SmallDenseMap<unsigned, unsigned> UsedReductionOpIds;
    for (const auto [Idx, Vals] : enumerate(First&: ReducedVals)) {
      if (auto *I = dyn_cast<Instruction>(Val: Vals.front()))
        UsedReductionOpIds.try_emplace(Key: I->getOpcode(), Args&: Idx);
    }
    // Check if zext i1 can be merged with select.
    auto ZExtIt = UsedReductionOpIds.find(Val: Instruction::ZExt);
    auto SelectIt = UsedReductionOpIds.find(Val: Instruction::Select);
    if (ZExtIt != UsedReductionOpIds.end() &&
        SelectIt != UsedReductionOpIds.end()) {
      unsigned ZExtIdx = ZExtIt->second;
      unsigned SelectIdx = SelectIt->second;
      auto *ZExt = cast<ZExtInst>(Val: ReducedVals[ZExtIdx].front());
      // ZExt is compatible with Select? Merge select to zext, if so.
      // Compatible = the zext source is i1 (the natural select-condition
      // type) and both groups produce the same result type.
      if (ZExt->getSrcTy()->isIntegerTy(Bitwidth: 1) &&
          ZExt->getType() == ReducedVals[SelectIdx].front()->getType()) {
        ReducedVals[ZExtIdx].append(RHS: ReducedVals[SelectIdx]);
        ReducedVals.erase(CI: std::next(x: ReducedVals.begin(), n: SelectIdx));
      }
    }
    // Merge 1 element reduced value groups into larger group of shl, if only 2
    // groups available. May trigger extra vectorization with the copyables.
    if (ReducedVals.size() == 2 &&
        (ReducedVals.front().size() == 1 || ReducedVals.back().size() == 1)) {
      // Concatenate both groups and ask the compatibility analysis whether
      // the combined list vectorizes as shl with copyable elements.
      SmallVector<Value *> Ops(ReducedVals.front().size() +
                               ReducedVals.back().size());
      copy(Range&: ReducedVals.front(), Out: Ops.begin());
      copy(Range&: ReducedVals.back(),
           Out: std::next(x: Ops.begin(), n: ReducedVals.front().size()));
      InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
      InstructionsState OpS = Analysis.buildInstructionsState(
          VL: Ops, R, /*WithProfitabilityCheck=*/WithProfitabilityCheck: true,
          /*SkipSameCodeCheck=*/SkipSameCodeCheck: true);
      if (OpS && OpS.areInstructionsWithCopyableElements() &&
          OpS.getOpcode() == Instruction::Shl) {
        // The smallest reduced values group should be the first.
        if (ReducedVals.back().size() == 1 && ReducedVals.front().size() != 1)
          std::swap(LHS&: ReducedVals.front(), RHS&: ReducedVals.back());
        // Check if the largest reduced values group are shl and sort them by
        // the constant shift amount to improve chances of vectorization with
        // the copyables. Non-shl (or non-constant-shift) values sort after
        // shl-by-constant values.
        auto Comparator = [](Value *V1, Value *V2) {
          ConstantInt *C1, *C2;
          if (!match(V: V1, P: m_Shl(L: m_Value(), R: m_ConstantInt(CI&: C1))))
            return false;
          if (!match(V: V2, P: m_Shl(L: m_Value(), R: m_ConstantInt(CI&: C2))))
            return true;
          return C1->getZExtValue() < C2->getZExtValue();
        };
        stable_sort(Range&: ReducedVals.back(), C: Comparator);
        // Fold the two groups into a single group.
        ReducedVals.front().append(RHS: ReducedVals.back());
        ReducedVals.pop_back();
      }
    }
  }
26676
26677public:
  HorizontalReduction() = default;
  /// Constructs a reduction rooted at \p I over the explicit operand list
  /// \p Ops (the operand-based matching path, see
  /// matchReductionForOperands()). Sets ReductionLimit to 2, records \p I as
  /// the sole reduction operation and as the producing operation of every
  /// value in \p Ops.
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(V: I);
    ReductionOps.emplace_back().push_back(Elt: I);
    ReducedVals.emplace_back().assign(in_start: Ops.begin(), in_end: Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(Elt: I);
  }
26687
  /// Checks whether the reduction described by the already-initialized root
  /// and operand groups (set up by the two-argument constructor) is
  /// vectorizable, caching the resulting ordering in RK.
  /// \returns true if a supported (non-None) reduction ordering was found.
  bool matchReductionForOperands() {
    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    assert(ReductionRoot && "Reduction root is not set!");
    // Treat this as a two-element reduction only if every reduced-value
    // group holds exactly two operands.
    RK = isVectorizable(Kind: RdxKind, I: cast<Instruction>(Val&: ReductionRoot),
                        TwoElementReduction: all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> Ops) {
                          return Ops.size() == 2;
                        }));
    return RK != ReductionOrdering::None;
  }
26698
  /// Try to find a reduction tree.
  /// Walks the use-def chain down from \p Root, collecting reduction
  /// operations of the same kind into ReductionOps and the values they
  /// reduce into ReducedVals, grouped/sorted to maximize the chance of
  /// profitable vectorization. \returns true if a reduction tree rooted at
  /// \p Root was matched; RK holds the detected ordering.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, DominatorTree &DT,
                                 const DataLayout &DL,
                                 const TargetTransformInfo &TTI,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(V: Root);
    RK = isVectorizable(Kind: RdxKind, I: Root);
    if (RK == ReductionOrdering::None)
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition must
    // have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
      if (!Sel->getCondition()->hasOneUse())
        RK = ReductionOrdering::Ordered;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
    // Worklist of (instruction, level) pairs; the level grows only when the
    // walk leaves the root's basic block (see the push below).
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(x&: Root, y: 0));
    SmallPtrSet<Value *, 8> Operands;
    SmallVector<std::pair<Instruction *, unsigned>> PossibleOrderedReductionOps;
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      // Visit operands in reverse so the worklist (LIFO) processes them in
      // source order.
      for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
                                    End: getNumberOfOperands(I: TreeN)))) {
        Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
        ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // Also, do not try to reduce const values, if the operation is not
        // foldable.
        bool IsReducedVal = !EdgeInst || Level > RecursionMaxDepth ||
                            getRdxKind(V: EdgeInst) != RdxKind ||
                            IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst);
        ReductionOrdering CurrentRK = IsReducedVal
                                          ? ReductionOrdering::None
                                          : isVectorizable(Kind: RdxKind, I: EdgeInst);
        // A matching op with extra uses cannot join an unordered reduction;
        // demote it to a reduced value but remember it as a candidate for a
        // later ordered-reduction retry.
        if (!IsReducedVal && CurrentRK == ReductionOrdering::Unordered &&
            RK == ReductionOrdering::Unordered &&
            !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst)) {
          IsReducedVal = true;
          CurrentRK = ReductionOrdering::None;
          if (PossibleReducedVals.size() < ReductionLimit &&
              !Operands.contains(Ptr: EdgeInst))
            PossibleOrderedReductionOps.emplace_back(Args&: EdgeInst, Args&: Level);
        }
        if (CurrentRK == ReductionOrdering::None ||
            Operands.contains(Ptr: EdgeInst) ||
            (R.isAnalyzedReductionRoot(I: EdgeInst) &&
             all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
          PossibleReducedVals.push_back(Elt: EdgeVal);
          if (EdgeInst && !isCmpSelMinMax(I: EdgeInst))
            Operands.insert_range(R: EdgeInst->operands());
          continue;
        }
        // One ordered edge makes the whole reduction ordered.
        if (CurrentRK == ReductionOrdering::Ordered)
          RK = ReductionOrdering::Ordered;
        ReductionOps.push_back(Elt: EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions - by
    // instruction op id and/or alternate op id, plus do extra analysis for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(I: Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    // Produces a sub-key for a load so that loads likely to form a
    // consecutive (or at least compatible) sequence land in the same group:
    // keyed by parent block, underlying pointer object and pointer identity.
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Key = hash_combine(args: hash_value(value: LI->getParent()->getNumber()), args: Key);
      Value *Ptr =
          getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
      if (!LoadKeyUsed.insert(V: Key).second) {
        auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
        if (LIt != LoadsMap.end()) {
          // Prefer grouping with a load at a known constant distance.
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
                                ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(ptr: RLI->getPointerOperand());
          }
          // Otherwise group with any pointer-compatible load.
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
                                      Ptr2: LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
              return SubKey;
            }
          }
          // Many loads already recorded - just reuse the last one's key.
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(ptr: LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr))
          .first->second.push_back(Elt: LI);
      return hash_value(ptr: LI->getPointerOperand());
    };

    SmallVector<Value *> ReducedValsCandidates;
    bool AdjustedToOrdered = false;
    SmallPtrSet<Instruction *, 16> Visited;
    // DFS over the reduction operations, accumulating reduced values.
    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      if (!Visited.insert(Ptr: TreeN).second)
        continue;
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(I: TreeN);
      ReducedValsCandidates.append(in_start: PossibleRedVals.begin(),
                                   in_end: PossibleRedVals.end());
      for (Instruction *I : reverse(C&: PossibleReductionOps))
        Worklist.emplace_back(Args&: I, Args: I->getParent() == BB ? 0 : Level + 1);
      // If not enough elements for unordered vectorization, check if there are
      // potential candidates for the ordered vectorization and try to add them
      // to the worklist.
      if (Worklist.empty() && ReducedValsCandidates.size() < ReductionLimit &&
          !PossibleOrderedReductionOps.empty() &&
          RK == ReductionOrdering::Unordered) {
        RK = ReductionOrdering::Ordered;
        AdjustedToOrdered = true;
        // Re-classify: the previously demoted ops become reduction ops, so
        // drop them from the candidate values and re-walk them.
        SmallPtrSet<const Instruction *, 4> Ops;
        for (const auto &P : PossibleOrderedReductionOps)
          Ops.insert(Ptr: P.first);
        erase_if(C&: ReducedValsCandidates, P: [&](Value *V) {
          auto *I = dyn_cast<Instruction>(Val: V);
          return I && Ops.contains(Ptr: I);
        });
        Worklist.append(in_start: PossibleOrderedReductionOps.begin(),
                        in_end: PossibleOrderedReductionOps.end());
        PossibleOrderedReductionOps.clear();
      }
    }
    // Too many integer reduced values candidates for the ordered reductions
    // after adjustments - try to switch to unordered reductions instead.
    constexpr unsigned ReducedValsLimit = 1024;
    if (ReducedValsCandidates.size() > ReducedValsLimit && AdjustedToOrdered &&
        ReducedValsCandidates.front()->getType()->isIntOrIntVectorTy())
      return false;
    // Add reduction values. The values are sorted for better vectorization
    // results.
    for (Value *V : ReducedValsCandidates) {
      // Ordered reductions keep only instruction values.
      if (RK == ReductionOrdering::Ordered && !isa<Instruction>(Val: V))
        continue;
      size_t Key, Idx;
      std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
                                       /*AllowAlternate=*/false);
      // Count occurrences of each value within its (key, subkey) bucket.
      ++PossibleReducedVals[Key][Idx].try_emplace(Key: V, Args: 0).first->second;
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        // Sort by use count so repeated values cluster together.
        stable_sort(Range&: RedValsVect, C: llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
      }
      stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      bool First = true;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (First) {
          First = false;
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          // Merge a weak group into the previous one if both are loads off
          // the same underlying object; otherwise start a new group.
          auto *LI = dyn_cast<LoadInst>(Val: Data.front());
          auto *LastLI = dyn_cast<LoadInst>(Val: ReducedVals.back().front());
          if (!LI || !LastLI ||
              getUnderlyingObject(V: LI->getPointerOperand()) !=
                  getUnderlyingObject(V: LastLI->getPointerOperand()))
            ReducedVals.emplace_back();
        }
        ReducedVals.back().append(in_start: Data.rbegin(), in_end: Data.rend());
      }
    }
    // Post optimize reduced values to get better reduction sequences and sort
    // them by size.
    optimizeReducedVals(R, DT, DL, TTI, TLI);
    // Sort the reduced values by number of same/alternate opcode and/or pointer
    // operand.
    stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
26915
26916 /// Attempt to vectorize the tree found by matchAssociativeReduction.
26917 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
26918 const TargetLibraryInfo &TLI, AssumptionCache *AC,
26919 DominatorTree &DT) {
26920 constexpr unsigned RegMaxNumber = 4;
26921 const unsigned RedValsMaxNumber =
26922 (RK == ReductionOrdering::Ordered &&
26923 ReductionRoot->getType()->isIntOrIntVectorTy())
26924 ? 48
26925 : 128;
26926 // If there are a sufficient number of reduction values, reduce
26927 // to a nearby power-of-2. We can safely generate oversized
26928 // vectors and rely on the backend to split them to legal sizes.
26929 if (unsigned NumReducedVals = std::accumulate(
26930 first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
26931 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
26932 if (!isGoodForReduction(Data: Vals))
26933 return Num;
26934 return Num + Vals.size();
26935 });
26936 NumReducedVals < ReductionLimit &&
26937 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
26938 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
26939 })) {
26940 for (ReductionOpsType &RdxOps : ReductionOps)
26941 for (Value *RdxOp : RdxOps)
26942 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
26943 return nullptr;
26944 }
26945
26946 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
26947 TargetFolder(DL));
26948 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
26949
26950 // Track the reduced values in case if they are replaced by extractelement
26951 // because of the vectorization.
26952 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
26953 ReducedVals.front().size());
26954
26955 // The compare instruction of a min/max is the insertion point for new
26956 // instructions and may be replaced with a new compare instruction.
26957 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
26958 assert(isa<SelectInst>(RdxRootInst) &&
26959 "Expected min/max reduction to have select root instruction");
26960 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
26961 assert(isa<Instruction>(ScalarCond) &&
26962 "Expected min/max reduction to have compare condition");
26963 return cast<Instruction>(Val: ScalarCond);
26964 };
26965
26966 bool AnyBoolLogicOp = any_of(Range&: ReductionOps.back(), P: [](Value *V) {
26967 return isBoolLogicOp(I: cast<Instruction>(Val: V));
26968 });
26969 // Return new VectorizedTree, based on previous value.
26970 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
26971 if (VectorizedTree) {
26972 // Update the final value in the reduction.
26973 Builder.SetCurrentDebugLocation(
26974 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
26975 if (AnyBoolLogicOp) {
26976 auto It = ReducedValsToOps.find(Val: VectorizedTree);
26977 auto It1 = ReducedValsToOps.find(Val: Res);
26978 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
26979 isGuaranteedNotToBePoison(V: VectorizedTree, AC) ||
26980 (It != ReducedValsToOps.end() &&
26981 any_of(Range&: It->getSecond(), P: [&](Instruction *I) {
26982 return isBoolLogicOp(I) &&
26983 getRdxOperand(I, Index: 0) == VectorizedTree;
26984 }))) {
26985 ;
26986 } else if (isGuaranteedNotToBePoison(V: Res, AC) ||
26987 (It1 != ReducedValsToOps.end() &&
26988 any_of(Range&: It1->getSecond(), P: [&](Instruction *I) {
26989 return isBoolLogicOp(I) && getRdxOperand(I, Index: 0) == Res;
26990 }))) {
26991 std::swap(a&: VectorizedTree, b&: Res);
26992 } else {
26993 VectorizedTree = Builder.CreateFreeze(V: VectorizedTree);
26994 }
26995 }
26996
26997 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
26998 ReductionOps);
26999 }
27000 // Initialize the final value in the reduction.
27001 return Res;
27002 };
27003 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
27004 ReductionOps.front().size());
27005 for (ReductionOpsType &RdxOps : ReductionOps)
27006 for (Value *RdxOp : RdxOps) {
27007 if (!RdxOp)
27008 continue;
27009 IgnoreList.insert(V: RdxOp);
27010 }
27011 // Intersect the fast-math-flags from all reduction operations.
27012 FastMathFlags RdxFMF;
27013 RdxFMF.set();
27014 for (Value *U : IgnoreList)
27015 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
27016 RdxFMF &= FPMO->getFastMathFlags();
27017 // For ordered reductions here we need to generate extractelement
27018 // instructions, so clear IgnoreList.
27019 if (RK == ReductionOrdering::Ordered)
27020 IgnoreList.clear();
27021 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
27022
27023 // Need to track reduced vals, they may be changed during vectorization of
27024 // subvectors.
27025 for (ArrayRef<Value *> Candidates : ReducedVals)
27026 for (Value *V : Candidates)
27027 TrackedVals.try_emplace(Key: V, Args&: V);
27028
27029 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
27030 Value *V) -> unsigned & {
27031 auto *It = MV.find(Key: V);
27032 assert(It != MV.end() && "Unable to find given key.");
27033 return It->second;
27034 };
27035
27036 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
27037 // List of the values that were reduced in other trees as part of gather
27038 // nodes and thus requiring extract if fully vectorized in other trees.
27039 SmallPtrSet<Value *, 4> RequiredExtract;
27040 WeakTrackingVH VectorizedTree = nullptr;
27041 bool CheckForReusedReductionOps = false;
27042 // Try to vectorize elements based on their type.
27043 SmallVector<InstructionsState> States;
27044 SmallVector<SmallVector<Value *>> LocalReducedVals;
27045 // Try merge consecutive reduced values into a single vectorizable group and
27046 // check, if they can be vectorized as copyables.
27047 const bool TwoGroupsOnly = ReducedVals.size() == 2;
27048 const bool TwoGroupsOfSameSmallSize =
27049 TwoGroupsOnly &&
27050 ReducedVals.front().size() == ReducedVals.back().size() &&
27051 ReducedVals.front().size() < ReductionLimit;
27052 for (ArrayRef<Value *> RV : ReducedVals) {
27053 // Loads are not very compatible with undefs.
27054 if (isa<UndefValue>(Val: RV.front()) &&
27055 (States.empty() || !States.back() ||
27056 States.back().getOpcode() == Instruction::Load)) {
27057 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
27058 States.push_back(Elt: InstructionsState::invalid());
27059 continue;
27060 }
27061 if (!LocalReducedVals.empty() &&
27062 isa<UndefValue>(Val: LocalReducedVals.back().front()) &&
27063 isa<LoadInst>(Val: RV.front())) {
27064 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
27065 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
27066 continue;
27067 }
27068 // Do some copyables analysis only if more than 2 groups exist or they
27069 // are large enough.
27070 if (!TwoGroupsOfSameSmallSize) {
27071 SmallVector<Value *> Ops;
27072 if (!LocalReducedVals.empty())
27073 Ops = LocalReducedVals.back();
27074 Ops.append(in_start: RV.begin(), in_end: RV.end());
27075 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
27076 InstructionsState OpS = Analysis.buildInstructionsState(
27077 VL: Ops, R: V, /*WithProfitabilityCheck=*/true,
27078 /*SkipSameCodeCheck=*/true);
27079 if (OpS && OpS.areInstructionsWithCopyableElements()) {
27080 if (LocalReducedVals.empty()) {
27081 LocalReducedVals.push_back(Elt: Ops);
27082 States.push_back(Elt: OpS);
27083 continue;
27084 }
27085 LocalReducedVals.back().swap(RHS&: Ops);
27086 States.back() = OpS;
27087 continue;
27088 }
27089 // For safety, allow split vectorization only if 2 groups are available
27090 // overall.
27091 if (TwoGroupsOnly) {
27092 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL: Ops);
27093 OpS = InstructionsState(MainOp, AltOp);
27094 // Last chance to try to vectorize alternate node.
27095 SmallVector<Value *> Op1, Op2;
27096 BoUpSLP::OrdersType ReorderIndices;
27097 if (MainOp && AltOp &&
27098 V.canBuildSplitNode(VL: Ops, LocalState: OpS, Op1, Op2, ReorderIndices)) {
27099 if (LocalReducedVals.empty()) {
27100 LocalReducedVals.push_back(Elt: Ops);
27101 States.push_back(Elt: OpS);
27102 continue;
27103 }
27104 LocalReducedVals.back().swap(RHS&: Ops);
27105 States.back() = OpS;
27106 continue;
27107 }
27108 }
27109 }
27110 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
27111 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
27112 }
27113 ReducedVals.swap(RHS&: LocalReducedVals);
27114 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
27115 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
27116 InstructionsState S = States[I];
27117 SmallVector<Value *> Candidates;
27118 Candidates.reserve(N: 2 * OrigReducedVals.size());
27119 SmallVector<Value *> TrackedToOrig;
27120 for (Value *ReducedVal : OrigReducedVals) {
27121 Value *RdxVal = TrackedVals.at(Val: ReducedVal);
27122 // Check if the reduction value was not overriden by the extractelement
27123 // instruction because of the vectorization and exclude it, if it is not
27124 // compatible with other values.
27125 // Also check if the instruction was folded to constant/other value.
27126 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
27127 if (Inst && V.isDeleted(I: Inst))
27128 continue;
27129 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
27130 (!S || (!S.getMatchingMainOpOrAltOp(I: Inst) &&
27131 !S.isCopyableElement(V: Inst)))) ||
27132 (S && !Inst && !isa<PoisonValue>(Val: RdxVal) &&
27133 !S.isCopyableElement(V: RdxVal)))
27134 continue;
27135 Candidates.push_back(Elt: RdxVal);
27136 TrackedToOrig.push_back(Elt: ReducedVal);
27137 }
27138 bool ShuffledExtracts = false;
27139 // Try to handle shuffled extractelements.
27140 if (S && S.getOpcode() == Instruction::ExtractElement &&
27141 !S.isAltShuffle() && I + 1 < E) {
27142 SmallVector<Value *> CommonCandidates(Candidates);
27143 for (Value *RV : ReducedVals[I + 1]) {
27144 Value *RdxVal = TrackedVals.at(Val: RV);
27145 // Check if the reduction value was not overriden by the
27146 // extractelement instruction because of the vectorization and
27147 // exclude it, if it is not compatible with other values.
27148 auto *Inst = dyn_cast<ExtractElementInst>(Val: RdxVal);
27149 if (!Inst)
27150 continue;
27151 CommonCandidates.push_back(Elt: RdxVal);
27152 TrackedToOrig.push_back(Elt: RV);
27153 }
27154 SmallVector<int> Mask;
27155 if (isFixedVectorShuffle(VL: CommonCandidates, Mask, AC)) {
27156 ++I;
27157 Candidates.swap(RHS&: CommonCandidates);
27158 ShuffledExtracts = true;
27159 }
27160 }
27161
27162 // Emit code for constant values.
27163 if (Candidates.size() > 1 && allConstant(VL: Candidates)) {
27164 if (RK == ReductionOrdering::Ordered)
27165 continue;
27166 Value *Res = Candidates.front();
27167 Value *OrigV = TrackedToOrig.front();
27168 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
27169 for (const auto [Idx, VC] :
27170 enumerate(First: ArrayRef(Candidates).drop_front())) {
27171 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
27172 Value *OrigV = TrackedToOrig[Idx + 1];
27173 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
27174 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
27175 V.analyzedReductionRoot(I: ResI);
27176 }
27177 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
27178 continue;
27179 }
27180
27181 unsigned NumReducedVals = Candidates.size();
27182 if (NumReducedVals < ReductionLimit &&
27183 (NumReducedVals < 2 || !isSplat(VL: Candidates)))
27184 continue;
27185
27186 // Check if we support repeated scalar values processing (optimization of
27187 // original scalar identity operations on matched horizontal reductions).
27188 IsSupportedHorRdxIdentityOp =
27189 RK == ReductionOrdering::Unordered && RdxKind != RecurKind::Mul &&
27190 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
27191 // Gather same values.
27192 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
27193 if (IsSupportedHorRdxIdentityOp)
27194 for (const auto [Idx, V] : enumerate(First&: Candidates)) {
27195 Value *OrigV = TrackedToOrig[Idx];
27196 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
27197 }
27198 // Used to check if the reduced values used same number of times. In this
27199 // case the compiler may produce better code. E.g. if reduced values are
27200 // aabbccdd (8 x values), then the first node of the tree will have a node
27201 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
27202 // Plus, the final reduction will be performed on <8 x aabbccdd>.
27203 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
27204 // x abcd) * 2.
27205 // Currently it only handles add/fadd/xor. and/or/min/max do not require
27206 // this analysis, other operations may require an extra estimation of
27207 // the profitability.
27208 bool SameScaleFactor = false;
27209 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
27210 SameValuesCounter.size() != Candidates.size();
27211 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
27212 if (OptReusedScalars) {
27213 SameScaleFactor =
27214 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
27215 RdxKind == RecurKind::Xor) &&
27216 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
27217 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
27218 return P.second == SameValuesCounter.front().second;
27219 });
27220 Candidates.resize(N: SameValuesCounter.size());
27221 TrackedToOrig.resize(N: SameValuesCounter.size());
27222 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
27223 F: [&](const auto &P) { return TrackedVals.at(P.first); });
27224 transform(Range&: SameValuesCounter, d_first: TrackedToOrig.begin(),
27225 F: [](const auto &P) { return P.first; });
27226 NumReducedVals = Candidates.size();
27227 // Have a reduction of the same element.
27228 if (NumReducedVals == 1) {
27229 Value *OrigV = TrackedToOrig.front();
27230 unsigned Cnt = At(SameValuesCounter, OrigV);
27231 Value *RedVal =
27232 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
27233 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
27234 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
27235 ExternallyUsedValues.insert(V: OrigV);
27236 continue;
27237 }
27238 }
27239
27240 unsigned MaxVecRegSize = V.getMaxVecRegSize();
27241 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
27242 const unsigned MaxElts = std::clamp<unsigned>(
27243 val: llvm::bit_floor(Value: MaxVecRegSize / EltSize), lo: RedValsMaxNumber,
27244 hi: RegMaxNumber * RedValsMaxNumber);
27245
27246 unsigned ReduxWidth = NumReducedVals;
27247 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
27248 unsigned NumParts, NumRegs;
27249 Type *ScalarTy = Candidates.front()->getType();
27250 ReduxWidth =
27251 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: ReduxWidth);
27252 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
27253 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
27254 NumRegs =
27255 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
27256 while (NumParts > NumRegs) {
27257 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
27258 ReduxWidth = bit_floor(Value: ReduxWidth - 1);
27259 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
27260 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
27261 NumRegs =
27262 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
27263 }
27264 if (NumParts > NumRegs / 2)
27265 ReduxWidth = bit_floor(Value: ReduxWidth);
27266 return ReduxWidth;
27267 };
27268 if (!VectorizeNonPowerOf2 || !has_single_bit(Value: ReduxWidth + 1))
27269 ReduxWidth = GetVectorFactor(ReduxWidth);
27270 ReduxWidth = std::min(a: ReduxWidth, b: MaxElts);
27271
27272 unsigned Start = 0;
27273 unsigned Pos = Start;
27274 // Restarts vectorization attempt with lower vector factor.
27275 unsigned PrevReduxWidth = ReduxWidth;
27276 bool CheckForReusedReductionOpsLocal = false;
27277 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
27278 bool IsAnyRedOpGathered =
27279 !IgnoreVL &&
27280 (RK == ReductionOrdering::Ordered || V.isAnyGathered(Vals: IgnoreList));
27281 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
27282 // Check if any of the reduction ops are gathered. If so, worth
27283 // trying again with less number of reduction ops.
27284 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
27285 }
27286 ++Pos;
27287 if (Pos < NumReducedVals - ReduxWidth + 1)
27288 return IsAnyRedOpGathered;
27289 Pos = Start;
27290 --ReduxWidth;
27291 if (ReduxWidth > 1)
27292 ReduxWidth = GetVectorFactor(ReduxWidth);
27293 return IsAnyRedOpGathered;
27294 };
27295 bool AnyVectorized = false;
27296 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
27297 while (Pos < NumReducedVals - ReduxWidth + 1 &&
27298 ReduxWidth >= ReductionLimit) {
27299 // Dependency in tree of the reduction ops - drop this attempt, try
27300 // later.
27301 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
27302 Start == 0) {
27303 CheckForReusedReductionOps = true;
27304 break;
27305 }
27306 PrevReduxWidth = ReduxWidth;
27307 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
27308 // Been analyzed already - skip.
27309 if (IgnoredCandidates.contains(V: std::make_pair(x&: Pos, y&: ReduxWidth)) ||
27310 (!has_single_bit(Value: ReduxWidth) &&
27311 (IgnoredCandidates.contains(
27312 V: std::make_pair(x&: Pos, y: bit_floor(Value: ReduxWidth))) ||
27313 IgnoredCandidates.contains(
27314 V: std::make_pair(x: Pos + (ReduxWidth - bit_floor(Value: ReduxWidth)),
27315 y: bit_floor(Value: ReduxWidth))))) ||
27316 V.areAnalyzedReductionVals(VL)) {
27317 (void)AdjustReducedVals(/*IgnoreVL=*/true);
27318 continue;
27319 }
27320 // Early exit if any of the reduction values were deleted during
27321 // previous vectorization attempts.
27322 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
27323 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
27324 return RedValI && V.isDeleted(I: RedValI);
27325 }))
27326 break;
27327 if (RK == ReductionOrdering::Ordered)
27328 V.buildTree(Roots: VL);
27329 else
27330 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
27331 if (V.isTreeTinyAndNotFullyVectorizable(ForReduction: RK ==
27332 ReductionOrdering::Unordered)) {
27333 constexpr unsigned CandidatesLimit = 64;
27334 if (!AdjustReducedVals(RK == ReductionOrdering::Ordered &&
27335 Candidates.size() >= CandidatesLimit))
27336 V.analyzedReductionVals(VL);
27337 continue;
27338 }
27339 V.reorderTopToBottom();
27340 // No need to reorder the root node at all for reassociative reduction.
27341 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
27342 VL.front()->getType()->isIntOrIntVectorTy() ||
27343 ReductionLimit > 2 ||
27344 RK == ReductionOrdering::Ordered);
27345 // Keep extracted other reduction values, if they are used in the
27346 // vectorization trees.
27347 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
27348 ExternallyUsedValues);
27349 // The reduction root is used as the insertion point for new
27350 // instructions, so set it as externally used to prevent it from being
27351 // deleted.
27352 LocalExternallyUsedValues.insert(V: ReductionRoot);
27353 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
27354 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
27355 continue;
27356 for (Value *V : ReducedVals[Cnt])
27357 if (isa<Instruction>(Val: V))
27358 LocalExternallyUsedValues.insert(V: TrackedVals[V]);
27359 }
27360 if (!IsSupportedHorRdxIdentityOp) {
27361 // Number of uses of the candidates in the vector of values.
27362 assert(SameValuesCounter.empty() &&
27363 "Reused values counter map is not empty");
27364 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
27365 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
27366 continue;
27367 Value *OrigV = TrackedToOrig[Cnt];
27368 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
27369 }
27370 }
27371 V.transformNodes();
27372 V.computeMinimumValueSizes();
27373 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VectorizedVals: VL);
27374
27375 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
27376 // Gather externally used values.
27377 SmallPtrSet<Value *, 4> Visited;
27378 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
27379 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
27380 continue;
27381 Value *RdxVal = Candidates[Cnt];
27382 if (auto It = TrackedVals.find(Val: RdxVal); It != TrackedVals.end())
27383 RdxVal = It->second;
27384 if (!Visited.insert(Ptr: RdxVal).second)
27385 continue;
27386 // Check if the scalar was vectorized as part of the vectorization
27387 // tree but not the top node.
27388 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
27389 LocalExternallyUsedValues.insert(V: RdxVal);
27390 continue;
27391 }
27392 Value *OrigV = TrackedToOrig[Cnt];
27393 unsigned NumOps =
27394 VectorizedVals.lookup(Val: OrigV) + At(SameValuesCounter, OrigV);
27395 if (NumOps != ReducedValsToOps.at(Val: OrigV).size())
27396 LocalExternallyUsedValues.insert(V: RdxVal);
27397 }
27398 // Do not need the list of reused scalars in regular mode anymore.
27399 if (!IsSupportedHorRdxIdentityOp)
27400 SameValuesCounter.clear();
27401 for (Value *RdxVal : VL)
27402 if (RequiredExtract.contains(Ptr: RdxVal))
27403 LocalExternallyUsedValues.insert(V: RdxVal);
27404 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
27405
27406 // Estimate cost.
27407 InstructionCost ReductionCost;
27408 if (RK == ReductionOrdering::Ordered || V.isReducedBitcastRoot() ||
27409 V.isReducedCmpBitcastRoot())
27410 ReductionCost = 0;
27411 else
27412 ReductionCost =
27413 getReductionCost(TTI, ReducedVals: VL, SameValuesCounter, IsCmpSelMinMax,
27414 FMF: RdxFMF, R: V, DT, DL, TLI);
27415 // If the root is a select (min/max idiom), the insert point is the
27416 // compare condition of that select.
27417 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
27418 Instruction *InsertPt = RdxRootInst;
27419 if (IsCmpSelMinMax)
27420 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
27421 InstructionCost Cost =
27422 V.getTreeCost(TreeCost, VectorizedVals: VL, ReductionCost, RdxRoot: InsertPt);
27423 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
27424 << " for reduction\n");
27425 if (!Cost.isValid())
27426 break;
27427 if (Cost >= -SLPCostThreshold) {
27428 V.getORE()->emit(RemarkBuilder: [&]() {
27429 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
27430 ReducedValsToOps.at(Val: VL[0]).front())
27431 << "Vectorizing horizontal reduction is possible "
27432 << "but not beneficial with cost " << ore::NV("Cost", Cost)
27433 << " and threshold "
27434 << ore::NV("Threshold", -SLPCostThreshold);
27435 });
27436 if (!AdjustReducedVals()) {
27437 V.analyzedReductionVals(VL);
27438 unsigned Offset = Pos == Start ? Pos : Pos - 1;
27439 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
27440 // Add subvectors of VL to the list of the analyzed values.
27441 for (unsigned VF = getFloorFullVectorNumberOfElements(
27442 TTI: *TTI, Ty: VL.front()->getType(), Sz: ReduxWidth - 1);
27443 VF >= ReductionLimit;
27444 VF = getFloorFullVectorNumberOfElements(
27445 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
27446 if (has_single_bit(Value: VF) &&
27447 V.getCanonicalGraphSize() != V.getTreeSize())
27448 continue;
27449 for (unsigned Idx : seq<unsigned>(Size: ReduxWidth - VF))
27450 IgnoredCandidates.insert(V: std::make_pair(x: Offset + Idx, y&: VF));
27451 }
27452 }
27453 }
27454 continue;
27455 }
27456
27457 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
27458 << Cost << ". (HorRdx)\n");
27459 V.getORE()->emit(RemarkBuilder: [&]() {
27460 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
27461 ReducedValsToOps.at(Val: VL[0]).front())
27462 << "Vectorized horizontal reduction with cost "
27463 << ore::NV("Cost", Cost) << " and with tree size "
27464 << ore::NV("TreeSize", V.getTreeSize());
27465 });
27466
27467 Builder.setFastMathFlags(RdxFMF);
27468
27469 // Vectorize a tree.
27470 Value *VectorizedRoot = V.vectorizeTree(
27471 ExternallyUsedValues: LocalExternallyUsedValues, ReductionRoot: InsertPt, VectorValuesAndScales);
27472
27473 if (RK == ReductionOrdering::Ordered) {
27474 // No need to generate reduction here, emit extractelements instead in
27475 // the tree vectorizer.
27476 assert(VectorizedRoot && "Expected vectorized tree");
27477 // Count vectorized reduced values to exclude them from final
27478 // reduction.
27479 for (Value *RdxVal : VL)
27480 ++VectorizedVals.try_emplace(Key: RdxVal).first->getSecond();
27481 Pos += ReduxWidth;
27482 Start = Pos;
27483 ReduxWidth = NumReducedVals - Pos;
27484 if (ReduxWidth > 1)
27485 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
27486 AnyVectorized = true;
27487 VectorizedTree = ReductionRoot;
27488 continue;
27489 }
27490 Builder.SetInsertPoint(InsertPt);
27491
27492 // To prevent poison from leaking across what used to be sequential,
27493 // safe, scalar boolean logic operations, the reduction operand must be
27494 // frozen.
27495 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(V: VectorizedRoot, AC))
27496 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
27497
27498 // Emit code to correctly handle reused reduced values, if required.
27499 if (OptReusedScalars && !SameScaleFactor) {
27500 // Build TrackedToOrig aligned with the root node scalars order,
27501 // which may differ from Candidates order due to tree reordering.
27502 ArrayRef<Value *> RootVL = V.getRootNodeScalars();
27503 ArrayRef<Value *> CandSlice(Candidates.begin() + Pos, ReduxWidth);
27504 SmallVector<Value *> RootTrackedToOrig(RootVL.size());
27505 for (auto [Idx, V] : enumerate(First&: RootVL)) {
27506 auto *It = find(Range&: CandSlice, Val: V);
27507 assert(It != CandSlice.end() &&
27508 "Root scalar not found in candidates");
27509 RootTrackedToOrig[Idx] =
27510 TrackedToOrig[Pos + std::distance(first: CandSlice.begin(), last: It)];
27511 }
27512 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
27513 SameValuesCounter, TrackedToOrig: RootTrackedToOrig);
27514 }
27515
27516 Type *ScalarTy = VL.front()->getType();
27517 Type *VecTy = VectorizedRoot->getType();
27518 Type *RedScalarTy = VecTy->getScalarType();
27519 VectorValuesAndScales.emplace_back(
27520 Args&: VectorizedRoot,
27521 Args: OptReusedScalars && SameScaleFactor
27522 ? SameValuesCounter.front().second
27523 : 1,
27524 Args: RedScalarTy != ScalarTy->getScalarType()
27525 ? V.isSignedMinBitwidthRootNode()
27526 : true,
27527 Args: V.isReducedBitcastRoot() || V.isReducedCmpBitcastRoot());
27528
27529 // Count vectorized reduced values to exclude them from final reduction.
27530 for (const auto [Idx, RdxVal] : enumerate(First&: VL)) {
27531 Value *OrigV = TrackedToOrig[Pos + Idx];
27532 if (IsSupportedHorRdxIdentityOp) {
27533 VectorizedVals.try_emplace(Key: OrigV, Args&: At(SameValuesCounter, OrigV));
27534 continue;
27535 }
27536 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
27537 if (!V.isVectorized(V: RdxVal))
27538 RequiredExtract.insert(Ptr: RdxVal);
27539 }
27540 Pos += ReduxWidth;
27541 Start = Pos;
27542 ReduxWidth = NumReducedVals - Pos;
27543 if (ReduxWidth > 1)
27544 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
27545 AnyVectorized = true;
27546 }
27547 if (OptReusedScalars && !AnyVectorized) {
27548 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
27549 Value *RdxVal = TrackedVals.at(Val: P.first);
27550 Value *RedVal = emitScaleForReusedOps(VectorizedValue: RdxVal, Builder, Cnt: P.second);
27551 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
27552 VectorizedVals.try_emplace(Key: P.first, Args: P.second);
27553 }
27554 continue;
27555 }
27556 }
27557 // Early exit for the ordered reductions.
27558 // No need to do anything else here, so we can just exit.
27559 if (RK == ReductionOrdering::Ordered)
27560 return VectorizedTree;
27561
27562 if (!VectorValuesAndScales.empty())
27563 VectorizedTree = GetNewVectorizedTree(
27564 VectorizedTree,
27565 emitReduction(Builder, TTI: *TTI, DestTy: ReductionRoot->getType()));
27566
27567 if (!VectorizedTree) {
27568 if (!CheckForReusedReductionOps) {
27569 for (ReductionOpsType &RdxOps : ReductionOps)
27570 for (Value *RdxOp : RdxOps)
27571 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
27572 }
27573 return nullptr;
27574 }
27575
27576 // Reorder operands of bool logical op in the natural order to avoid
27577 // possible problem with poison propagation. If not possible to reorder
27578 // (both operands are originally RHS), emit an extra freeze instruction
27579 // for the LHS operand.
27580 // I.e., if we have original code like this:
27581 // RedOp1 = select i1 ?, i1 LHS, i1 false
27582 // RedOp2 = select i1 RHS, i1 ?, i1 false
27583
27584 // Then, we swap LHS/RHS to create a new op that matches the poison
27585 // semantics of the original code.
27586
27587 // If we have original code like this and both values could be poison:
27588 // RedOp1 = select i1 ?, i1 LHS, i1 false
27589 // RedOp2 = select i1 ?, i1 RHS, i1 false
27590
27591 // Then, we must freeze LHS in the new op.
27592 auto FixBoolLogicalOps =
27593 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
27594 Instruction *RedOp2, bool InitStep) {
27595 if (!AnyBoolLogicOp)
27596 return;
27597 if (isBoolLogicOp(I: RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
27598 getRdxOperand(I: RedOp1, Index: 0) == LHS ||
27599 isGuaranteedNotToBePoison(V: LHS, AC)))
27600 return;
27601 bool NeedFreeze = LHS != VectorizedTree;
27602 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
27603 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
27604 isGuaranteedNotToBePoison(V: RHS, AC))) {
27605 // If RedOp2 was used as a second operand - do not swap.
27606 if ((InitStep || RHS != VectorizedTree) &&
27607 getRdxOperand(I: RedOp2, Index: 0) == RHS &&
27608 ((isBoolLogicOp(I: RedOp1) &&
27609 getRdxOperand(I: RedOp1, Index: 1) == RedOp2) ||
27610 any_of(Range&: ReductionOps, P: [&](ArrayRef<Value *> Ops) {
27611 return any_of(Range&: Ops, P: [&](Value *Op) {
27612 auto *OpI = dyn_cast<Instruction>(Val: Op);
27613 return OpI && isBoolLogicOp(I: OpI) &&
27614 getRdxOperand(I: OpI, Index: 1) == RedOp2;
27615 });
27616 }))) {
27617 NeedFreeze = false;
27618 } else {
27619 std::swap(a&: LHS, b&: RHS);
27620 return;
27621 }
27622 }
27623 if (NeedFreeze)
27624 LHS = Builder.CreateFreeze(V: LHS);
27625 };
27626 // Finish the reduction.
27627 // Need to add extra arguments and not vectorized possible reduction values.
27628 // Try to avoid dependencies between the scalar remainders after reductions.
27629 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
27630 bool InitStep) {
27631 unsigned Sz = InstVals.size();
27632 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
27633 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
27634 Instruction *RedOp = InstVals[I + 1].first;
27635 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
27636 Value *RdxVal1 = InstVals[I].second;
27637 Value *StableRdxVal1 = RdxVal1;
27638 auto It1 = TrackedVals.find(Val: RdxVal1);
27639 if (It1 != TrackedVals.end())
27640 StableRdxVal1 = It1->second;
27641 Value *RdxVal2 = InstVals[I + 1].second;
27642 Value *StableRdxVal2 = RdxVal2;
27643 auto It2 = TrackedVals.find(Val: RdxVal2);
27644 if (It2 != TrackedVals.end())
27645 StableRdxVal2 = It2->second;
27646 // To prevent poison from leaking across what used to be sequential,
27647 // safe, scalar boolean logic operations, the reduction operand must be
27648 // frozen.
27649 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
27650 RedOp, InitStep);
27651 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
27652 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
27653 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
27654 }
27655 if (Sz % 2 == 1)
27656 ExtraReds[Sz / 2] = InstVals.back();
27657 return ExtraReds;
27658 };
27659 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
27660 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
27661 Args&: VectorizedTree);
27662 SmallPtrSet<Value *, 8> Visited;
27663 for (ArrayRef<Value *> Candidates : ReducedVals) {
27664 for (Value *RdxVal : Candidates) {
27665 if (!Visited.insert(Ptr: RdxVal).second)
27666 continue;
27667 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
27668 for (Instruction *RedOp :
27669 ArrayRef(ReducedValsToOps.at(Val: RdxVal)).drop_back(N: NumOps))
27670 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
27671 }
27672 }
27673 // Iterate through all not-vectorized reduction values/extra arguments.
27674 bool InitStep = true;
27675 while (ExtraReductions.size() > 1) {
27676 SmallVector<std::pair<Instruction *, Value *>> NewReds =
27677 FinalGen(ExtraReductions, InitStep);
27678 ExtraReductions.swap(RHS&: NewReds);
27679 InitStep = false;
27680 }
27681 VectorizedTree = ExtraReductions.front().second;
27682
27683 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
27684
27685 // The original scalar reduction is expected to have no remaining
27686 // uses outside the reduction tree itself. Assert that we got this
27687 // correct, replace internal uses with undef, and mark for eventual
27688 // deletion.
27689#ifndef NDEBUG
27690 SmallPtrSet<Value *, 4> IgnoreSet;
27691 for (ArrayRef<Value *> RdxOps : ReductionOps)
27692 IgnoreSet.insert_range(RdxOps);
27693#endif
27694 for (ArrayRef<Value *> RdxOps : ReductionOps) {
27695 for (Value *Ignore : RdxOps) {
27696 if (!Ignore)
27697 continue;
27698#ifndef NDEBUG
27699 for (auto *U : Ignore->users()) {
27700 assert(IgnoreSet.count(U) &&
27701 "All users must be either in the reduction ops list.");
27702 }
27703#endif
27704 if (!Ignore->use_empty()) {
27705 Value *P = PoisonValue::get(T: Ignore->getType());
27706 Ignore->replaceAllUsesWith(V: P);
27707 }
27708 }
27709 V.removeInstructionsAndOperands(DeadVals: RdxOps, VectorValuesAndScales);
27710 }
27711 return VectorizedTree;
27712 }
27713
27714private:
27715 /// Creates the reduction from the given \p Vec vector value with the given
27716 /// scale \p Scale and signedness \p IsSigned.
  Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                        Value *Vec, unsigned Scale, bool IsSigned, Type *DestTy,
                        bool ReducedInTree) {
    Value *Rdx;
    if (ReducedInTree) {
      // The reduction was already materialized as part of the vectorized
      // tree itself, so \p Vec already holds the reduced value.
      Rdx = Vec;
    } else if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DestTy)) {
      // Revectorization case (destination is itself a fixed vector): emit
      // one horizontal reduction per destination lane by gathering the
      // I-th element of every original value with a strided shuffle.
      unsigned DestTyNumElements = getNumElements(Ty: VecTy);
      unsigned VF = getNumElements(Ty: Vec->getType()) / DestTyNumElements;
      Rdx = PoisonValue::get(
          T: getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: DestTyNumElements));
      for (unsigned I : seq<unsigned>(Size: DestTyNumElements)) {
        // Do reduction for each lane.
        // e.g., do reduce add for
        // VL[0] = <4 x Ty> <a, b, c, d>
        // VL[1] = <4 x Ty> <e, f, g, h>
        // Lane[0] = <2 x Ty> <a, e>
        // Lane[1] = <2 x Ty> <b, f>
        // Lane[2] = <2 x Ty> <c, g>
        // Lane[3] = <2 x Ty> <d, h>
        // result[0] = reduce add Lane[0]
        // result[1] = reduce add Lane[1]
        // result[2] = reduce add Lane[2]
        // result[3] = reduce add Lane[3]
        SmallVector<int, 16> Mask = createStrideMask(Start: I, Stride: DestTyNumElements, VF);
        Value *Lane = Builder.CreateShuffleVector(V: Vec, Mask);
        Rdx = Builder.CreateInsertElement(
            Vec: Rdx, NewElt: emitReduction(VectorizedValue: Lane, Builder, TTI: &TTI, DestTy), Idx: I);
      }
    } else {
      // Common case: a single horizontal reduction to a scalar.
      Rdx = emitReduction(VectorizedValue: Vec, Builder, TTI: &TTI, DestTy);
    }
    // Min-bitwidth analysis may have produced a narrower reduced type;
    // cast back to the expected destination type with the given signedness.
    if (Rdx->getType() != DestTy)
      Rdx = Builder.CreateIntCast(V: Rdx, DestTy, isSigned: IsSigned);
    // Improved analysis for add/fadd/xor reductions with same scale
    // factor for all operands of reductions. We can emit scalar ops for
    // them instead.
    if (Scale > 1)
      Rdx = emitScaleForReusedOps(VectorizedValue: Rdx, Builder, Cnt: Scale);
    return Rdx;
  }
27758
27759 /// Calculate the cost of a reduction.
  InstructionCost getReductionCost(
      TargetTransformInfo *TTI, ArrayRef<Value *> ReducedVals,
      const SmallMapVector<Value *, unsigned, 16> SameValuesCounter,
      bool IsCmpSelMinMax, FastMathFlags FMF, const BoUpSLP &R,
      DominatorTree &DT, const DataLayout &DL, const TargetLibraryInfo &TLI) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at the compile time.
    bool AllConsts = allConstant(VL: ReducedVals);
    // Sums the cost of the scalar reduction ops that vectorization would
    // remove. GenCostFn returns the cost of one scalar reduction op.
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (!isa<Instruction>(Val: RdxVal))
          continue;
        if (Cnt == 1) {
          // Last element: only its extra (repeated) occurrences contribute,
          // hence SameValueCount - 1.
          unsigned SameValueCount = SameValuesCounter.lookup(Key: RdxVal);
          Cost += (SameValueCount ? SameValueCount - 1 : 0) * GenCostFn();
          break;
        }
        --Cnt;
        if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
          // Value with extra uses: account one reduction op per repeated
          // occurrence (at least one).
          unsigned SameValueCount = SameValuesCounter.lookup(Key: RdxVal);
          Cost += (SameValueCount ? SameValueCount : 1) * GenCostFn();
          continue;
        }
        // Otherwise cost the actual user instructions, which may be cheaper
        // than the generic op cost (e.g. when they fold into an FMA).
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(Val: U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              InstructionCost FMACost = canConvertToFMA(
                  VL: RdxOp, S: getSameOpcode(VL: RdxOp, TLI), DT, DL, TTI&: *TTI, TLI);
              if (FMACost.isValid()) {
                LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
                if (auto *I = dyn_cast<Instruction>(Val: RdxVal)) {
                  // Also, exclude scalar fmul cost.
                  InstructionCost FMulCost =
                      TTI->getInstructionCost(U: I, CostKind);
                  LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
                  FMACost -= FMulCost;
                }
                ScalarCost += FMACost;
                continue;
              }
            }
            ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
            continue;
          }
          // A user that is not a reduction op - cannot reason about the
          // scalar cost precisely; fall back to the generic op cost below.
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // Require reduction cost if:
    // 1. This type is not a full register type and no other vectors with the
    // same type in the storage (first vector with small type).
    // 2. The storage does not have any vector with full vector use (first
    // vector with full register use).
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
            // Revectorization: the "scalar" is itself a vector; cost one
            // strided shuffle + reduction per lane, plus the inserts that
            // rebuild the destination vector.
            assert(SLPReVec && "FixedVectorType is not expected.");
            unsigned ScalarTyNumElements = VecTy->getNumElements();
            for (unsigned I : seq<unsigned>(Size: ReducedVals.size())) {
              VectorCost += TTI->getShuffleCost(
                  Kind: TTI::SK_PermuteSingleSrc,
                  DstTy: FixedVectorType::get(ElementType: VecTy->getScalarType(),
                                      NumElts: ReducedVals.size()),
                  SrcTy: VectorTy,
                  Mask: createStrideMask(Start: I, Stride: ScalarTyNumElements, VF: ReducedVals.size()));
              VectorCost += TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VecTy,
                                                            FMF, CostKind);
            }
            VectorCost += TTI->getScalarizationOverhead(
                Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: ScalarTyNumElements), /*Insert*/ true,
                /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
          } else {
            // Use the (possibly narrowed) root node type; if it differs from
            // the reduced type, cost an extending reduction instead.
            Type *RedTy = VectorTy->getElementType();
            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
                u: std::make_pair(x&: RedTy, y: true));
            if (RType == RedTy) {
              VectorCost = TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy,
                                                           FMF, CostKind);
            } else {
              VectorCost = TTI->getExtendedReductionCost(
                  Opcode: RdxOpcode, IsUnsigned: !IsSigned, ResTy: RedTy,
                  Ty: getWidenedType(ScalarTy: RType, VF: ReduxWidth), FMF, CostKind);
            }
          }
        } else {
          // A reduction already exists in the storage; this vector only adds
          // a vector op (and possibly a cast) that feeds it.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              u: std::make_pair(x&: RedTy, y: true));
          VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
          InstructionCost FMACost = InstructionCost::getInvalid();
          if (RdxKind == RecurKind::FAdd) {
            // Check if the reduction operands can be converted to FMA.
            SmallVector<Value *> Ops;
            FastMathFlags FMF;
            FMF.set();
            for (Value *RdxVal : ReducedVals) {
              // All reduced values must be single-use for the FMA fold.
              if (!RdxVal->hasOneUse()) {
                Ops.clear();
                break;
              }
              if (auto *FPCI = dyn_cast<FPMathOperator>(Val: RdxVal))
                FMF &= FPCI->getFastMathFlags();
              Ops.push_back(Elt: RdxVal->user_back());
            }
            if (!Ops.empty()) {
              FMACost = canConvertToFMA(VL: Ops, S: getSameOpcode(VL: Ops, TLI), DT, DL,
                                        TTI&: *TTI, TLI);
              if (FMACost.isValid()) {
                // Calculate actual FMAD cost.
                IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                            {RVecTy, RVecTy, RVecTy}, FMF);
                FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);

                LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
                // Also, exclude vector fmul cost.
                InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                    Opcode: Instruction::FMul, Ty: RVecTy, CostKind);
                LLVM_DEBUG(dbgs()
                           << "Minus vector FMul cost: " << FMulCost << "\n");
                FMACost -= FMulCost;
              }
            }
          }
          if (FMACost.isValid())
            VectorCost += FMACost;
          else
            VectorCost +=
                TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: RVecTy, CostKind);
          if (RType != RedTy) {
            // Account for casting the narrowed vector back to the reduced
            // type (trunc or sign/zero extend, per signedness).
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
        } else {
          // Check if the previous reduction already exists and account it as
          // series of operations + single reduction.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              u: std::make_pair(x&: RedTy, y: true));
          VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            // Account for casting the narrowed vector back to the reduced
            // type (trunc or sign/zero extend, per signedness).
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    // Negative result means vectorization is profitable.
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
27973
27974 /// Splits the values, stored in VectorValuesAndScales, into registers/free
27975 /// sub-registers, combines them with the given reduction operation as a
27976 /// vector operation and then performs single (small enough) reduction.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates reduction and combines with the previous reduction.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned,
                              bool ReducedInTree) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy,
                                  ReducedInTree);
      // Chain the new scalar result onto the running reduction, if any.
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, LHS: ReducedSubTree, RHS: Rdx,
                                  Name: "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    // Single vector: reduce it directly, no vector-level combining needed.
    if (VectorValuesAndScales.size() == 1) {
      const auto &[Vec, Scale, IsSigned, ReducedInTree] =
          VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned, ReducedInTree);
      return ReducedSubTree;
    }
    // Scales Vec using given Cnt scale factor and then performs vector combine
    // with previous value of VecOp.
    Value *VecRes = nullptr;
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned,
                           bool ReducedInTree) {
      // Values already reduced inside the tree bypass the vector combine and
      // go straight into the scalar chain.
      if (ReducedInTree) {
        CreateSingleOp(Vec, Cnt, IsSigned, ReducedInTree);
        return;
      }
      Type *ScalarTy = Vec->getType()->getScalarType();
      // Scale Vec using given Cnt scale factor.
      if (Cnt > 1) {
        ElementCount EC = cast<VectorType>(Val: Vec->getType())->getElementCount();
        switch (RdxKind) {
        case RecurKind::Add: {
          if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
            // i1 add reduction is a ctpop: repeat the vector Cnt times via a
            // shuffle instead of multiplying.
            unsigned VF = getNumElements(Ty: Vec->getType());
            LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
                              << ". (HorRdx)\n");
            SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
            for (unsigned I : seq<unsigned>(Size: Cnt))
              std::iota(first: std::next(x: Mask.begin(), n: VF * I),
                        last: std::next(x: Mask.begin(), n: VF * (I + 1)), value: 0);
            ++NumVectorInstructions;
            Vec = Builder.CreateShuffleVector(V: Vec, Mask);
            break;
          }
          // res = mul vv, n
          if (ScalarTy != DestTy->getScalarType())
            Vec = Builder.CreateIntCast(
                V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
                isSigned: IsSigned);
          Value *Scale = ConstantVector::getSplat(
              EC, Elt: ConstantInt::get(Ty: DestTy->getScalarType(), V: Cnt));
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateMul(LHS: Vec, RHS: Scale);
          break;
        }
        case RecurKind::Xor: {
          // res = n % 2 ? 0 : vv
          LLVM_DEBUG(dbgs()
                     << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
          if (Cnt % 2 == 0)
            Vec = Constant::getNullValue(Ty: Vec->getType());
          break;
        }
        case RecurKind::FAdd: {
          // res = fmul v, n
          Value *Scale =
              ConstantVector::getSplat(EC, Elt: ConstantFP::get(Ty: ScalarTy, V: Cnt));
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateFMul(L: Vec, R: Scale);
          break;
        }
        case RecurKind::And:
        case RecurKind::Or:
        case RecurKind::SMax:
        case RecurKind::SMin:
        case RecurKind::UMax:
        case RecurKind::UMin:
        case RecurKind::FMax:
        case RecurKind::FMin:
        case RecurKind::FMaximum:
        case RecurKind::FMinimum:
          // Idempotent reductions: repetition does not change the result.
          // res = vv
          break;
        case RecurKind::Sub:
        case RecurKind::AddChainWithSubs:
        case RecurKind::Mul:
        case RecurKind::FMul:
        case RecurKind::FMulAdd:
        case RecurKind::AnyOf:
        case RecurKind::FindIV:
        case RecurKind::FindLast:
        case RecurKind::FMaxNum:
        case RecurKind::FMinNum:
        case RecurKind::FMaximumNum:
        case RecurKind::FMinimumNum:
        case RecurKind::None:
          llvm_unreachable("Unexpected reduction kind for repeated scalar.");
        }
      }
      // Combine Vec with the previous VecOp.
      if (!VecRes) {
        // First vector: remember it (and its signedness for later casts).
        VecRes = Vec;
        VecResSignedness = IsSigned;
      } else {
        ++NumVectorInstructions;
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
            VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
          // Handle ctpop.
          // For i1 add reductions simply concatenate the two vectors; the
          // final population count absorbs the combine.
          unsigned VecResVF = getNumElements(Ty: VecRes->getType());
          unsigned VecVF = getNumElements(Ty: Vec->getType());
          SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
          // Ensure that VecRes is always larger than Vec
          if (VecResVF < VecVF) {
            std::swap(a&: VecRes, b&: Vec);
            std::swap(a&: VecResVF, b&: VecVF);
          }
          if (VecResVF != VecVF) {
            // Widen the smaller vector with poison so the concat mask fits.
            SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
            std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
            Vec = Builder.CreateShuffleVector(V: Vec, Mask: ResizeMask);
          }
          VecRes = Builder.CreateShuffleVector(V1: VecRes, V2: Vec, Mask, Name: "rdx.op");
          return;
        }
        // Bring both sides to the destination scalar type before combining.
        if (VecRes->getType()->getScalarType() != DestTy->getScalarType()) {
          assert(getNumElements(VecRes->getType()) % getNumElements(DestTy) ==
                     0 &&
                 "Expected the number of elements in VecRes to be a multiple "
                 "of the number of elements in DestTy");
          VecRes = Builder.CreateIntCast(
              V: VecRes,
              DestTy: getWidenedType(ScalarTy: DestTy->getScalarType(),
                              VF: getNumElements(Ty: VecRes->getType())),
              isSigned: VecResSignedness);
        }
        if (ScalarTy != DestTy->getScalarType())
          Vec = Builder.CreateIntCast(
              V: Vec,
              DestTy: getWidenedType(ScalarTy: DestTy->getScalarType(),
                              VF: getNumElements(Ty: Vec->getType())),
              isSigned: IsSigned);
        unsigned VecResVF = getNumElements(Ty: VecRes->getType());
        unsigned VecVF = getNumElements(Ty: Vec->getType());
        // Ensure that VecRes is always larger than Vec
        if (VecResVF < VecVF) {
          std::swap(a&: VecRes, b&: Vec);
          std::swap(a&: VecResVF, b&: VecVF);
        }
        // extract + op + insert
        // When sizes differ, combine only the leading VecVF lanes of VecRes
        // with Vec and put the result back into VecRes.
        Value *Op = VecRes;
        if (VecResVF != VecVF)
          Op = createExtractVector(Builder, Vec: VecRes, SubVecVF: VecVF, /*Index=*/0);
        Op = createOp(Builder, RdxKind, LHS: Op, RHS: Vec, Name: "rdx.op", ReductionOps);
        if (VecResVF != VecVF)
          Op = createInsertVector(Builder, Vec: VecRes, V: Op, /*Index=*/0);
        VecRes = Op;
      }
    };
    // Combine all stored vectors into VecRes, then do one final reduction.
    for (auto [Vec, Scale, IsSigned, ReducedInTree] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned, ReducedInTree);
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false,
                   /*ReducedInTree=*/false);

    return ReducedSubTree;
  }
28151
  /// Emit a horizontal reduction of the vectorized value.
  ///
  /// \param VectorizedValue The vector value to reduce to a single scalar.
  /// \param Builder IR builder positioned at the desired insertion point.
  /// \param TTI Target info (not consulted in this helper's body).
  /// \param DestTy Scalar/vector type expected by the reduction's users; used
  /// to detect the i1-add special case below.
  /// \returns The emitted reduction value.
  Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
                       const TargetTransformInfo *TTI, Type *DestTy) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(RdxKind != RecurKind::FMulAdd &&
           "A call to the llvm.fmuladd intrinsic is not handled yet");

    auto *FTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
    // An add-reduction of an <n x i1> that is widened to a larger scalar type
    // is just a population count of the mask bits.
    if (FTy->getScalarType() == Builder.getInt1Ty() &&
        RdxKind == RecurKind::Add &&
        DestTy->getScalarType() != FTy->getScalarType()) {
      // Convert vector_reduce_add(ZExt(<n x i1>)) to
      // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
      Value *V = Builder.CreateBitCast(
          V: VectorizedValue, DestTy: Builder.getIntNTy(N: FTy->getNumElements()));
      ++NumVectorInstructions;
      return Builder.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V);
    }
    ++NumVectorInstructions;
    return createSimpleReduction(B&: Builder, Src: VectorizedValue, RdxKind);
  }
28173
  /// Emits optimized code for unique scalar value reused \p Cnt times.
  ///
  /// Instead of materializing Cnt-1 explicit reduction operations, exploit
  /// the algebra of the reduction kind: repeated integer adds become a
  /// multiply, repeated xors cancel pairwise, repeated fadds become an fmul,
  /// and idempotent operations (and/or/min/max) collapse to the value itself.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    // A single use needs no scaling at all.
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale =
          ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt,
                           /*IsSigned=*/false, /*ImplicitTrunc=*/true);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? vv : 0
      // An even number of repeats cancels out completely (x ^ x == 0).
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(Ty: VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(L: VectorizedValue, R: Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      // These operations are idempotent, so repeats change nothing.
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindIV:
    case RecurKind::FindLast:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      // These kinds are never matched as supported identity reductions.
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
28236
  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  ///
  /// \param VectorizedValue Vectorized value for the root-node scalars.
  /// \param SameValuesCounter Number of repeats for each original scalar.
  /// \param TrackedToOrig Per-lane original values, used as lookup keys into
  /// \p SameValuesCounter.
  /// \returns The rescaled vector value, ready for the final reduction.
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                ArrayRef<Value *> TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
    // If min-bitwidth analysis narrowed the element type, cast back to the
    // scalar type of the root node before applying the per-lane scaling.
    if (VTy->getElementType() != VL.front()->getType()) {
      VectorizedValue = Builder.CreateIntCast(
          V: VectorizedValue,
          DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
          isSigned: R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      // Each lane is multiplied by its repeat count.
      SmallVector<Constant *> Vals;
      for (const auto [Idx, V] : enumerate(First&: VL)) {
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig[Idx]);
        Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(V: Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if elements 4th and 6th elements have even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
      bool NeedShuffle = false;
      for (const auto [Idx, V] : enumerate(First&: VL)) {
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig[Idx]);
        if (Cnt % 2 == 0) {
          // Redirect this lane to the zero vector (second shuffle operand).
          Mask[Idx] = VL.size();
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
                                              : Mask) dbgs()
                                         << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            V1: VectorizedValue,
            V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (const auto [Idx, V] : enumerate(First&: VL)) {
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig[Idx]);
        Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
      }
      auto *Scale = ConstantVector::get(V: Vals);
      return Builder.CreateFMul(L: VectorizedValue, R: Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindIV:
    case RecurKind::FindLast:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      // These kinds are never matched as supported identity reductions.
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
28339};
28340} // end anonymous namespace
28341
28342/// Gets recurrence kind from the specified value.
28343static RecurKind getRdxKind(Value *V) {
28344 return HorizontalReduction::getRdxKind(V);
28345}
28346static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
28347 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
28348 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
28349
28350 unsigned AggregateSize = 1;
28351 auto *IV = cast<InsertValueInst>(Val: InsertInst);
28352 Type *CurrentType = IV->getType();
28353 do {
28354 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
28355 for (auto *Elt : ST->elements())
28356 if (Elt != ST->getElementType(N: 0)) // check homogeneity
28357 return std::nullopt;
28358 AggregateSize *= ST->getNumElements();
28359 CurrentType = ST->getElementType(N: 0);
28360 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
28361 AggregateSize *= AT->getNumElements();
28362 CurrentType = AT->getElementType();
28363 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
28364 AggregateSize *= VT->getNumElements();
28365 return AggregateSize;
28366 } else if (CurrentType->isSingleValueType()) {
28367 return AggregateSize;
28368 } else {
28369 return std::nullopt;
28370 }
28371 } while (true);
28372}
28373
/// Walks a chain of insertelement/insertvalue instructions from the bottom
/// up, recording each inserted scalar operand and the instruction that
/// inserted it at its flattened aggregate index.
///
/// \param LastInsertInst The last insert of the chain (the walk goes through
/// operand 0 towards the first insert).
/// \param BuildVectorOpds [out] Indexed by flattened element position; filled
/// with the inserted scalar operands.
/// \param InsertElts [out] Indexed the same way; filled with the insert
/// instructions themselves.
/// \param OperandOffset Flattened index offset of the current (possibly
/// nested) aggregate level.
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
    // Stop on non-constant indices or inserts already erased by vectorization.
    if (!OperandIndex || R.isDeleted(I: LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
      // Nested aggregate: recurse with the adjusted flattened offset.
      findBuildAggregateRec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex, R);

    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    // Continue up the chain through the aggregate operand, but only while it
    // stays a single-use insert (otherwise it is not a pure build sequence).
    LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
           LastInsertInst->hasOneUse());
}
28398
28399/// Recognize construction of vectors like
28400/// %ra = insertelement <4 x float> poison, float %s0, i32 0
28401/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
28402/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
28403/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
28404/// starting from the last insertelement or insertvalue instruction.
28405///
28406/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
28407/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
28408/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
28409///
28410/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
28411///
28412/// \return true if it matches.
28413static bool findBuildAggregate(Instruction *LastInsertInst,
28414 TargetTransformInfo *TTI,
28415 SmallVectorImpl<Value *> &BuildVectorOpds,
28416 SmallVectorImpl<Value *> &InsertElts,
28417 const BoUpSLP &R) {
28418
28419 assert((isa<InsertElementInst>(LastInsertInst) ||
28420 isa<InsertValueInst>(LastInsertInst)) &&
28421 "Expected insertelement or insertvalue instruction!");
28422
28423 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
28424 "Expected empty result vectors!");
28425
28426 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
28427 if (!AggregateSize)
28428 return false;
28429 BuildVectorOpds.resize(N: *AggregateSize);
28430 InsertElts.resize(N: *AggregateSize);
28431
28432 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0, R);
28433 llvm::erase(C&: BuildVectorOpds, V: nullptr);
28434 llvm::erase(C&: InsertElts, V: nullptr);
28435 if (BuildVectorOpds.size() >= 2)
28436 return true;
28437
28438 return false;
28439}
28440
28441/// Try and get a reduction instruction from a phi node.
28442///
28443/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
28444/// if they come from either \p ParentBB or a containing loop latch.
28445///
28446/// \returns A candidate reduction value if possible, or \code nullptr \endcode
28447/// if not possible.
28448static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
28449 BasicBlock *ParentBB, LoopInfo *LI) {
28450 // There are situations where the reduction value is not dominated by the
28451 // reduction phi. Vectorizing such cases has been reported to cause
28452 // miscompiles. See PR25787.
28453 auto DominatedReduxValue = [&](Value *R) {
28454 return isa<Instruction>(Val: R) &&
28455 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
28456 };
28457
28458 Instruction *Rdx = nullptr;
28459
28460 // Return the incoming value if it comes from the same BB as the phi node.
28461 if (P->getIncomingBlock(i: 0) == ParentBB) {
28462 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
28463 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
28464 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
28465 }
28466
28467 if (Rdx && DominatedReduxValue(Rdx))
28468 return Rdx;
28469
28470 // Otherwise, check whether we have a loop latch to look at.
28471 Loop *BBL = LI->getLoopFor(BB: ParentBB);
28472 if (!BBL)
28473 return nullptr;
28474 BasicBlock *BBLatch = BBL->getLoopLatch();
28475 if (!BBLatch)
28476 return nullptr;
28477
28478 // There is a loop latch, return the incoming value if it comes from
28479 // that. This reduction pattern occasionally turns up.
28480 if (P->getIncomingBlock(i: 0) == BBLatch) {
28481 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
28482 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
28483 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
28484 }
28485
28486 if (Rdx && DominatedReduxValue(Rdx))
28487 return Rdx;
28488
28489 return nullptr;
28490}
28491
28492static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
28493 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
28494 return true;
28495 if (match(V: I, P: m_FMaxNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28496 return true;
28497 if (match(V: I, P: m_FMinNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28498 return true;
28499 if (match(V: I, P: m_FMaximum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28500 return true;
28501 if (match(V: I, P: m_FMinimum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28502 return true;
28503 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28504 return true;
28505 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28506 return true;
28507 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28508 return true;
28509 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
28510 return true;
28511 return false;
28512}
28513
28514/// We could have an initial reduction that is not an add.
28515/// r *= v1 + v2 + v3 + v4
28516/// In such a case start looking for a tree rooted in the first '+'.
28517/// \Returns the new root if found, which may be nullptr if not an instruction.
28518static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
28519 Instruction *Root) {
28520 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
28521 isa<IntrinsicInst>(Root)) &&
28522 "Expected binop, select, or intrinsic for reduction matching");
28523 Value *LHS =
28524 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
28525 Value *RHS =
28526 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
28527 if (LHS == Phi)
28528 return dyn_cast<Instruction>(Val: RHS);
28529 if (RHS == Phi)
28530 return dyn_cast<Instruction>(Val: LHS);
28531 return nullptr;
28532}
28533
28534/// \p Returns the first operand of \p I that does not match \p Phi. If
28535/// operand is not an instruction it returns nullptr.
28536static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
28537 Value *Op0 = nullptr;
28538 Value *Op1 = nullptr;
28539 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
28540 return nullptr;
28541 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
28542}
28543
28544/// \Returns true if \p I is a candidate instruction for reduction vectorization.
28545static bool isReductionCandidate(Instruction *I) {
28546 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
28547 Value *B0 = nullptr, *B1 = nullptr;
28548 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
28549 return IsBinop || IsSelect;
28550}
28551
/// Attempts to match and vectorize horizontal reductions rooted at \p Root
/// (and transitively at its operands) within \p BB. Instructions that could
/// not be reduced are appended to \p PostponedInsts for a later regular
/// vectorization attempt. \returns true if any IR change was made.
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  // With a phi-fed binary-operator root we may also try the root's operands
  // as fresh reduction seeds (see tryGetSecondaryReductionRoot).
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);

  if (Root->getParent() != BB || isa<PHINode>(Val: Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
        HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
  // If a horizintal reduction was not matched or vectorized we collect
  // instructions for possible later attempts for vectorization.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(args: SelectRoot(), args: 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  // Matches Inst as an associative reduction and emits the vectorized
  // reduction; returns the produced value, or nullptr on failure.
  auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(I: Inst))
      return nullptr;
    if (!isReductionCandidate(I: Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DT&: *DT, DL: *DL, TTI: *TTI, TLI: *TLI))
      return nullptr;
    return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI, AC, DT&: *DT);
  };
  // Records FutureSeed for a later vectorization attempt; returns false only
  // when the Root's non-phi operand could not be used as a seed.
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(I: Root, Phi: P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
      PostponedInsts.push_back(Elt: FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(args&: Inst, args&: Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while stack was populated before that happened.
    if (R.isDeleted(I: Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(Val: VectorizedV); I && I != Inst) {
        // Try to find another reduction.
        Stack.emplace(args&: I, args&: Level);
        continue;
      }
      if (R.isDeleted(I: Inst))
        continue;
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands.
    // Continue analysis for the instruction from the same basic block only to
    // save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Ptr: Op).second)
          if (auto *I = dyn_cast<Instruction>(Val: Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(args&: I, args&: Level);
  }
  return Res;
}
28651
/// Tries to vectorize the two-operand expression rooted at binary/compare
/// instruction \p I, either as a small horizontal reduction or as a
/// two-element list. \returns true if any IR change was made.
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  // Only scalar binops/compares are interesting roots here.
  if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
    return false;
  // Skip potential FMA candidates.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA(VL: I, S: getSameOpcode(VL: I, TLI: *TLI), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
  auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(I: Op0) || R.isDeleted(I: Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Args&: Op0, Args&: Op1);

  auto *A = dyn_cast<BinaryOperator>(Val: Op0);
  auto *B = dyn_cast<BinaryOperator>(Val: Op1);
  // Try to skip B: pair A with B's operands instead, if B is a single-use
  // binop in the same block.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
    auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
    if (B0 && B0->getParent() == P && !R.isDeleted(I: B0))
      Candidates.emplace_back(Args&: A, Args&: B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(I: B1))
      Candidates.emplace_back(Args&: A, Args&: B1);
  }
  // Try to skip A, symmetrically.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
    auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
    if (A0 && A0->getParent() == P && !R.isDeleted(I: A0))
      Candidates.emplace_back(Args&: A0, Args&: B);
    if (A1 && A1->getParent() == P && !R.isDeleted(I: A1))
      Candidates.emplace_back(Args&: A1, Args&: B);
  }

  // Matches Inst as a small reduction over Ops and emits it only when the
  // vector reduction is strictly cheaper than the scalar form.
  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(I: Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of operations.
    VectorType *VecTy = getWidenedType(ScalarTy: Ty, VF: Ops.size());
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    // Scalar side: the extracts needed to feed scalar code plus Inst itself.
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: getNumElements(Ty: VecTy)), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(U: Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(V: Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Opcode: Inst->getOpcode(), Ty: VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      return false;
    }
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(V&: R, DL: *DL, TTI: &TTI, TLI: *TLI, AC, DT&: *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList(VL: {Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates).first;
  if (!BestCandidate)
    return false;
  // Only the original (Op0, Op1) pair (index 0) can form a reduction with I.
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList(VL: {Candidates[*BestCandidate].first,
                           Candidates[*BestCandidate].second},
                            R);
}
28755
28756bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
28757 BasicBlock *BB, BoUpSLP &R) {
28758 SmallVector<WeakTrackingVH> PostponedInsts;
28759 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
28760 Res |= tryToVectorize(Insts: PostponedInsts, R);
28761 return Res;
28762}
28763
28764bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
28765 BoUpSLP &R) {
28766 bool Res = false;
28767 for (Value *V : Insts)
28768 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
28769 Res |= tryToVectorize(I: Inst, R);
28770 return Res;
28771}
28772
/// Tries to vectorize the scalar build sequence feeding the aggregate
/// constructed by insertvalue \p IVI. \p MaxVFOnly restricts the attempt to
/// full-register-width lists; two-element aggregates are then skipped so a
/// reduction attempt can run first. \returns true if anything was vectorized.
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  // The aggregate must be homogeneous enough to map onto a vector register.
  if (!R.canMapToVector(T: IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly);
}
28796
/// Tries to vectorize the scalar build sequence feeding the vector built by
/// insertelement \p IEI. Build sequences whose operands are all
/// extractelements/undef forming a plain shuffle are rejected (they are
/// handled as shuffles elsewhere). \p MaxVFOnly as in
/// vectorizeInsertValueInst. \returns true if anything was vectorized.
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R) ||
      (all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(VL: BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  // Note: the insert instructions (not the operands) are the seeds here.
  return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly);
}
28819
/// Groups the instructions in \p Incoming into runs of compatible values and
/// attempts to vectorize each run.
///
/// \param Comparator Strict weak ordering used to sort \p Incoming so
/// compatible values become adjacent.
/// \param AreCompatible Returns true if the candidate may join the current
/// run (first argument is the run collected so far).
/// \param TryToVectorizeHelper Performs the actual vectorization attempt; the
/// bool argument is the MaxVFOnly flag.
/// \param MaxVFOnly If set, the first attempt per run is restricted to the
/// maximal register width.
/// \returns true if any run was vectorized.
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    // Extend the run while values stay compatible; deleted/non-instruction
    // entries are stepped over but do not terminate the run.
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with the
    // size of maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may result in the better vectorization results rather than
    // if we try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      // Keep the survivors of the run as candidates for the final attempt.
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
      }
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
      };
      // Short runs are accumulated (while types still match) so several of
      // them can be retried together below.
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL) {
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
        }
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
28931
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible
/// corresponding operands. If IsCompatibility is false, function implements
/// strict weak ordering relation between two cmp instructions, returning true
/// if the first instruction is "less" than the second, i.e. its predicate is
/// less than the predicate of the second or the operands IDs are less than the
/// operands IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(Val: V);
  auto *CI2 = cast<CmpInst>(Val: V2);
  // Key 1: operand type ID, then scalar width. Differently-typed cmps are
  // never compatible.
  if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
      CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
      CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
    return false;
  // Key 2: the predicate, canonicalized so that a cmp and its operand-swapped
  // form compare equal (keyed by the smaller of the two predicates).
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
  CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands. If a cmp was canonicalized by swapping its predicate,
  // walk its operands in reversed order so corresponding operands line up.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
      if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
          // Blocks without a dominator-tree node (unreachable) sort after
          // reachable ones.
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
29013
29014template <typename ItT>
29015bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
29016 BasicBlock *BB, BoUpSLP &R) {
29017 bool Changed = false;
29018 // Try to find reductions first.
29019 for (CmpInst *I : CmpInsts) {
29020 if (R.isDeleted(I))
29021 continue;
29022 for (Value *Op : I->operands())
29023 if (auto *RootOp = dyn_cast<Instruction>(Val: Op)) {
29024 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R);
29025 if (R.isDeleted(I))
29026 break;
29027 }
29028 }
29029 // Try to vectorize operands as vector bundles.
29030 for (CmpInst *I : CmpInsts) {
29031 if (R.isDeleted(I))
29032 continue;
29033 Changed |= tryToVectorize(I, R);
29034 }
29035 // Try to vectorize list of compares.
29036 // Sort by type, compare predicate, etc.
29037 auto CompareSorter = [&](Value *V, Value *V2) {
29038 if (V == V2)
29039 return false;
29040 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
29041 };
29042
29043 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
29044 if (VL.empty() || VL.back() == V1)
29045 return true;
29046 return compareCmp<true>(V: V1, V2: VL.back(), TLI&: *TLI, DT: *DT);
29047 };
29048
29049 SmallVector<Value *> Vals;
29050 for (Instruction *V : CmpInsts)
29051 if (!R.isDeleted(I: V) && isValidElementType(Ty: getValueType(V)))
29052 Vals.push_back(Elt: V);
29053 if (Vals.size() <= 1)
29054 return Changed;
29055 Changed |= tryToVectorizeSequence<Value>(
29056 Vals, CompareSorter, AreCompatibleCompares,
29057 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
29058 // Exclude possible reductions from other blocks.
29059 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
29060 return any_of(V->users(), [V](User *U) {
29061 auto *Select = dyn_cast<SelectInst>(Val: U);
29062 return Select &&
29063 Select->getParent() != cast<Instruction>(Val: V)->getParent();
29064 });
29065 });
29066 if (ArePossiblyReducedInOtherBlock)
29067 return false;
29068 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
29069 },
29070 /*MaxVFOnly=*/true, R);
29071 return Changed;
29072}
29073
29074bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
29075 BasicBlock *BB, BoUpSLP &R) {
29076 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
29077 "This function only accepts Insert instructions");
29078 bool OpsChanged = false;
29079 SmallVector<WeakTrackingVH> PostponedInsts;
29080 for (auto *I : reverse(C&: Instructions)) {
29081 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
29082 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
29083 continue;
29084 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
29085 OpsChanged |=
29086 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
29087 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
29088 OpsChanged |=
29089 vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
29090 }
29091 // pass2 - try to vectorize reductions only
29092 if (R.isDeleted(I))
29093 continue;
29094 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, PostponedInsts);
29095 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
29096 continue;
29097 // pass3 - try to match and vectorize a buildvector sequence.
29098 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
29099 OpsChanged |=
29100 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
29101 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
29102 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
29103 /*MaxVFOnly=*/false);
29104 }
29105 }
29106 // Now try to vectorize postponed instructions.
29107 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
29108
29109 Instructions.clear();
29110 return OpsChanged;
29111}
29112
// Vectorize trees rooted in this basic block: first PHI nodes (grouped by
// compatible use chains), then reductions, buildvector sequences and compare
// chains discovered while scanning the block's instructions.
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This allows us to better identify the chains that can be
  // vectorized.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  // Strict weak ordering for PHIs: sorts by type, then element-wise by the
  // collected non-phi operand chains, so that compatible PHIs end up
  // adjacent.
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    if (V1 == V2)
      return false;
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers, we don't care about other type.
    if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
      return true;
    if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
      return false;
    if (V1->getType()->getScalarSizeInBits() <
        V2->getType()->getScalarSizeInBits())
      return true;
    if (V1->getType()->getScalarSizeInBits() >
        V2->getType()->getScalarSizeInBits())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() < Opcodes2.size())
      return true;
    if (Opcodes1.size() > Opcodes2.size())
      return false;
    // Tie-break element-wise on the operand chains: instructions first
    // (ordered by dominance, then opcode), then non-undef constants, then
    // other values, with undefs last.
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      {
        // Instructions come first.
        auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
        auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
        if (I1 && I2) {
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
          // Unreachable blocks (no dominator-tree node) sort last.
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
          InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
          if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
            const auto *E1 = dyn_cast<ExtractElementInst>(Val: I1);
            const auto *E2 = dyn_cast<ExtractElementInst>(Val: I2);
            if (!E1 || !E2)
              continue;

            // Sort on ExtractElementInsts primarily by vector operands. Prefer
            // program order of the vector operands.
            const auto *V1 = dyn_cast<Instruction>(Val: E1->getVectorOperand());
            const auto *V2 = dyn_cast<Instruction>(Val: E2->getVectorOperand());
            if (V1 != V2) {
              if (V1 && !V2)
                return true;
              if (!V1 && V2)
                return false;
              DomTreeNodeBase<BasicBlock> *NodeI1 =
                  DT->getNode(BB: V1->getParent());
              DomTreeNodeBase<BasicBlock> *NodeI2 =
                  DT->getNode(BB: V2->getParent());
              if (!NodeI1)
                return NodeI2 != nullptr;
              if (!NodeI2)
                return false;
              assert((NodeI1 == NodeI2) ==
                         (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                     "Different nodes should have different DFS numbers");
              if (NodeI1 != NodeI2)
                return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
              return V1->comesBefore(Other: V2);
            }
            // If we have the same vector operand, try to sort by constant
            // index.
            std::optional<unsigned> Id1 = getExtractIndex(E: E1);
            std::optional<unsigned> Id2 = getExtractIndex(E: E2);
            // Bring constants to the top
            if (Id1 && !Id2)
              return true;
            if (!Id1 && Id2)
              return false;
            // First elements come first.
            if (Id1 && Id2)
              return *Id1 < *Id2;

            continue;
          }
          if (I1->getOpcode() == I2->getOpcode())
            continue;
          return I1->getOpcode() < I2->getOpcode();
        }
        if (I1)
          return true;
        if (I2)
          return false;
      }
      {
        // Non-undef constants come next.
        bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
        bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
        if (C1 && C2)
          continue;
        if (C1)
          return true;
        if (C2)
          return false;
      }
      bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
      bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
      {
        // Non-constant non-instructions come next.
        if (!U1 && !U2) {
          auto ValID1 = Opcodes1[I]->getValueID();
          auto ValID2 = Opcodes2[I]->getValueID();
          if (ValID1 == ValID2)
            continue;
          if (ValID1 < ValID2)
            return true;
          if (ValID1 > ValID2)
            return false;
        }
        if (!U1)
          return true;
        if (!U2)
          return false;
      }
      // Undefs come last.
      assert(U1 && U2 && "The only thing left should be undef & undef.");
    }
    return false;
  };
  // Returns true if V1 may join the group ending in VL.back(): same type and
  // element-wise compatible operand chains (undefs match anything).
  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
                                                     Value *V1) {
    if (VL.empty() || V1 == VL.back())
      return true;
    Value *V2 = VL.back();
    if (V1->getType() != V2->getType())
      return false;
    ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
    ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
    if (Opcodes1.size() != Opcodes2.size())
      return false;
    for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
      // Undefs are compatible with any other value.
      if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
          if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          if (getSameOpcode(VL: {I1, I2}, TLI: *TLI))
            continue;
          return false;
        }
      if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
        continue;
      if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
        return false;
    }
    return true;
  };

  bool HaveVectorizedPhiNodes = false;
  // Iterate while progress is made: vectorizing one PHI group may expose new
  // vectorizable groups.
  do {
    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      auto *P = dyn_cast<PHINode>(Val: &I);
      // PHIs are grouped at the block start, so stop at the first non-PHI.
      if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
        break;

      // No need to analyze deleted, vectorized and non-vectorizable
      // instructions.
      if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
          isValidElementType(Ty: P->getType()))
        Incoming.push_back(Elt: P);
    }

    if (Incoming.size() <= 1)
      break;

    // Find the corresponding non-phi nodes for better matching when trying to
    // build the tree.
    for (Value *V : Incoming) {
      SmallVectorImpl<Value *> &Opcodes =
          PHIToOpcodes.try_emplace(Key: V).first->getSecond();
      if (!Opcodes.empty())
        continue;
      // Walk through nested PHIs, recording the non-phi incoming values.
      SmallVector<Value *, 4> Nodes(1, V);
      SmallPtrSet<Value *, 4> Visited;
      while (!Nodes.empty()) {
        auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
        if (!Visited.insert(Ptr: PHI).second)
          continue;
        for (Value *V : PHI->incoming_values()) {
          if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
            Nodes.push_back(Elt: PHI1);
            continue;
          }
          Opcodes.emplace_back(Args&: V);
        }
      }
    }

    HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
        Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
        TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
          return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
        },
        /*MaxVFOnly=*/true, R);
    Changed |= HaveVectorizedPhiNodes;
    // Drop the cached operand chains if any recorded PHI was deleted; the
    // cache may now reference stale values.
    if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
          auto *PHI = dyn_cast<PHINode>(P.first);
          return !PHI || R.isDeleted(I: PHI);
        }))
      PHIToOpcodes.clear();
    VisitedInstrs.insert_range(R&: Incoming);
  } while (HaveVectorizedPhiNodes);

  VisitedInstrs.clear();

  InstSetVector PostProcessInserts;
  SmallSetVector<CmpInst *, 8> PostProcessCmps;
  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
  // also vectorizes `PostProcessCmps`.
  auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
    bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
    if (VectorizeCmps) {
      Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
      PostProcessCmps.clear();
    }
    PostProcessInserts.clear();
    return Changed;
  };
  // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
  auto IsInPostProcessInstrs = [&](Instruction *I) {
    if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
      return PostProcessCmps.contains(key: Cmp);
    return isa<InsertElementInst, InsertValueInst>(Val: I) &&
           PostProcessInserts.contains(key: I);
  };
  // Returns true if `I` is an instruction without users, like terminator, or
  // function call with ignored return value, store. Ignore unused instructions
  // (basing on instruction type, except for CallInst and InvokeInst).
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I));
  };
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable type. The num of elements is unknown at
    // compile-time for scalable type.
    if (isa<ScalableVectorType>(Val: It->getType()))
      continue;

    // Skip instructions marked for the deletion.
    if (R.isDeleted(I: &*It))
      continue;
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(Ptr: &*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() == 2) {
        // Try to match and vectorize a horizontal reduction.
        Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
        if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
          // Restart the scan: vectorization may have invalidated iterators.
          Changed = true;
          It = BB->begin();
          E = BB->end();
          continue;
        }
      }
      // Try to vectorize the incoming values of the PHI, to catch reductions
      // that feed into PHIs.
      for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
        // Skip if the incoming block is the current BB for now. Also, bypass
        // unreachable IR for efficiency and to avoid crashing.
        // TODO: Collect the skipped incoming values and try to vectorize them
        // after processing BB.
        if (BB == P->getIncomingBlock(i: I) ||
            !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
          continue;

        // Postponed instructions should not be vectorized here, delay their
        // vectorization.
        if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
            PI && !IsInPostProcessInstrs(PI)) {
          bool Res =
              vectorizeRootInstruction(P: nullptr, Root: PI, BB: P->getIncomingBlock(i: I), R);
          Changed |= Res;
          // If the PHI itself was consumed, restart the scan of the block.
          if (Res && R.isDeleted(I: P)) {
            It = BB->begin();
            E = BB->end();
            break;
          }
        }
      }
      continue;
    }

    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(Val&: It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand()));
        // Try to vectorize chain in store, if this is the only store to the
        // address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
      if (TryToVectorizeRoot) {
        for (auto *V : It->operand_values()) {
          // Postponed instructions should not be vectorized here, delay their
          // vectorization.
          if (auto *VI = dyn_cast<Instruction>(Val: V);
              VI && !IsInPostProcessInstrs(VI))
            // Try to match and vectorize a horizontal reduction.
            OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R);
        }
      }
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
      OpsChanged |=
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid value.
        Changed = true;
        It = BB->begin();
        E = BB->end();
        continue;
      }
    }

    // Record inserts and compares for the deferred post-process passes above.
    if (isa<InsertElementInst, InsertValueInst>(Val: It))
      PostProcessInserts.insert(X: &*It);
    else if (isa<CmpInst>(Val: It))
      PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
  }

  return Changed;
}
29482
29483bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
29484 auto Changed = false;
29485 for (auto &Entry : GEPs) {
29486 // If the getelementptr list has fewer than two elements, there's nothing
29487 // to do.
29488 if (Entry.second.size() < 2)
29489 continue;
29490
29491 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
29492 << Entry.second.size() << ".\n");
29493
29494 // Process the GEP list in chunks suitable for the target's supported
29495 // vector size. If a vector register can't hold 1 element, we are done. We
29496 // are trying to vectorize the index computations, so the maximum number of
29497 // elements is based on the size of the index expression, rather than the
29498 // size of the GEP itself (the target's pointer size).
29499 auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
29500 return !R.isDeleted(I: GEP);
29501 });
29502 if (It == Entry.second.end())
29503 continue;
29504 unsigned MaxVecRegSize = R.getMaxVecRegSize();
29505 unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
29506 if (MaxVecRegSize < EltSize)
29507 continue;
29508
29509 unsigned MaxElts = MaxVecRegSize / EltSize;
29510 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
29511 auto Len = std::min<unsigned>(a: BE - BI, b: MaxElts);
29512 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
29513
29514 // Initialize a set a candidate getelementptrs. Note that we use a
29515 // SetVector here to preserve program order. If the index computations
29516 // are vectorizable and begin with loads, we want to minimize the chance
29517 // of having to reorder them later.
29518 SetVector<Value *> Candidates(llvm::from_range, GEPList);
29519
29520 // Some of the candidates may have already been vectorized after we
29521 // initially collected them or their index is optimized to constant value.
29522 // If so, they are marked as deleted, so remove them from the set of
29523 // candidates.
29524 Candidates.remove_if(P: [&R](Value *I) {
29525 return R.isDeleted(I: cast<Instruction>(Val: I)) ||
29526 isa<Constant>(Val: cast<GetElementPtrInst>(Val: I)->idx_begin()->get());
29527 });
29528
29529 // Remove from the set of candidates all pairs of getelementptrs with
29530 // constant differences. Such getelementptrs are likely not good
29531 // candidates for vectorization in a bottom-up phase since one can be
29532 // computed from the other. We also ensure all candidate getelementptr
29533 // indices are unique.
29534 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
29535 auto *GEPI = GEPList[I];
29536 if (!Candidates.count(key: GEPI))
29537 continue;
29538 const SCEV *SCEVI = SE->getSCEV(V: GEPList[I]);
29539 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
29540 auto *GEPJ = GEPList[J];
29541 if (!Candidates.count(key: GEPJ))
29542 continue;
29543 const SCEV *SCEVJ = SE->getSCEV(V: GEPList[J]);
29544 if (isa<SCEVConstant>(Val: SE->getMinusSCEV(LHS: SCEVI, RHS: SCEVJ))) {
29545 Candidates.remove(X: GEPI);
29546 Candidates.remove(X: GEPJ);
29547 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
29548 Candidates.remove(X: GEPJ);
29549 }
29550 }
29551 }
29552
29553 // We break out of the above computation as soon as we know there are
29554 // fewer than two candidates remaining.
29555 if (Candidates.size() < 2)
29556 continue;
29557
29558 // Add the single, non-constant index of each candidate to the bundle. We
29559 // ensured the indices met these constraints when we originally collected
29560 // the getelementptrs.
29561 SmallVector<Value *, 16> Bundle(Candidates.size());
29562 auto BundleIndex = 0u;
29563 for (auto *V : Candidates) {
29564 auto *GEP = cast<GetElementPtrInst>(Val: V);
29565 auto *GEPIdx = GEP->idx_begin()->get();
29566 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
29567 Bundle[BundleIndex++] = GEPIdx;
29568 }
29569
29570 // Try and vectorize the indices. We are currently only interested in
29571 // gather-like cases of the form:
29572 //
29573 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
29574 //
29575 // where the loads of "a", the loads of "b", and the subtractions can be
29576 // performed in parallel. It's likely that detecting this pattern in a
29577 // bottom-up phase will be simpler and less costly than building a
29578 // full-blown top-down phase beginning at the consecutive loads.
29579 Changed |= tryToVectorizeList(VL: Bundle, R);
29580 }
29581 }
29582 return Changed;
29583}
29584
// Attempt to vectorize the collected store groups, sorting each group so that
// stores with compatible value operands become adjacent.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    // Keys, in order: stored-value type ID, pointer-operand type ID, stored
    // scalar width, then the value operand (instructions ordered by dominance
    // and opcode; non-instructions by value ID).
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    auto *I1 = dyn_cast<Instruction>(Val: V->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand());
    if (I1 && I2) {
      DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
      DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
      assert(NodeI1 && "Should only process reachable instructions");
      assert(NodeI2 && "Should only process reachable instructions");
      assert((NodeI1 == NodeI2) ==
                 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeI1 != NodeI2)
        return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
      return I1->getOpcode() < I2->getOpcode();
    }
    // Instruction operands sort before non-instruction operands.
    if (I1 && !I2)
      return true;
    if (!I1 && I2)
      return false;
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  // Tracks whether all value operands in the group built so far share a
  // parent block; the lambda below is stateful across calls for one group.
  bool SameParent = true;
  auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    if (VL.empty()) {
      // Starting a new group - reset the shared-parent tracking.
      SameParent = true;
      return true;
    }
    StoreInst *V2 = VL.back();
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(Val: V1->getValueOperand()) ||
        isa<UndefValue>(Val: V2->getValueOperand()))
      return true;
    if (isa<Constant>(Val: V1->getValueOperand()) &&
        isa<Constant>(Val: V2->getValueOperand()))
      return true;
    // Check if the operands of the stores can be vectorized. They can be
    // vectorized, if they have compatible operands or have operands, which can
    // be vectorized as copyables.
    auto *I1 = dyn_cast<Instruction>(Val: V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand());
    if (I1 || I2) {
      // Accept only tail-following non-compatible values for now.
      // TODO: investigate if it is possible to vectorize incompatible values,
      // if the copyables are first in the list.
      if (I1 && !I2)
        return false;
      SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
      SmallVector<Value *> NewVL(VL.size() + 1);
      for (auto [SI, V] : zip(t&: VL, u&: NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(
          VL: NewVL, R, /*WithProfitabilityCheck=*/true,
          /*SkipSameCodeCheck=*/!SameParent);
      if (S)
        return true;
      if (!SameParent)
        return false;
    }
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Ty: Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stores to the same addresses several times, in this case need
    // to follow the stores order (reversed to meet the memory dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        Incoming&: ReversedStores, Comparator: StoreSorter, AreCompatible: AreCompatibleStores,
        TryToVectorizeHelper: [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Stores: Candidates, R, Visited&: Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
29707