1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/PriorityQueue.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/ADT/SetOperations.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/ADT/SmallBitVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallString.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
33#include "llvm/ADT/iterator_range.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/Analysis/AssumptionCache.h"
36#include "llvm/Analysis/CodeMetrics.h"
37#include "llvm/Analysis/ConstantFolding.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/Loads.h"
42#include "llvm/Analysis/LoopAccessAnalysis.h"
43#include "llvm/Analysis/LoopInfo.h"
44#include "llvm/Analysis/MemoryLocation.h"
45#include "llvm/Analysis/OptimizationRemarkEmitter.h"
46#include "llvm/Analysis/ScalarEvolution.h"
47#include "llvm/Analysis/ScalarEvolutionExpressions.h"
48#include "llvm/Analysis/TargetLibraryInfo.h"
49#include "llvm/Analysis/TargetTransformInfo.h"
50#include "llvm/Analysis/ValueTracking.h"
51#include "llvm/Analysis/VectorUtils.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/DerivedTypes.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
68#include "llvm/IR/PatternMatch.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
78#include "llvm/Support/Casting.h"
79#include "llvm/Support/CommandLine.h"
80#include "llvm/Support/Compiler.h"
81#include "llvm/Support/DOTGraphTraits.h"
82#include "llvm/Support/Debug.h"
83#include "llvm/Support/DebugCounter.h"
84#include "llvm/Support/ErrorHandling.h"
85#include "llvm/Support/GraphWriter.h"
86#include "llvm/Support/InstructionCost.h"
87#include "llvm/Support/KnownBits.h"
88#include "llvm/Support/MathExtras.h"
89#include "llvm/Support/raw_ostream.h"
90#include "llvm/Transforms/Utils/InjectTLIMappings.h"
91#include "llvm/Transforms/Utils/Local.h"
92#include "llvm/Transforms/Utils/LoopUtils.h"
93#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108using namespace slpvectorizer;
109using namespace std::placeholders;
110
111#define SV_NAME "slp-vectorizer"
112#define DEBUG_TYPE "SLP"
113
114STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115
116DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
117 "Controls which SLP graphs should be vectorized.");
118
119static cl::opt<bool>
120 RunSLPVectorization("vectorize-slp", cl::init(Val: true), cl::Hidden,
121 cl::desc("Run the SLP vectorization passes"));
122
123static cl::opt<bool>
124 SLPReVec("slp-revec", cl::init(Val: false), cl::Hidden,
125 cl::desc("Enable vectorization for wider vector utilization"));
126
127static cl::opt<int>
128 SLPCostThreshold("slp-threshold", cl::init(Val: 0), cl::Hidden,
129 cl::desc("Only vectorize if you gain more than this "
130 "number "));
131
132static cl::opt<bool> SLPSkipEarlyProfitabilityCheck(
133 "slp-skip-early-profitability-check", cl::init(Val: false), cl::Hidden,
134 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
135 "heuristics and makes vectorization decision via cost modeling."));
136
137static cl::opt<bool>
138ShouldVectorizeHor("slp-vectorize-hor", cl::init(Val: true), cl::Hidden,
139 cl::desc("Attempt to vectorize horizontal reductions"));
140
141static cl::opt<bool> ShouldStartVectorizeHorAtStore(
142 "slp-vectorize-hor-store", cl::init(Val: false), cl::Hidden,
143 cl::desc(
144 "Attempt to vectorize horizontal reductions feeding into a store"));
145
146static cl::opt<bool> SplitAlternateInstructions(
147 "slp-split-alternate-instructions", cl::init(Val: true), cl::Hidden,
148 cl::desc("Improve the code quality by splitting alternate instructions"));
149
150static cl::opt<int>
151MaxVectorRegSizeOption("slp-max-reg-size", cl::init(Val: 128), cl::Hidden,
152 cl::desc("Attempt to vectorize for this register size in bits"));
153
154static cl::opt<unsigned>
155MaxVFOption("slp-max-vf", cl::init(Val: 0), cl::Hidden,
156 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
157
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
162static cl::opt<int>
163ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(Val: 100000), cl::Hidden,
164 cl::desc("Limit the size of the SLP scheduling region per block"));
165
166static cl::opt<int> MinVectorRegSizeOption(
167 "slp-min-reg-size", cl::init(Val: 128), cl::Hidden,
168 cl::desc("Attempt to vectorize for this register size in bits"));
169
170static cl::opt<unsigned> RecursionMaxDepth(
171 "slp-recursion-max-depth", cl::init(Val: 12), cl::Hidden,
172 cl::desc("Limit the recursion depth when building a vectorizable tree"));
173
174static cl::opt<unsigned> MinTreeSize(
175 "slp-min-tree-size", cl::init(Val: 3), cl::Hidden,
176 cl::desc("Only vectorize small trees if they are fully vectorizable"));
177
178// The maximum depth that the look-ahead score heuristic will explore.
179// The higher this value, the higher the compilation time overhead.
180static cl::opt<int> LookAheadMaxDepth(
181 "slp-max-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
182 cl::desc("The maximum look-ahead depth for operand reordering scores"));
183
// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand reordering this one is used less frequently,
// so the impact of a higher value is less noticeable.
189static cl::opt<int> RootLookAheadMaxDepth(
190 "slp-max-root-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
191 cl::desc("The maximum look-ahead depth for searching best rooting option"));
192
193static cl::opt<unsigned> MinProfitableStridedLoads(
194 "slp-min-strided-loads", cl::init(Val: 2), cl::Hidden,
195 cl::desc("The minimum number of loads, which should be considered strided, "
196 "if the stride is > 1 or is runtime value"));
197
198static cl::opt<unsigned> MaxProfitableLoadStride(
199 "slp-max-stride", cl::init(Val: 8), cl::Hidden,
200 cl::desc("The maximum stride, considered to be profitable."));
201
202static cl::opt<bool>
203 DisableTreeReorder("slp-disable-tree-reorder", cl::init(Val: false), cl::Hidden,
204 cl::desc("Disable tree reordering even if it is "
205 "profitable. Used for testing only."));
206
207static cl::opt<bool>
208 ForceStridedLoads("slp-force-strided-loads", cl::init(Val: false), cl::Hidden,
209 cl::desc("Generate strided loads even if they are not "
210 "profitable. Used for testing only."));
211
212static cl::opt<bool>
213 ViewSLPTree("view-slp-tree", cl::Hidden,
214 cl::desc("Display the SLP trees with Graphviz"));
215
216static cl::opt<bool> VectorizeNonPowerOf2(
217 "slp-vectorize-non-power-of-2", cl::init(Val: false), cl::Hidden,
218 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
219
220/// Enables vectorization of copyable elements.
221static cl::opt<bool> VectorizeCopyableElements(
222 "slp-copyable-elements", cl::init(Val: true), cl::Hidden,
223 cl::desc("Try to replace values with the idempotent instructions for "
224 "better vectorization."));
225
226// Limit the number of alias checks. The limit is chosen so that
227// it has no negative effect on the llvm benchmarks.
228static const unsigned AliasedCheckLimit = 10;
229
// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.
232static constexpr int UsesLimit = 64;
233
234// Another limit for the alias checks: The maximum distance between load/store
235// instructions where alias checks are done.
236// This limit is useful for very large basic blocks.
237static const unsigned MaxMemDepDistance = 160;
238
239/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
240/// regions to be handled.
241static const int MinScheduleRegionSize = 16;
242
243/// Maximum allowed number of operands in the PHI nodes.
244static const unsigned MaxPHINumOperands = 128;
245
246/// Predicate for the element types that the SLP vectorizer supports.
247///
248/// The most important thing to filter here are types which are invalid in LLVM
249/// vectors. We also filter target specific types which have absolutely no
250/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
251/// avoids spending time checking the cost model and realizing that they will
252/// be inevitably scalarized.
253static bool isValidElementType(Type *Ty) {
254 // TODO: Support ScalableVectorType.
255 if (SLPReVec && isa<FixedVectorType>(Val: Ty))
256 Ty = Ty->getScalarType();
257 return VectorType::isValidElementType(ElemTy: Ty) && !Ty->isX86_FP80Ty() &&
258 !Ty->isPPC_FP128Ty();
259}
260
/// Returns the type of the given value/instruction \p V. If it is a store,
/// returns the type of its value operand; for Cmp - the type of the compare
/// operands; and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
265static Type *getValueType(Value *V) {
266 if (auto *SI = dyn_cast<StoreInst>(Val: V))
267 return SI->getValueOperand()->getType();
268 if (auto *CI = dyn_cast<CmpInst>(Val: V))
269 return CI->getOperand(i_nocapture: 0)->getType();
270 if (auto *IE = dyn_cast<InsertElementInst>(Val: V))
271 return IE->getOperand(i_nocapture: 1)->getType();
272 return V->getType();
273}
274
275/// \returns the number of elements for Ty.
276static unsigned getNumElements(Type *Ty) {
277 assert(!isa<ScalableVectorType>(Ty) &&
278 "ScalableVectorType is not supported.");
279 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Ty))
280 return VecTy->getNumElements();
281 return 1;
282}
283
284/// \returns the vector type of ScalarTy based on vectorization factor.
285static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
286 return FixedVectorType::get(ElementType: ScalarTy->getScalarType(),
287 NumElts: VF * getNumElements(Ty: ScalarTy));
288}
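// Illustrative examples (added for clarity; not from the original source):
//   getWidenedType(i32, 4)       --> <4 x i32>
//   getWidenedType(<2 x i32>, 4) --> <8 x i32>  (REVEC case: the "scalar" is
//                                                itself a fixed vector, so the
//                                                widened type has VF * 2 lanes)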
289
/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
/// which forms a type that \p TTI splits into whole vector types during
/// legalization.
293static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
294 Type *Ty, unsigned Sz) {
295 if (!isValidElementType(Ty))
296 return bit_ceil(Value: Sz);
297 // Find the number of elements, which forms full vectors.
298 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
299 if (NumParts == 0 || NumParts >= Sz)
300 return bit_ceil(Value: Sz);
301 return bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts)) * NumParts;
302}
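// Hypothetical example (assumes a target where TTI splits <6 x i32> into 2
// parts; added for clarity, not from the original source):
//   getFullVectorNumberOfElements(TTI, i32, /*Sz=*/6)
//     NumParts = 2, result = bit_ceil(divideCeil(6, 2)) * 2 = 4 * 2 = 8
// i.e. 6 elements are rounded up to 8 so that two whole 4-element registers
// are formed.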
303
/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// legalization.
307static unsigned
308getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
309 unsigned Sz) {
310 if (!isValidElementType(Ty))
311 return bit_floor(Value: Sz);
312 // Find the number of elements, which forms full vectors.
313 unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
314 if (NumParts == 0 || NumParts >= Sz)
315 return bit_floor(Value: Sz);
316 unsigned RegVF = bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts));
317 if (RegVF > Sz)
318 return bit_floor(Value: Sz);
319 return (Sz / RegVF) * RegVF;
320}
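// Hypothetical example (same assumption of 2 register parts as above; added
// for clarity, not from the original source):
//   getFloorFullVectorNumberOfElements(TTI, i32, /*Sz=*/6)
//     RegVF = bit_ceil(divideCeil(6, 2)) = 4, result = (6 / 4) * 4 = 4
// i.e. 6 elements are rounded down to 4, the largest count that still fills
// whole registers.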
321
322static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
323 SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an "element".
  // But an element has a different meaning for SLP (scalar) and REVEC
  // (vector). We need to expand Mask into masks which shufflevector can use
  // directly.
328 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
329 for (unsigned I : seq<unsigned>(Size: Mask.size()))
330 for (auto [J, MaskV] : enumerate(First: MutableArrayRef(NewMask).slice(
331 N: I * VecTyNumElements, M: VecTyNumElements)))
332 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
333 : Mask[I] * VecTyNumElements + J;
334 Mask.swap(RHS&: NewMask);
335}
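// Illustrative example (added for clarity; not from the original source):
// with VecTyNumElements = 2 and Mask = [1, 0], each scalar index is expanded
// into a pair of vector-lane indices:
//   [1, 0] --> [2, 3, 0, 1]
// PoisonMaskElem entries stay PoisonMaskElem for all of their expanded lanes.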
336
/// \returns the number of groups of shufflevectors.
/// A group has the following features:
/// 1. All values in a group are shufflevectors.
/// 2. The mask of each shufflevector is an extract-subvector mask
///    (isExtractSubvectorMask).
/// 3. Together, the masks of the shufflevectors use all of the elements of the
///    source.
342/// e.g., it is 1 group (%0)
343/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
344/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
345/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
346/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
347/// it is 2 groups (%3 and %4)
348/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
349/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
350/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
351/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
352/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
353/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
354/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
355/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 groups
357/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
358/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
359/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
360/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
361static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
362 if (VL.empty())
363 return 0;
364 if (!all_of(Range&: VL, P: IsaPred<ShuffleVectorInst>))
365 return 0;
366 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
367 unsigned SVNumElements =
368 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
369 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
370 if (SVNumElements % ShuffleMaskSize != 0)
371 return 0;
372 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
373 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
374 return 0;
375 unsigned NumGroup = 0;
376 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
377 auto *SV = cast<ShuffleVectorInst>(Val: VL[I]);
378 Value *Src = SV->getOperand(i_nocapture: 0);
379 ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
380 SmallBitVector ExpectedIndex(GroupSize);
381 if (!all_of(Range&: Group, P: [&](Value *V) {
382 auto *SV = cast<ShuffleVectorInst>(Val: V);
383 // From the same source.
384 if (SV->getOperand(i_nocapture: 0) != Src)
385 return false;
386 int Index;
387 if (!SV->isExtractSubvectorMask(Index))
388 return false;
389 ExpectedIndex.set(Index / ShuffleMaskSize);
390 return true;
391 }))
392 return 0;
393 if (!ExpectedIndex.all())
394 return 0;
395 ++NumGroup;
396 }
397 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
398 return NumGroup;
399}
400
401/// \returns a shufflevector mask which is used to vectorize shufflevectors
402/// e.g.,
403/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
408/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
409/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
410/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
411/// the result is
412/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
413static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
414 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
415 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
416 unsigned SVNumElements =
417 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
418 SmallVector<int> Mask;
419 unsigned AccumulateLength = 0;
420 for (Value *V : VL) {
421 auto *SV = cast<ShuffleVectorInst>(Val: V);
422 for (int M : SV->getShuffleMask())
423 Mask.push_back(Elt: M == PoisonMaskElem ? PoisonMaskElem
424 : AccumulateLength + M);
425 AccumulateLength += SVNumElements;
426 }
427 return Mask;
428}
429
430/// \returns True if the value is a constant (but not globals/constant
431/// expressions).
432static bool isConstant(Value *V) {
433 return isa<Constant>(Val: V) && !isa<ConstantExpr, GlobalValue>(Val: V);
434}
435
436/// Checks if \p V is one of vector-like instructions, i.e. undef,
437/// insertelement/extractelement with constant indices for fixed vector type or
438/// extractvalue instruction.
439static bool isVectorLikeInstWithConstOps(Value *V) {
440 if (!isa<InsertElementInst, ExtractElementInst>(Val: V) &&
441 !isa<ExtractValueInst, UndefValue>(Val: V))
442 return false;
443 auto *I = dyn_cast<Instruction>(Val: V);
444 if (!I || isa<ExtractValueInst>(Val: I))
445 return true;
446 if (!isa<FixedVectorType>(Val: I->getOperand(i: 0)->getType()))
447 return false;
448 if (isa<ExtractElementInst>(Val: I))
449 return isConstant(V: I->getOperand(i: 1));
450 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
451 return isConstant(V: I->getOperand(i: 2));
452}
453
/// Returns the power-of-2 number of elements in a single register (part), given
/// the total number of elements \p Size and the number of registers (parts) \p
/// NumParts.
457static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
458 return std::min<unsigned>(a: Size, b: bit_ceil(Value: divideCeil(Numerator: Size, Denominator: NumParts)));
459}
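// Illustrative example (added for clarity; not from the original source):
//   getPartNumElems(/*Size=*/6, /*NumParts=*/2) == min(6, bit_ceil(3)) == 4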
460
/// Returns the correct remaining number of elements, considering the total
/// number of elements \p Size, the (power-of-2) number of elements in a single
/// register \p PartNumElems and the current register (part) \p Part.
464static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
465 unsigned Part) {
466 return std::min<unsigned>(a: PartNumElems, b: Size - Part * PartNumElems);
467}
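// Illustrative example (added for clarity; not from the original source),
// continuing Size = 6 with PartNumElems = 4:
//   getNumElems(6, 4, /*Part=*/0) == 4   // first register is full
//   getNumElems(6, 4, /*Part=*/1) == 2   // last register holds the remainder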
468
469#if !defined(NDEBUG)
470/// Print a short descriptor of the instruction bundle suitable for debug output.
471static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
472 std::string Result;
473 raw_string_ostream OS(Result);
474 if (Idx >= 0)
475 OS << "Idx: " << Idx << ", ";
476 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
477 return Result;
478}
479#endif
480
481/// \returns true if all of the instructions in \p VL are in the same block or
482/// false otherwise.
483static bool allSameBlock(ArrayRef<Value *> VL) {
484 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
485 if (It == VL.end())
486 return false;
487 Instruction *I0 = cast<Instruction>(Val: *It);
488 if (all_of(Range&: VL, P: isVectorLikeInstWithConstOps))
489 return true;
490
491 BasicBlock *BB = I0->getParent();
492 for (Value *V : iterator_range(It, VL.end())) {
493 if (isa<PoisonValue>(Val: V))
494 continue;
495 auto *II = dyn_cast<Instruction>(Val: V);
496 if (!II)
497 return false;
498
499 if (BB != II->getParent())
500 return false;
501 }
502 return true;
503}
504
505/// \returns True if all of the values in \p VL are constants (but not
506/// globals/constant expressions).
507static bool allConstant(ArrayRef<Value *> VL) {
508 // Constant expressions and globals can't be vectorized like normal integer/FP
509 // constants.
510 return all_of(Range&: VL, P: isConstant);
511}
512
513/// \returns True if all of the values in \p VL are identical or some of them
514/// are UndefValue.
515static bool isSplat(ArrayRef<Value *> VL) {
516 Value *FirstNonUndef = nullptr;
517 for (Value *V : VL) {
518 if (isa<UndefValue>(Val: V))
519 continue;
520 if (!FirstNonUndef) {
521 FirstNonUndef = V;
522 continue;
523 }
524 if (V != FirstNonUndef)
525 return false;
526 }
527 return FirstNonUndef != nullptr;
528}
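// Illustrative examples (added for clarity; not from the original source):
//   isSplat({%x, undef, %x})  --> true   (all non-undef values are identical)
//   isSplat({%x, %y})         --> false
//   isSplat({undef, undef})   --> false  (no non-undef value at all)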
529
530/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
/// For BinaryOperator, it also checks if \p ValWithUses is used in specific
/// patterns that make it effectively commutative (like equality comparisons
/// with zero).
/// In most cases, users should not call this function directly (since \p I and
/// \p ValWithUses are the same). However, when analyzing interchangeable
536/// instructions, we need to use the converted opcode along with the original
537/// uses.
538/// \param I The instruction to check for commutativity
539/// \param ValWithUses The value whose uses are analyzed for special
540/// patterns
541static bool isCommutative(Instruction *I, Value *ValWithUses,
542 bool IsCopyable = false) {
543 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
544 return Cmp->isCommutative();
545 if (auto *BO = dyn_cast<BinaryOperator>(Val: I))
546 return BO->isCommutative() ||
547 (BO->getOpcode() == Instruction::Sub &&
548 ValWithUses->hasUseList() &&
549 !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
550 all_of(
551 Range: ValWithUses->uses(),
552 P: [&](const Use &U) {
553 // Commutative, if icmp eq/ne sub, 0
554 CmpPredicate Pred;
555 if (match(V: U.getUser(),
556 P: m_ICmp(Pred, L: m_Specific(V: U.get()), R: m_Zero())) &&
557 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
558 return true;
559 // Commutative, if abs(sub nsw, true) or abs(sub, false).
560 ConstantInt *Flag;
561 auto *I = dyn_cast<BinaryOperator>(Val: U.get());
562 return match(V: U.getUser(),
563 P: m_Intrinsic<Intrinsic::abs>(
564 Op0: m_Specific(V: U.get()), Op1: m_ConstantInt(CI&: Flag))) &&
565 ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
566 Flag->isOne());
567 })) ||
568 (BO->getOpcode() == Instruction::FSub &&
569 ValWithUses->hasUseList() &&
570 !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
571 all_of(Range: ValWithUses->uses(), P: [](const Use &U) {
572 return match(V: U.getUser(),
573 P: m_Intrinsic<Intrinsic::fabs>(Op0: m_Specific(V: U.get())));
574 }));
575 return I->isCommutative();
576}
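// Illustrative example (added for clarity; not from the original source): a
// subtraction whose only use is an equality comparison with zero is treated as
// commutative, because (a - b) == 0 iff (b - a) == 0:
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0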
577
/// Checks if the operand is commutative. In commutative operations, not all
/// operands may be commutable, e.g. for fmuladd only the first 2 operands are
/// commutable.
581static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
582 bool IsCopyable = false) {
583 assert(::isCommutative(I, ValWithUses, IsCopyable) &&
584 "The instruction is not commutative.");
585 if (isa<CmpInst>(Val: I))
586 return true;
587 if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) {
588 switch (BO->getOpcode()) {
589 case Instruction::Sub:
590 case Instruction::FSub:
591 return true;
592 default:
593 break;
594 }
595 }
596 return I->isCommutableOperand(Op);
597}
598
599/// This is a helper function to check whether \p I is commutative.
600/// This is a convenience wrapper that calls the two-parameter version of
601/// isCommutative with the same instruction for both parameters. This is
602/// the common case where the instruction being checked for commutativity
603/// is the same as the instruction whose uses are analyzed for special
604/// patterns (see the two-parameter version above for details).
605/// \param I The instruction to check for commutativity
606/// \returns true if the instruction is commutative, false otherwise
607static bool isCommutative(Instruction *I) { return isCommutative(I, ValWithUses: I); }
608
609/// \returns number of operands of \p I, considering commutativity. Returns 2
610/// for commutative intrinsics.
611/// \param I The instruction to check for commutativity
612static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
613 if (isa<IntrinsicInst>(Val: I) && isCommutative(I)) {
614 // IntrinsicInst::isCommutative returns true if swapping the first "two"
615 // arguments to the intrinsic produces the same result.
616 constexpr unsigned IntrinsicNumOperands = 2;
617 return IntrinsicNumOperands;
618 }
619 return I->getNumOperands();
620}
621
622template <typename T>
623static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
624 unsigned Offset) {
625 static_assert(std::is_same_v<T, InsertElementInst> ||
626 std::is_same_v<T, ExtractElementInst>,
627 "unsupported T");
628 int Index = Offset;
629 if (const auto *IE = dyn_cast<T>(Inst)) {
630 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
631 if (!VT)
632 return std::nullopt;
633 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
634 if (!CI)
635 return std::nullopt;
636 if (CI->getValue().uge(VT->getNumElements()))
637 return std::nullopt;
638 Index *= VT->getNumElements();
639 Index += CI->getZExtValue();
640 return Index;
641 }
642 return std::nullopt;
643}
644
645/// \returns inserting or extracting index of InsertElement, ExtractElement or
646/// InsertValue instruction, using Offset as base offset for index.
647/// \returns std::nullopt if the index is not an immediate.
648static std::optional<unsigned> getElementIndex(const Value *Inst,
649 unsigned Offset = 0) {
650 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
651 return Index;
652 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
653 return Index;
654
655 int Index = Offset;
656
657 const auto *IV = dyn_cast<InsertValueInst>(Val: Inst);
658 if (!IV)
659 return std::nullopt;
660
661 Type *CurrentType = IV->getType();
662 for (unsigned I : IV->indices()) {
663 if (const auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
664 Index *= ST->getNumElements();
665 CurrentType = ST->getElementType(N: I);
666 } else if (const auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
667 Index *= AT->getNumElements();
668 CurrentType = AT->getElementType();
669 } else {
670 return std::nullopt;
671 }
672 Index += I;
673 }
674 return Index;
675}
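// Illustrative example (added for clarity; not from the original source): for
//   %r = insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
// the flattened index is computed as (0 * 2 + 1) * 3 + 2 == 5.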
676
677/// \returns true if all of the values in \p VL use the same opcode.
678/// For comparison instructions, also checks if predicates match.
679/// PoisonValues are considered matching.
680/// Interchangeable instructions are not considered.
681static bool allSameOpcode(ArrayRef<Value *> VL) {
682 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
683 if (It == VL.end())
684 return true;
685 Instruction *MainOp = cast<Instruction>(Val: *It);
686 unsigned Opcode = MainOp->getOpcode();
687 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
688 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
689 : CmpInst::BAD_ICMP_PREDICATE;
690 return std::all_of(first: It, last: VL.end(), pred: [&](Value *V) {
691 if (auto *CI = dyn_cast<CmpInst>(Val: V))
692 return BasePred == CI->getPredicate();
693 if (auto *I = dyn_cast<Instruction>(Val: V))
694 return I->getOpcode() == Opcode;
695 return isa<PoisonValue>(Val: V);
696 });
697}
698
699namespace {
700/// Specifies the way the mask should be analyzed for undefs/poisonous elements
701/// in the shuffle mask.
702enum class UseMask {
703 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
704 ///< check for the mask elements for the first argument (mask
705 ///< indices are in range [0:VF)).
706 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
707 ///< for the mask elements for the second argument (mask indices
708 ///< are in range [VF:2*VF))
709 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
710 ///< future shuffle elements and mark them as ones as being used
711 ///< in future. Non-undef elements are considered as unused since
712 ///< they're already marked as used in the mask.
713};
714} // namespace
715
716/// Prepares a use bitset for the given mask either for the first argument or
717/// for the second.
718static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
719 UseMask MaskArg) {
720 SmallBitVector UseMask(VF, true);
721 for (auto [Idx, Value] : enumerate(First&: Mask)) {
722 if (Value == PoisonMaskElem) {
723 if (MaskArg == UseMask::UndefsAsMask)
724 UseMask.reset(Idx);
725 continue;
726 }
727 if (MaskArg == UseMask::FirstArg && Value < VF)
728 UseMask.reset(Idx: Value);
729 else if (MaskArg == UseMask::SecondArg && Value >= VF)
730 UseMask.reset(Idx: Value - VF);
731 }
732 return UseMask;
733}
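// Illustrative example (added for clarity; not from the original source): with
// VF = 4 and Mask = [0, 5, PoisonMaskElem, 3]:
//   buildUseMask(4, Mask, UseMask::FirstArg)  resets bits 0 and 3 (lanes of the
//   first source referenced by the mask), leaving bits {1, 2} set.
//   buildUseMask(4, Mask, UseMask::SecondArg) resets bit 1 (mask value 5 maps
//   to lane 1 of the second source), leaving bits {0, 2, 3} set.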
734
735/// Checks if the given value is actually an undefined constant vector.
736/// Also, if the \p UseMask is not empty, tries to check if the non-masked
737/// elements actually mask the insertelement buildvector, if any.
738template <bool IsPoisonOnly = false>
739static SmallBitVector isUndefVector(const Value *V,
740 const SmallBitVector &UseMask = {}) {
741 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
742 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
743 if (isa<T>(V))
744 return Res;
745 auto *VecTy = dyn_cast<FixedVectorType>(Val: V->getType());
746 if (!VecTy)
747 return Res.reset();
748 auto *C = dyn_cast<Constant>(Val: V);
749 if (!C) {
750 if (!UseMask.empty()) {
751 const Value *Base = V;
752 while (auto *II = dyn_cast<InsertElementInst>(Val: Base)) {
753 Base = II->getOperand(i_nocapture: 0);
754 if (isa<T>(II->getOperand(i_nocapture: 1)))
755 continue;
756 std::optional<unsigned> Idx = getElementIndex(Inst: II);
757 if (!Idx) {
758 Res.reset();
759 return Res;
760 }
761 if (*Idx < UseMask.size() && !UseMask.test(Idx: *Idx))
762 Res.reset(Idx: *Idx);
763 }
764 // TODO: Add analysis for shuffles here too.
765 if (V == Base) {
766 Res.reset();
767 } else {
768 SmallBitVector SubMask(UseMask.size(), false);
769 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
770 }
771 } else {
772 Res.reset();
773 }
774 return Res;
775 }
776 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
777 if (Constant *Elem = C->getAggregateElement(Elt: I))
778 if (!isa<T>(Elem) &&
779 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(Idx: I))))
780 Res.reset(Idx: I);
781 }
782 return Res;
783}
784
785/// Checks if the vector of instructions can be represented as a shuffle, like:
786/// %x0 = extractelement <4 x i8> %x, i32 0
787/// %x3 = extractelement <4 x i8> %x, i32 3
788/// %y1 = extractelement <4 x i8> %y, i32 1
789/// %y2 = extractelement <4 x i8> %y, i32 2
790/// %x0x0 = mul i8 %x0, %x0
791/// %x3x3 = mul i8 %x3, %x3
792/// %y1y1 = mul i8 %y1, %y1
793/// %y2y2 = mul i8 %y2, %y2
794/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
795/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
796/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
797/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
798/// ret <4 x i8> %ins4
799/// can be transformed into:
800/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
801/// i32 6>
802/// %2 = mul <4 x i8> %1, %1
803/// ret <4 x i8> %2
804/// Mask will return the Shuffle Mask equivalent to the extracted elements.
805/// TODO: Can we split off and reuse the shuffle mask detection from
806/// ShuffleVectorInst/getShuffleCost?
807static std::optional<TargetTransformInfo::ShuffleKind>
808isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
809 AssumptionCache *AC) {
810 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst>);
811 if (It == VL.end())
812 return std::nullopt;
813 unsigned Size =
814 std::accumulate(first: VL.begin(), last: VL.end(), init: 0u, binary_op: [](unsigned S, Value *V) {
815 auto *EI = dyn_cast<ExtractElementInst>(Val: V);
816 if (!EI)
817 return S;
818 auto *VTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
819 if (!VTy)
820 return S;
821 return std::max(a: S, b: VTy->getNumElements());
822 });
823
824 Value *Vec1 = nullptr;
825 Value *Vec2 = nullptr;
826 bool HasNonUndefVec = any_of(Range&: VL, P: [&](Value *V) {
827 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
828 if (!EE)
829 return false;
830 Value *Vec = EE->getVectorOperand();
831 if (isa<UndefValue>(Val: Vec))
832 return false;
833 return isGuaranteedNotToBePoison(V: Vec, AC);
834 });
835 enum ShuffleMode { Unknown, Select, Permute };
836 ShuffleMode CommonShuffleMode = Unknown;
837 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
838 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
839 // Undef can be represented as an undef element in a vector.
840 if (isa<UndefValue>(Val: VL[I]))
841 continue;
842 auto *EI = cast<ExtractElementInst>(Val: VL[I]);
843 if (isa<ScalableVectorType>(Val: EI->getVectorOperandType()))
844 return std::nullopt;
845 auto *Vec = EI->getVectorOperand();
846 // We can extractelement from undef or poison vector.
847 if (isUndefVector</*isPoisonOnly=*/true>(V: Vec).all())
848 continue;
849 // All vector operands must have the same number of vector elements.
850 if (isa<UndefValue>(Val: Vec)) {
851 Mask[I] = I;
852 } else {
853 if (isa<UndefValue>(Val: EI->getIndexOperand()))
854 continue;
855 auto *Idx = dyn_cast<ConstantInt>(Val: EI->getIndexOperand());
856 if (!Idx)
857 return std::nullopt;
858 // Undefined behavior if Idx is negative or >= Size.
859 if (Idx->getValue().uge(RHS: Size))
860 continue;
861 unsigned IntIdx = Idx->getValue().getZExtValue();
862 Mask[I] = IntIdx;
863 }
864 if (isUndefVector(V: Vec).all() && HasNonUndefVec)
865 continue;
866 // For correct shuffling we have to have at most 2 different vector operands
867 // in all extractelement instructions.
868 if (!Vec1 || Vec1 == Vec) {
869 Vec1 = Vec;
870 } else if (!Vec2 || Vec2 == Vec) {
871 Vec2 = Vec;
872 Mask[I] += Size;
873 } else {
874 return std::nullopt;
875 }
876 if (CommonShuffleMode == Permute)
877 continue;
878 // If the extract index is not the same as the operation number, it is a
879 // permutation.
880 if (Mask[I] % Size != I) {
881 CommonShuffleMode = Permute;
882 continue;
883 }
884 CommonShuffleMode = Select;
885 }
886 // If we're not crossing lanes in different vectors, consider it as blending.
887 if (CommonShuffleMode == Select && Vec2)
888 return TargetTransformInfo::SK_Select;
889 // If Vec2 was never used, we have a permutation of a single vector, otherwise
890 // we have permutation of 2 vectors.
891 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
892 : TargetTransformInfo::SK_PermuteSingleSrc;
893}
894
895/// \returns True if Extract{Value,Element} instruction extracts element Idx.
896static std::optional<unsigned> getExtractIndex(const Instruction *E) {
897 unsigned Opcode = E->getOpcode();
898 assert((Opcode == Instruction::ExtractElement ||
899 Opcode == Instruction::ExtractValue) &&
900 "Expected extractelement or extractvalue instruction.");
901 if (Opcode == Instruction::ExtractElement) {
902 auto *CI = dyn_cast<ConstantInt>(Val: E->getOperand(i: 1));
903 if (!CI)
904 return std::nullopt;
905 // Check if the index is out of bound - we can get the source vector from
906 // operand 0
907 unsigned Idx = CI->getZExtValue();
908 auto *EE = cast<ExtractElementInst>(Val: E);
909 const unsigned VF = ::getNumElements(Ty: EE->getVectorOperandType());
910 if (Idx >= VF)
911 return std::nullopt;
912 return Idx;
913 }
914 auto *EI = cast<ExtractValueInst>(Val: E);
915 if (EI->getNumIndices() != 1)
916 return std::nullopt;
917 return *EI->idx_begin();
918}
919
920/// Checks if the provided value does not require scheduling. It does not
921/// require scheduling if this is not an instruction or it is an instruction
922/// that does not read/write memory and all operands are either not instructions
923/// or phi nodes or instructions from different blocks.
924static bool areAllOperandsNonInsts(Value *V);
925/// Checks if the provided value does not require scheduling. It does not
926/// require scheduling if this is not an instruction or it is an instruction
927/// that does not read/write memory and all users are phi nodes or instructions
/// from different blocks.
929static bool isUsedOutsideBlock(Value *V);
930/// Checks if the specified value does not require scheduling. It does not
931/// require scheduling if all operands and all users do not need to be scheduled
932/// in the current basic block.
933static bool doesNotNeedToBeScheduled(Value *V);
934
935/// \returns true if \p Opcode is allowed as part of the main/alternate
936/// instruction for SLP vectorization.
937///
938/// Example of unsupported opcode is SDIV that can potentially cause UB if the
939/// "shuffled out" lane would result in division by zero.
940static bool isValidForAlternation(unsigned Opcode) {
941 return !Instruction::isIntDivRem(Opcode);
942}
943
944namespace {
945
/// Helper class that determines whether VL can use the same opcode.
/// Alternate instructions are supported. In addition, it supports
/// interchangeable instructions. An interchangeable instruction is an
/// instruction that can be converted to another instruction with the same
/// semantics. For example, x << 1 is equal to x * 2, and x * 1 is equal to
/// x | 0.
951class BinOpSameOpcodeHelper {
952 using MaskType = std::uint_fast16_t;
953 /// Sort SupportedOp because it is used by binary_search.
954 constexpr static std::initializer_list<unsigned> SupportedOp = {
955 Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
956 Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
957 enum : MaskType {
958 ShlBIT = 0b1,
959 AShrBIT = 0b10,
960 MulBIT = 0b100,
961 AddBIT = 0b1000,
962 SubBIT = 0b10000,
963 AndBIT = 0b100000,
964 OrBIT = 0b1000000,
965 XorBIT = 0b10000000,
966 MainOpBIT = 0b100000000,
967 LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
968 };
969 /// Return a non-nullptr if either operand of I is a ConstantInt.
970 /// The second return value represents the operand position. We check the
971 /// right-hand side first (1). If the right hand side is not a ConstantInt and
972 /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
973 /// side (0).
974 static std::pair<ConstantInt *, unsigned>
975 isBinOpWithConstantInt(const Instruction *I) {
976 unsigned Opcode = I->getOpcode();
977 assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
978 (void)SupportedOp;
979 auto *BinOp = cast<BinaryOperator>(Val: I);
980 if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 1)))
981 return {CI, 1};
982 if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
983 Opcode == Instruction::AShr)
984 return {nullptr, 0};
985 if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 0)))
986 return {CI, 0};
987 return {nullptr, 0};
988 }
989 struct InterchangeableInfo {
990 const Instruction *I = nullptr;
991 /// The bit it sets represents whether MainOp can be converted to.
992 MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
993 MulBIT | AShrBIT | ShlBIT;
994 /// We cannot create an interchangeable instruction that does not exist in
995 /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
996 /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
997 /// 1]. SeenBefore is used to know what operations have been seen before.
998 MaskType SeenBefore = 0;
999 InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Returning false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask would destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
1003 bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
1004 if (Mask & InterchangeableMask) {
1005 SeenBefore |= OpcodeInMaskForm;
1006 Mask &= InterchangeableMask;
1007 return true;
1008 }
1009 return false;
1010 }
1011 bool equal(unsigned Opcode) {
1012 return Opcode == I->getOpcode() && trySet(OpcodeInMaskForm: MainOpBIT, InterchangeableMask: MainOpBIT);
1013 }
1014 unsigned getOpcode() const {
1015 MaskType Candidate = Mask & SeenBefore;
1016 if (Candidate & MainOpBIT)
1017 return I->getOpcode();
1018 if (Candidate & ShlBIT)
1019 return Instruction::Shl;
1020 if (Candidate & AShrBIT)
1021 return Instruction::AShr;
1022 if (Candidate & MulBIT)
1023 return Instruction::Mul;
1024 if (Candidate & AddBIT)
1025 return Instruction::Add;
1026 if (Candidate & SubBIT)
1027 return Instruction::Sub;
1028 if (Candidate & AndBIT)
1029 return Instruction::And;
1030 if (Candidate & OrBIT)
1031 return Instruction::Or;
1032 if (Candidate & XorBIT)
1033 return Instruction::Xor;
1034 llvm_unreachable("Cannot find interchangeable instruction.");
1035 }
1036
1037 /// Return true if the instruction can be converted to \p Opcode.
1038 bool hasCandidateOpcode(unsigned Opcode) const {
1039 MaskType Candidate = Mask & SeenBefore;
1040 switch (Opcode) {
1041 case Instruction::Shl:
1042 return Candidate & ShlBIT;
1043 case Instruction::AShr:
1044 return Candidate & AShrBIT;
1045 case Instruction::Mul:
1046 return Candidate & MulBIT;
1047 case Instruction::Add:
1048 return Candidate & AddBIT;
1049 case Instruction::Sub:
1050 return Candidate & SubBIT;
1051 case Instruction::And:
1052 return Candidate & AndBIT;
1053 case Instruction::Or:
1054 return Candidate & OrBIT;
1055 case Instruction::Xor:
1056 return Candidate & XorBIT;
1057 case Instruction::LShr:
1058 case Instruction::FAdd:
1059 case Instruction::FSub:
1060 case Instruction::FMul:
1061 case Instruction::SDiv:
1062 case Instruction::UDiv:
1063 case Instruction::FDiv:
1064 case Instruction::SRem:
1065 case Instruction::URem:
1066 case Instruction::FRem:
1067 return false;
1068 default:
1069 break;
1070 }
1071 llvm_unreachable("Cannot find interchangeable instruction.");
1072 }
1073
1074 SmallVector<Value *> getOperand(const Instruction *To) const {
1075 unsigned ToOpcode = To->getOpcode();
1076 unsigned FromOpcode = I->getOpcode();
1077 if (FromOpcode == ToOpcode)
1078 return SmallVector<Value *>(I->operands());
1079 assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
1080 auto [CI, Pos] = isBinOpWithConstantInt(I);
1081 const APInt &FromCIValue = CI->getValue();
1082 unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
1083 APInt ToCIValue;
1084 switch (FromOpcode) {
1085 case Instruction::Shl:
1086 if (ToOpcode == Instruction::Mul) {
1087 ToCIValue = APInt::getOneBitSet(numBits: FromCIValueBitWidth,
1088 BitNo: FromCIValue.getZExtValue());
1089 } else {
1090 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1091 ToCIValue = ToOpcode == Instruction::And
1092 ? APInt::getAllOnes(numBits: FromCIValueBitWidth)
1093 : APInt::getZero(numBits: FromCIValueBitWidth);
1094 }
1095 break;
1096 case Instruction::Mul:
1097 assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
1098 if (ToOpcode == Instruction::Shl) {
1099 ToCIValue = APInt(FromCIValueBitWidth, FromCIValue.logBase2());
1100 } else {
1101 assert(FromCIValue.isOne() && "Cannot convert the instruction.");
1102 ToCIValue = ToOpcode == Instruction::And
1103 ? APInt::getAllOnes(numBits: FromCIValueBitWidth)
1104 : APInt::getZero(numBits: FromCIValueBitWidth);
1105 }
1106 break;
1107 case Instruction::Add:
1108 case Instruction::Sub:
1109 if (FromCIValue.isZero()) {
1110 ToCIValue = APInt::getZero(numBits: FromCIValueBitWidth);
1111 } else {
1112 assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
1113 "Cannot convert the instruction.");
1114 ToCIValue = FromCIValue;
1115 ToCIValue.negate();
1116 }
1117 break;
1118 case Instruction::And:
1119 assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
1120 ToCIValue = ToOpcode == Instruction::Mul
1121 ? APInt::getOneBitSet(numBits: FromCIValueBitWidth, BitNo: 0)
1122 : APInt::getZero(numBits: FromCIValueBitWidth);
1123 break;
1124 default:
1125 assert(FromCIValue.isZero() && "Cannot convert the instruction.");
1126 ToCIValue = APInt::getZero(numBits: FromCIValueBitWidth);
1127 break;
1128 }
1129 Value *LHS = I->getOperand(i: 1 - Pos);
1130 Constant *RHS =
1131 ConstantInt::get(Ty: I->getOperand(i: Pos)->getType(), V: ToCIValue);
1132 // constant + x cannot be -constant - x
1133 // instead, it should be x - -constant
1134 if (Pos == 1 ||
1135 ((FromOpcode == Instruction::Add || FromOpcode == Instruction::Or ||
1136 FromOpcode == Instruction::Xor) &&
1137 ToOpcode == Instruction::Sub))
1138 return SmallVector<Value *>({LHS, RHS});
1139 return SmallVector<Value *>({RHS, LHS});
1140 }
1141 };
1142 InterchangeableInfo MainOp;
1143 InterchangeableInfo AltOp;
1144 bool isValidForAlternation(const Instruction *I) const {
1145 return ::isValidForAlternation(Opcode: MainOp.I->getOpcode()) &&
1146 ::isValidForAlternation(Opcode: I->getOpcode());
1147 }
1148 bool initializeAltOp(const Instruction *I) {
1149 if (AltOp.I)
1150 return true;
1151 if (!isValidForAlternation(I))
1152 return false;
1153 AltOp.I = I;
1154 return true;
1155 }
1156
1157public:
1158 BinOpSameOpcodeHelper(const Instruction *MainOp,
1159 const Instruction *AltOp = nullptr)
1160 : MainOp(MainOp), AltOp(AltOp) {
1161 assert(is_sorted(SupportedOp) && "SupportedOp is not sorted.");
1162 }
1163 bool add(const Instruction *I) {
1164 assert(isa<BinaryOperator>(I) &&
1165 "BinOpSameOpcodeHelper only accepts BinaryOperator.");
1166 unsigned Opcode = I->getOpcode();
1167 MaskType OpcodeInMaskForm;
1168 // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
1169 switch (Opcode) {
1170 case Instruction::Shl:
1171 OpcodeInMaskForm = ShlBIT;
1172 break;
1173 case Instruction::AShr:
1174 OpcodeInMaskForm = AShrBIT;
1175 break;
1176 case Instruction::Mul:
1177 OpcodeInMaskForm = MulBIT;
1178 break;
1179 case Instruction::Add:
1180 OpcodeInMaskForm = AddBIT;
1181 break;
1182 case Instruction::Sub:
1183 OpcodeInMaskForm = SubBIT;
1184 break;
1185 case Instruction::And:
1186 OpcodeInMaskForm = AndBIT;
1187 break;
1188 case Instruction::Or:
1189 OpcodeInMaskForm = OrBIT;
1190 break;
1191 case Instruction::Xor:
1192 OpcodeInMaskForm = XorBIT;
1193 break;
1194 default:
1195 return MainOp.equal(Opcode) ||
1196 (initializeAltOp(I) && AltOp.equal(Opcode));
1197 }
1198 MaskType InterchangeableMask = OpcodeInMaskForm;
1199 ConstantInt *CI = isBinOpWithConstantInt(I).first;
1200 if (CI) {
1201 constexpr MaskType CanBeAll =
1202 XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
1203 const APInt &CIValue = CI->getValue();
1204 switch (Opcode) {
1205 case Instruction::Shl:
1206 if (CIValue.ult(RHS: CIValue.getBitWidth()))
1207 InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
1208 break;
1209 case Instruction::Mul:
1210 if (CIValue.isOne()) {
1211 InterchangeableMask = CanBeAll;
1212 break;
1213 }
1214 if (CIValue.isPowerOf2())
1215 InterchangeableMask = MulBIT | ShlBIT;
1216 break;
1217 case Instruction::Add:
1218 case Instruction::Sub:
1219 InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
1220 break;
1221 case Instruction::And:
1222 if (CIValue.isAllOnes())
1223 InterchangeableMask = CanBeAll;
1224 break;
1225 case Instruction::Xor:
1226 if (CIValue.isZero())
1227 InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
1228 break;
1229 default:
1230 if (CIValue.isZero())
1231 InterchangeableMask = CanBeAll;
1232 break;
1233 }
1234 }
1235 return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
1236 (initializeAltOp(I) &&
1237 AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
1238 }
1239 unsigned getMainOpcode() const { return MainOp.getOpcode(); }
1240 /// Checks if the list of potential opcodes includes \p Opcode.
1241 bool hasCandidateOpcode(unsigned Opcode) const {
1242 return MainOp.hasCandidateOpcode(Opcode);
1243 }
1244 bool hasAltOp() const { return AltOp.I; }
1245 unsigned getAltOpcode() const {
1246 return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
1247 }
1248 SmallVector<Value *> getOperand(const Instruction *I) const {
1249 return MainOp.getOperand(To: I);
1250 }
1251};
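// Illustrative usage sketch (hypothetical values, added for clarity; not from
// the original source):
//   VL = [ %a = add i32 %x, 0,  %b = mul i32 %y, 1 ]
//   BinOpSameOpcodeHelper Helper(cast<Instruction>(VL[0]));
//   Helper.add(cast<Instruction>(VL[0])); // add with 0: convertible to any op
//   Helper.add(cast<Instruction>(VL[1])); // mul with 1: convertible to any op
//   Helper.getMainOpcode();               // Instruction::Mul
// so the whole bundle can be emitted with a single opcode as [x * 1, y * 1],
// matching the SeenBefore comment above.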
1252
1253/// Main data required for vectorization of instructions.
1254class InstructionsState {
1255 /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
1256 /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
1257 /// (i.e., AltOp is not equal to MainOp; this can be checked using
1258 /// isAltShuffle).
1259 /// A rare exception is TrySplitNode, where the InstructionsState is derived
1260 /// from getMainAltOpsNoStateVL.
1261 /// For those InstructionsState that use alternate instructions, the resulting
1262 /// vectorized output ultimately comes from a shufflevector. For example,
1263 /// given a vector list (VL):
1264 /// VL[0] = add i32 a, e
1265 /// VL[1] = sub i32 b, f
1266 /// VL[2] = add i32 c, g
1267 /// VL[3] = sub i32 d, h
1268 /// The vectorized result would be:
1269 /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
1270 /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
1271 /// result = shufflevector <4 x i32> intermediated_0,
1272 /// <4 x i32> intermediated_1,
1273 /// <4 x i32> <i32 0, i32 5, i32 2, i32 7>
1274 /// Since shufflevector is used in the final result, when calculating the cost
1275 /// (getEntryCost), we must account for the usage of shufflevector in
1276 /// GetVectorCost.
1277 Instruction *MainOp = nullptr;
1278 Instruction *AltOp = nullptr;
  /// Whether the instruction state represents copyable instructions.
1280 bool HasCopyables = false;
1281
1282public:
1283 Instruction *getMainOp() const {
1284 assert(valid() && "InstructionsState is invalid.");
1285 return MainOp;
1286 }
1287
1288 Instruction *getAltOp() const {
1289 assert(valid() && "InstructionsState is invalid.");
1290 return AltOp;
1291 }
1292
1293 /// The main/alternate opcodes for the list of instructions.
1294 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
1295
1296 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
1297
1298 /// Some of the instructions in the list have alternate opcodes.
1299 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
1300
1301 /// Checks if the instruction matches either the main or alternate opcode.
1302 /// \returns
1303 /// - MainOp if \param I matches MainOp's opcode directly or can be converted
1304 /// to it
1305 /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
1306 /// it
1307 /// - nullptr if \param I cannot be matched or converted to either opcode
1308 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
1309 assert(MainOp && "MainOp cannot be nullptr.");
1310 if (I->getOpcode() == MainOp->getOpcode())
1311 return MainOp;
1312 // Prefer AltOp instead of interchangeable instruction of MainOp.
1313 assert(AltOp && "AltOp cannot be nullptr.");
1314 if (I->getOpcode() == AltOp->getOpcode())
1315 return AltOp;
1316 if (!I->isBinaryOp())
1317 return nullptr;
1318 BinOpSameOpcodeHelper Converter(MainOp);
1319 if (!Converter.add(I) || !Converter.add(I: MainOp))
1320 return nullptr;
1321 if (isAltShuffle() && !Converter.hasCandidateOpcode(Opcode: MainOp->getOpcode())) {
1322 BinOpSameOpcodeHelper AltConverter(AltOp);
1323 if (AltConverter.add(I) && AltConverter.add(I: AltOp) &&
1324 AltConverter.hasCandidateOpcode(Opcode: AltOp->getOpcode()))
1325 return AltOp;
1326 }
1327 if (Converter.hasAltOp() && !isAltShuffle())
1328 return nullptr;
1329 return Converter.hasAltOp() ? AltOp : MainOp;
1330 }
1331
1332 /// Checks if main/alt instructions are shift operations.
1333 bool isShiftOp() const {
1334 return getMainOp()->isShift() && getAltOp()->isShift();
1335 }
1336
1337 /// Checks if main/alt instructions are bitwise logic operations.
1338 bool isBitwiseLogicOp() const {
1339 return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
1340 }
1341
1342 /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
1343 bool isMulDivLikeOp() const {
1344 constexpr std::array<unsigned, 8> MulDiv = {
1345 Instruction::Mul, Instruction::FMul, Instruction::SDiv,
1346 Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
1347 Instruction::URem, Instruction::FRem};
1348 return is_contained(Range: MulDiv, Element: getOpcode()) &&
1349 is_contained(Range: MulDiv, Element: getAltOpcode());
1350 }
1351
1352 /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
1353 bool isAddSubLikeOp() const {
1354 constexpr std::array<unsigned, 4> AddSub = {
1355 Instruction::Add, Instruction::Sub, Instruction::FAdd,
1356 Instruction::FSub};
1357 return is_contained(Range: AddSub, Element: getOpcode()) &&
1358 is_contained(Range: AddSub, Element: getAltOpcode());
1359 }
1360
1361 /// Checks if main/alt instructions are cmp operations.
1362 bool isCmpOp() const {
1363 return (getOpcode() == Instruction::ICmp ||
1364 getOpcode() == Instruction::FCmp) &&
1365 getAltOpcode() == getOpcode();
1366 }
1367
  /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
1369 bool valid() const { return MainOp && AltOp; }
1370
1371 explicit operator bool() const { return valid(); }
1372
1373 InstructionsState() = delete;
1374 InstructionsState(Instruction *MainOp, Instruction *AltOp,
1375 bool HasCopyables = false)
1376 : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
1377 static InstructionsState invalid() { return {nullptr, nullptr}; }
1378
1379 /// Checks if the value is a copyable element.
1380 bool isCopyableElement(Value *V) const {
1381 assert(valid() && "InstructionsState is invalid.");
1382 if (!HasCopyables)
1383 return false;
1384 if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
1385 return false;
1386 auto *I = dyn_cast<Instruction>(Val: V);
1387 if (!I)
1388 return !isa<PoisonValue>(Val: V);
1389 if (I->getParent() != MainOp->getParent() &&
1390 (!isVectorLikeInstWithConstOps(V: I) ||
1391 !isVectorLikeInstWithConstOps(V: MainOp)))
1392 return true;
1393 if (I->getOpcode() == MainOp->getOpcode())
1394 return false;
1395 if (!I->isBinaryOp())
1396 return true;
1397 BinOpSameOpcodeHelper Converter(MainOp);
1398 return !Converter.add(I) || !Converter.add(I: MainOp) ||
1399 Converter.hasAltOp() || !Converter.hasCandidateOpcode(Opcode: getOpcode());
1400 }
1401
1402 /// Checks if the value is non-schedulable.
1403 bool isNonSchedulable(Value *V) const {
1404 assert(valid() && "InstructionsState is invalid.");
1405 auto *I = dyn_cast<Instruction>(Val: V);
1406 if (!HasCopyables)
1407 return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
1408 doesNotNeedToBeScheduled(V);
    // MainOp for copyables is always schedulable, to correctly identify
    // non-schedulable copyables.
1411 if (getMainOp() == V)
1412 return false;
1413 if (isCopyableElement(V)) {
1414 auto IsNonSchedulableCopyableElement = [this](Value *V) {
1415 auto *I = dyn_cast<Instruction>(Val: V);
1416 return !I || isa<PHINode>(Val: I) || I->getParent() != MainOp->getParent() ||
1417 (doesNotNeedToBeScheduled(V: I) &&
1418 // If the copyable instructions comes after MainOp
1419 // (non-schedulable, but used in the block) - cannot vectorize
1420 // it, will possibly generate use before def.
1421 !MainOp->comesBefore(Other: I));
1422 };
1423
1424 return IsNonSchedulableCopyableElement(V);
1425 }
1426 return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
1427 doesNotNeedToBeScheduled(V);
1428 }
1429
1430 /// Checks if the state represents copyable instructions.
1431 bool areInstructionsWithCopyableElements() const {
1432 assert(valid() && "InstructionsState is invalid.");
1433 return HasCopyables;
1434 }
1435};
1436
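/// Converts \p I into the form expected for the given state \p S: returns the
/// matching main/alternate instruction together with the operand list to use
/// for it. For binary operations the operands may be adjusted by
/// BinOpSameOpcodeHelper; otherwise the original operands are returned as-is.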
1437std::pair<Instruction *, SmallVector<Value *>>
1438convertTo(Instruction *I, const InstructionsState &S) {
1439 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1440 assert(SelectedOp && "Cannot convert the instruction.");
1441 if (I->isBinaryOp()) {
1442 BinOpSameOpcodeHelper Converter(I);
1443 return std::make_pair(x&: SelectedOp, y: Converter.getOperand(I: SelectedOp));
1444 }
1445 return std::make_pair(x&: SelectedOp, y: SmallVector<Value *>(I->operands()));
1446}
1447
1448} // end anonymous namespace
1449
1450static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1451 const TargetLibraryInfo &TLI);
1452
1453/// Find an instruction with a specific opcode in VL.
1454/// \param VL Array of values to search through. Must contain only Instructions
1455/// and PoisonValues.
1456/// \param Opcode The instruction opcode to search for
1457/// \returns
1458/// - The first instruction found with matching opcode
1459/// - nullptr if no matching instruction is found
1460static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1461 unsigned Opcode) {
1462 for (Value *V : VL) {
1463 if (isa<PoisonValue>(Val: V))
1464 continue;
1465 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1466 auto *Inst = cast<Instruction>(Val: V);
1467 if (Inst->getOpcode() == Opcode)
1468 return Inst;
1469 }
1470 return nullptr;
1471}
1472
1473/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1474/// compatible instructions or constants, or just some other regular values.
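/// For example, operand pairs (%x, 5) and (%y, 7) are compatible because both
/// second operands are constants; pairs of instructions are considered
/// compatible if they are equal or can be treated as having the same opcode.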
1475static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1476 Value *Op1, const TargetLibraryInfo &TLI) {
1477 return (isConstant(V: BaseOp0) && isConstant(V: Op0)) ||
1478 (isConstant(V: BaseOp1) && isConstant(V: Op1)) ||
1479 (!isa<Instruction>(Val: BaseOp0) && !isa<Instruction>(Val: Op0) &&
1480 !isa<Instruction>(Val: BaseOp1) && !isa<Instruction>(Val: Op1)) ||
1481 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1482 getSameOpcode(VL: {BaseOp0, Op0}, TLI) ||
1483 getSameOpcode(VL: {BaseOp1, Op1}, TLI);
1484}
1485
1486/// \returns true if a compare instruction \p CI has similar "look" and
1487/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1488/// swapped, false otherwise.
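/// For example, `icmp sgt %a, %b` is considered the same as `icmp slt %b, %a`
/// (predicate and operands swapped), provided the corresponding operands are
/// compatible as checked by areCompatibleCmpOps.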
1489static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1490 const TargetLibraryInfo &TLI) {
1491 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1492 "Assessing comparisons of different types?");
1493 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1494 CmpInst::Predicate Pred = CI->getPredicate();
1495 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(pred: Pred);
1496
1497 Value *BaseOp0 = BaseCI->getOperand(i_nocapture: 0);
1498 Value *BaseOp1 = BaseCI->getOperand(i_nocapture: 1);
1499 Value *Op0 = CI->getOperand(i_nocapture: 0);
1500 Value *Op1 = CI->getOperand(i_nocapture: 1);
1501
1502 return (BasePred == Pred &&
1503 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1504 (BasePred == SwappedPred &&
1505 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0: Op1, Op1: Op0, TLI));
1506}
1507
1508/// \returns analysis of the Instructions in \p VL described in
1509/// InstructionsState, the Opcode that we suppose the whole list
1510/// could be vectorized even if its structure is diverse.
1511static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1512 const TargetLibraryInfo &TLI) {
1513 // Make sure these are all Instructions.
1514 if (!all_of(Range&: VL, P: IsaPred<Instruction, PoisonValue>))
1515 return InstructionsState::invalid();
1516
1517 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
1518 if (It == VL.end())
1519 return InstructionsState::invalid();
1520
1521 Instruction *MainOp = cast<Instruction>(Val: *It);
1522 unsigned InstCnt = std::count_if(first: It, last: VL.end(), pred: IsaPred<Instruction>);
1523 if ((VL.size() > 2 && !isa<PHINode>(Val: MainOp) && InstCnt < VL.size() / 2) ||
1524 (VL.size() == 2 && InstCnt < 2))
1525 return InstructionsState::invalid();
1526
1527 bool IsCastOp = isa<CastInst>(Val: MainOp);
1528 bool IsBinOp = isa<BinaryOperator>(Val: MainOp);
1529 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
1530 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
1531 : CmpInst::BAD_ICMP_PREDICATE;
1532 Instruction *AltOp = MainOp;
1533 unsigned Opcode = MainOp->getOpcode();
1534 unsigned AltOpcode = Opcode;
1535
1536 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1537 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1538 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1539 UniquePreds.insert(X: BasePred);
1540 UniqueNonSwappedPreds.insert(X: BasePred);
1541 for (Value *V : VL) {
1542 auto *I = dyn_cast<CmpInst>(Val: V);
1543 if (!I)
1544 return false;
1545 CmpInst::Predicate CurrentPred = I->getPredicate();
1546 CmpInst::Predicate SwappedCurrentPred =
1547 CmpInst::getSwappedPredicate(pred: CurrentPred);
1548 UniqueNonSwappedPreds.insert(X: CurrentPred);
1549 if (!UniquePreds.contains(key: CurrentPred) &&
1550 !UniquePreds.contains(key: SwappedCurrentPred))
1551 UniquePreds.insert(X: CurrentPred);
1552 }
1553    // If the total number of predicates is > 2, but only 2 remain once swapped
1554    // predicates are treated as compatible, consider swappable predicates as
1555    // compatible opcodes, not alternates.
1556 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1557 }();
1558 // Check for one alternate opcode from another BinaryOperator.
1559 // TODO - generalize to support all operators (types, calls etc.).
1560 Intrinsic::ID BaseID = 0;
1561 SmallVector<VFInfo> BaseMappings;
1562 if (auto *CallBase = dyn_cast<CallInst>(Val: MainOp)) {
1563 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
1564 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
1565 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
1566 return InstructionsState::invalid();
1567 }
1568 bool AnyPoison = InstCnt != VL.size();
1569 // Check MainOp too to be sure that it matches the requirements for the
1570 // instructions.
1571 for (Value *V : iterator_range(It, VL.end())) {
1572 auto *I = dyn_cast<Instruction>(Val: V);
1573 if (!I)
1574 continue;
1575
1576 // Cannot combine poison and divisions.
1577 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1578 // intrinsics/functions only.
1579 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
1580 return InstructionsState::invalid();
1581 unsigned InstOpcode = I->getOpcode();
1582 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
1583 if (BinOpHelper.add(I))
1584 continue;
1585 } else if (IsCastOp && isa<CastInst>(Val: I)) {
1586 Value *Op0 = MainOp->getOperand(i: 0);
1587 Type *Ty0 = Op0->getType();
1588 Value *Op1 = I->getOperand(i: 0);
1589 Type *Ty1 = Op1->getType();
1590 if (Ty0 == Ty1) {
1591 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1592 continue;
1593 if (Opcode == AltOpcode) {
1594 assert(isValidForAlternation(Opcode) &&
1595 isValidForAlternation(InstOpcode) &&
1596 "Cast isn't safe for alternation, logic needs to be updated!");
1597 AltOpcode = InstOpcode;
1598 AltOp = I;
1599 continue;
1600 }
1601 }
1602 } else if (auto *Inst = dyn_cast<CmpInst>(Val: I); Inst && IsCmpOp) {
1603 auto *BaseInst = cast<CmpInst>(Val: MainOp);
1604 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
1605 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
1606 if (Ty0 == Ty1) {
1607 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1608 assert(InstOpcode == AltOpcode &&
1609 "Alternate instructions are only supported by BinaryOperator "
1610 "and CastInst.");
1611 // Check for compatible operands. If the corresponding operands are not
1612 // compatible - need to perform alternate vectorization.
1613 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1614 CmpInst::Predicate SwappedCurrentPred =
1615 CmpInst::getSwappedPredicate(pred: CurrentPred);
1616
1617 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1618 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1619 continue;
1620
1621 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
1622 continue;
1623 auto *AltInst = cast<CmpInst>(Val: AltOp);
1624 if (MainOp != AltOp) {
1625 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
1626 continue;
1627 } else if (BasePred != CurrentPred) {
1628 assert(
1629 isValidForAlternation(InstOpcode) &&
1630 "CmpInst isn't safe for alternation, logic needs to be updated!");
1631 AltOp = I;
1632 continue;
1633 }
1634 CmpInst::Predicate AltPred = AltInst->getPredicate();
1635 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1636 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1637 continue;
1638 }
1639 } else if (InstOpcode == Opcode) {
1640 assert(InstOpcode == AltOpcode &&
1641 "Alternate instructions are only supported by BinaryOperator and "
1642 "CastInst.");
1643 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
1644 if (Gep->getNumOperands() != 2 ||
1645 Gep->getOperand(i_nocapture: 0)->getType() != MainOp->getOperand(i: 0)->getType())
1646 return InstructionsState::invalid();
1647 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
1648 if (!isVectorLikeInstWithConstOps(V: EI))
1649 return InstructionsState::invalid();
1650 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
1651 auto *BaseLI = cast<LoadInst>(Val: MainOp);
1652 if (!LI->isSimple() || !BaseLI->isSimple())
1653 return InstructionsState::invalid();
1654 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
1655 auto *CallBase = cast<CallInst>(Val: MainOp);
1656 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1657 return InstructionsState::invalid();
1658 if (Call->hasOperandBundles() &&
1659 (!CallBase->hasOperandBundles() ||
1660 !std::equal(first1: Call->op_begin() + Call->getBundleOperandsStartIndex(),
1661 last1: Call->op_begin() + Call->getBundleOperandsEndIndex(),
1662 first2: CallBase->op_begin() +
1663 CallBase->getBundleOperandsStartIndex())))
1664 return InstructionsState::invalid();
1665 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
1666 if (ID != BaseID)
1667 return InstructionsState::invalid();
1668 if (!ID) {
1669 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
1670 if (Mappings.size() != BaseMappings.size() ||
1671 Mappings.front().ISA != BaseMappings.front().ISA ||
1672 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1673 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1674 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1675 Mappings.front().Shape.Parameters !=
1676 BaseMappings.front().Shape.Parameters)
1677 return InstructionsState::invalid();
1678 }
1679 }
1680 continue;
1681 }
1682 return InstructionsState::invalid();
1683 }
1684
1685 if (IsBinOp) {
1686 MainOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getMainOpcode());
1687 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1688 AltOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getAltOpcode());
1689    assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1690 }
1691 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1692 "Incorrect implementation of allSameOpcode.");
1693 InstructionsState S(MainOp, AltOp);
1694 assert(all_of(VL,
1695 [&](Value *V) {
1696 return isa<PoisonValue>(V) ||
1697 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1698 }) &&
1699 "Invalid InstructionsState.");
1700 return S;
1701}
1702
1703/// \returns true if all of the values in \p VL have the same type or false
1704/// otherwise.
1705static bool allSameType(ArrayRef<Value *> VL) {
1706 Type *Ty = VL.consume_front()->getType();
1707 return all_of(Range&: VL, P: [&](Value *V) { return V->getType() == Ty; });
1708}
1709
1710/// \returns True if in-tree use also needs extract. This refers to
1711/// possible scalar operand in vectorized instruction.
1712static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1713 TargetLibraryInfo *TLI,
1714 const TargetTransformInfo *TTI) {
1715 if (!UserInst)
1716 return false;
1717 unsigned Opcode = UserInst->getOpcode();
1718 switch (Opcode) {
1719 case Instruction::Load: {
1720 LoadInst *LI = cast<LoadInst>(Val: UserInst);
1721 return (LI->getPointerOperand() == Scalar);
1722 }
1723 case Instruction::Store: {
1724 StoreInst *SI = cast<StoreInst>(Val: UserInst);
1725 return (SI->getPointerOperand() == Scalar);
1726 }
1727 case Instruction::Call: {
1728 CallInst *CI = cast<CallInst>(Val: UserInst);
1729 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1730 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
1731 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1732 Arg.value().get() == Scalar;
1733 });
1734 }
1735 default:
1736 return false;
1737 }
1738}
1739
1740/// \returns the AA location that is being accessed by the instruction.
1741static MemoryLocation getLocation(Instruction *I) {
1742 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1743 return MemoryLocation::get(SI);
1744 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1745 return MemoryLocation::get(LI);
1746 return MemoryLocation();
1747}
1748
1749/// \returns True if the instruction is not a volatile or atomic load/store.
1750static bool isSimple(Instruction *I) {
1751 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1752 return LI->isSimple();
1753 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1754 return SI->isSimple();
1755 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
1756 return !MI->isVolatile();
1757 return true;
1758}
1759
1760/// Shuffles \p Mask in accordance with the given \p SubMask.
1761/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1762/// one but two input vectors.
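/// For example, with ExtendingManyInputs == false, applying
/// SubMask = {3, 2, 1, 0} to Mask = {1, 0, 3, 2} produces {2, 3, 0, 1},
/// since each resulting element is Mask[SubMask[I]].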
1763static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1764 bool ExtendingManyInputs = false) {
1765 if (SubMask.empty())
1766 return;
1767 assert(
1768 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1769 // Check if input scalars were extended to match the size of other node.
1770 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1771 "SubMask with many inputs support must be larger than the mask.");
1772 if (Mask.empty()) {
1773 Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
1774 return;
1775 }
1776 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1777 int TermValue = std::min(a: Mask.size(), b: SubMask.size());
1778 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1779 if (SubMask[I] == PoisonMaskElem ||
1780 (!ExtendingManyInputs &&
1781 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1782 continue;
1783 NewMask[I] = Mask[SubMask[I]];
1784 }
1785 Mask.swap(RHS&: NewMask);
1786}
1787
1788/// Order may have elements assigned special value (size) which is out of
1789/// bounds. Such indices only appear on places which correspond to undef values
1790/// (see canReuseExtract for details) and are used in order to avoid undef
1791/// values having an effect on operand ordering.
1792/// The first loop below simply finds all unused indices and then the next loop
1793/// nest assigns these indices for undef values positions.
1794/// As an example below Order has two undef positions and they have assigned
1795/// values 3 and 7 respectively:
1796/// before: 6 9 5 4 9 2 1 0
1797/// after: 6 3 5 4 7 2 1 0
1798static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1799 const size_t Sz = Order.size();
1800 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1801 SmallBitVector MaskedIndices(Sz);
1802 for (unsigned I = 0; I < Sz; ++I) {
1803 if (Order[I] < Sz)
1804 UnusedIndices.reset(Idx: Order[I]);
1805 else
1806 MaskedIndices.set(I);
1807 }
1808 if (MaskedIndices.none())
1809 return;
1810 assert(UnusedIndices.count() == MaskedIndices.count() &&
1811 "Non-synced masked/available indices.");
1812 int Idx = UnusedIndices.find_first();
1813 int MIdx = MaskedIndices.find_first();
1814 while (MIdx >= 0) {
1815 assert(Idx >= 0 && "Indices must be synced.");
1816 Order[MIdx] = Idx;
1817 Idx = UnusedIndices.find_next(Prev: Idx);
1818 MIdx = MaskedIndices.find_next(Prev: MIdx);
1819 }
1820}
1821
1822/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1823/// Opcode1.
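/// For example, for VL = {add, sub, add, sub} with Opcode0 = Add,
/// Opcode1 = Sub and a scalar (single-element) type, the resulting bitset is
/// {0, 1, 0, 1}.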
1824static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1825 unsigned Opcode0, unsigned Opcode1) {
1826 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
1827 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1828 for (unsigned Lane : seq<unsigned>(Size: VL.size())) {
1829 if (isa<PoisonValue>(Val: VL[Lane]))
1830 continue;
1831 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1832 OpcodeMask.set(I: Lane * ScalarTyNumElements,
1833 E: Lane * ScalarTyNumElements + ScalarTyNumElements);
1834 }
1835 return OpcodeMask;
1836}
1837
1838/// Replicates the given \p Val \p VF times.
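/// For example, replicating {C0, C1} with VF = 3 produces
/// {C0, C0, C0, C1, C1, C1}.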
1839static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val,
1840 unsigned VF) {
1841 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1842 "Expected scalar constants.");
1843 SmallVector<Constant *> NewVal(Val.size() * VF);
1844 for (auto [I, V] : enumerate(First&: Val))
1845 std::fill_n(first: NewVal.begin() + I * VF, n: VF, value: V);
1846 return NewVal;
1847}
1848
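/// Builds the inverse permutation of \p Indices into \p Mask, i.e.
/// Mask[Indices[I]] = I. For example, Indices = {2, 0, 1} produces
/// Mask = {1, 2, 0}; positions not covered by \p Indices remain
/// PoisonMaskElem.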
1849static void inversePermutation(ArrayRef<unsigned> Indices,
1850 SmallVectorImpl<int> &Mask) {
1851 Mask.clear();
1852 const unsigned E = Indices.size();
1853 Mask.resize(N: E, NV: PoisonMaskElem);
1854 for (unsigned I = 0; I < E; ++I)
1855 Mask[Indices[I]] = I;
1856}
1857
1858/// Reorders the list of scalars in accordance with the given \p Mask.
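/// For example, Scalars = {a, b, c, d} with Mask = {2, 0, 3, 1} results in
/// {b, d, a, c}, since every element Prev[I] is moved to position Mask[I].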
1859static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1860 ArrayRef<int> Mask) {
1861 assert(!Mask.empty() && "Expected non-empty mask.");
1862 SmallVector<Value *> Prev(Scalars.size(),
1863 PoisonValue::get(T: Scalars.front()->getType()));
1864 Prev.swap(RHS&: Scalars);
1865 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1866 if (Mask[I] != PoisonMaskElem)
1867 Scalars[Mask[I]] = Prev[I];
1868}
1869
1870/// Checks if the provided value does not require scheduling. It does not
1871/// require scheduling if this is not an instruction or it is an instruction
1872/// that does not read/write memory and each operand is either not an
1873/// instruction, a phi node, or an instruction from a different block.
1874static bool areAllOperandsNonInsts(Value *V) {
1875 auto *I = dyn_cast<Instruction>(Val: V);
1876 if (!I)
1877 return true;
1878 return !mayHaveNonDefUseDependency(I: *I) &&
1879 all_of(Range: I->operands(), P: [I](Value *V) {
1880 auto *IO = dyn_cast<Instruction>(Val: V);
1881 if (!IO)
1882 return true;
1883 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1884 });
1885}
1886
1887/// Checks if the provided value does not require scheduling. It does not
1888/// require scheduling if this is not an instruction or it is an instruction
1889/// that does not read/write memory and all users are phi nodes or instructions
1890/// from different blocks.
1891static bool isUsedOutsideBlock(Value *V) {
1892 auto *I = dyn_cast<Instruction>(Val: V);
1893 if (!I)
1894 return true;
1895 // Limits the number of uses to save compile time.
1896 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1897 all_of(Range: I->users(), P: [I](User *U) {
1898 auto *IU = dyn_cast<Instruction>(Val: U);
1899 if (!IU)
1900 return true;
1901 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1902 });
1903}
1904
1905/// Checks if the specified value does not require scheduling. It does not
1906/// require scheduling if all operands and all users do not need to be scheduled
1907/// in the current basic block.
1908static bool doesNotNeedToBeScheduled(Value *V) {
1909 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1910}
1911
1912/// Checks if the specified array of instructions does not require scheduling.
1913/// This is the case if either all instructions have operands that do not
1914/// require scheduling, or all their users do not require scheduling because
1915/// they are phis or live in other basic blocks.
1916static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1917 return !VL.empty() &&
1918 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1919}
1920
1921/// Returns true if widened type of \p Ty elements with size \p Sz represents
1922/// full vector type, i.e. adding extra element results in extra parts upon type
1923/// legalization.
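/// For example, on a target with 128-bit vector registers and i32 elements,
/// Sz = 12 qualifies (three full 4-element parts), while Sz = 6 does not
/// (6 elements split into 2 parts gives 3 elements per part, not a power of 2).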
1924static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1925 unsigned Sz) {
1926 if (Sz <= 1)
1927 return false;
1928 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Val: Ty))
1929 return false;
1930 if (has_single_bit(Value: Sz))
1931 return true;
1932 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
1933 return NumParts > 0 && NumParts < Sz && has_single_bit(Value: Sz / NumParts) &&
1934 Sz % NumParts == 0;
1935}
1936
1937/// Returns number of parts, the type \p VecTy will be split at the codegen
1938/// phase. If the type is going to be scalarized or does not use whole
1939/// registers, returns 1.
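/// For example, on a target with 128-bit vector registers, <8 x i32> is split
/// into 2 parts, while <3 x i32> (which does not use whole registers) yields 1.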
1940static unsigned
1941getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1942 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1943 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
1944 if (NumParts == 0 || NumParts >= Limit)
1945 return 1;
1946 unsigned Sz = getNumElements(Ty: VecTy);
1947 if (NumParts >= Sz || Sz % NumParts != 0 ||
1948 !hasFullVectorsOrPowerOf2(TTI, Ty: VecTy->getElementType(), Sz: Sz / NumParts))
1949 return 1;
1950 return NumParts;
1951}
1952
1953/// Bottom Up SLP Vectorizer.
1954class slpvectorizer::BoUpSLP {
1955 class TreeEntry;
1956 class ScheduleEntity;
1957 class ScheduleData;
1958 class ScheduleCopyableData;
1959 class ScheduleBundle;
1960 class ShuffleCostEstimator;
1961 class ShuffleInstructionBuilder;
1962
1963 /// If we decide to generate strided load / store, this struct contains all
1964  /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
1965  /// and analyzeConstantStrideCandidate. Note that the stride can be given either
1966  /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
1967  /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
1968  /// the element size of the FixedVectorType.
1969 struct StridedPtrInfo {
1970 Value *StrideVal = nullptr;
1971 const SCEV *StrideSCEV = nullptr;
1972 FixedVectorType *Ty = nullptr;
1973 };
1974 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1975
1976public:
1977 /// Tracks the state we can represent the loads in the given sequence.
1978 enum class LoadsState {
1979 Gather,
1980 Vectorize,
1981 ScatterVectorize,
1982 StridedVectorize,
1983 CompressVectorize
1984 };
1985
1986 using ValueList = SmallVector<Value *, 8>;
1987 using InstrList = SmallVector<Instruction *, 16>;
1988 using ValueSet = SmallPtrSet<Value *, 16>;
1989 using StoreList = SmallVector<StoreInst *, 8>;
1990 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
1991 using OrdersType = SmallVector<unsigned, 4>;
1992
1993 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
1994 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1995 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
1996 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
1997 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1998 AC(AC), DB(DB), DL(DL), ORE(ORE),
1999 Builder(Se->getContext(), TargetFolder(*DL)) {
2000 CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
2001 // Use the vector register size specified by the target unless overridden
2002 // by a command-line option.
2003 // TODO: It would be better to limit the vectorization factor based on
2004 // data type rather than just register size. For example, x86 AVX has
2005 // 256-bit registers, but it does not support integer operations
2006 // at that width (that requires AVX2).
2007 if (MaxVectorRegSizeOption.getNumOccurrences())
2008 MaxVecRegSize = MaxVectorRegSizeOption;
2009 else
2010 MaxVecRegSize =
2011 TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
2012 .getFixedValue();
2013
2014 if (MinVectorRegSizeOption.getNumOccurrences())
2015 MinVecRegSize = MinVectorRegSizeOption;
2016 else
2017 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
2018 }
2019
2020 /// Vectorize the tree that starts with the elements in \p VL.
2021 /// Returns the vectorized root.
2022 Value *vectorizeTree();
2023
2024 /// Vectorize the tree but with the list of externally used values \p
2025  /// ExternallyUsedValues. Values in this set can be replaced by the
2026  /// generated extractvalue instructions.
2027 Value *vectorizeTree(
2028 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2029 Instruction *ReductionRoot = nullptr,
2030 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales = {});
2031
2032 /// \returns the cost incurred by unwanted spills and fills, caused by
2033 /// holding live values over call sites.
2034 InstructionCost getSpillCost();
2035
2036 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2037 /// final cost.
2038 InstructionCost
2039 calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {});
2040
2041 /// \returns the vectorization cost of the subtree that starts at \p VL.
2042 /// A negative number means that this is profitable.
2043 InstructionCost getTreeCost(InstructionCost TreeCost,
2044 ArrayRef<Value *> VectorizedVals = {},
2045 InstructionCost ReductionCost = TTI::TCC_Free);
2046
2047 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2048 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2049 void buildTree(ArrayRef<Value *> Roots,
2050 const SmallDenseSet<Value *> &UserIgnoreLst);
2051
2052 /// Construct a vectorizable tree that starts at \p Roots.
2053 void buildTree(ArrayRef<Value *> Roots);
2054
2055 /// Return the scalars of the root node.
2056 ArrayRef<Value *> getRootNodeScalars() const {
2057 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2058 return VectorizableTree.front()->Scalars;
2059 }
2060
2061 /// Returns the type/is-signed info for the root node in the graph without
2062 /// casting.
2063 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2064 const TreeEntry &Root = *VectorizableTree.front();
2065 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2066 !Root.Scalars.front()->getType()->isIntegerTy())
2067 return std::nullopt;
2068 auto It = MinBWs.find(Val: &Root);
2069 if (It != MinBWs.end())
2070 return std::make_pair(x: IntegerType::get(C&: Root.Scalars.front()->getContext(),
2071 NumBits: It->second.first),
2072 y: It->second.second);
2073 if (Root.getOpcode() == Instruction::ZExt ||
2074 Root.getOpcode() == Instruction::SExt)
2075 return std::make_pair(x: cast<CastInst>(Val: Root.getMainOp())->getSrcTy(),
2076 y: Root.getOpcode() == Instruction::SExt);
2077 return std::nullopt;
2078 }
2079
2080 /// Checks if the root graph node can be emitted with narrower bitwidth at
2081 /// codegen and returns it signedness, if so.
2082 bool isSignedMinBitwidthRootNode() const {
2083 return MinBWs.at(Val: VectorizableTree.front().get()).second;
2084 }
2085
2086 /// Returns reduction type after minbitdth analysis.
2087 FixedVectorType *getReductionType() const {
2088 if (ReductionBitWidth == 0 ||
2089 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2090 ReductionBitWidth >=
2091 DL->getTypeSizeInBits(
2092 Ty: VectorizableTree.front()->Scalars.front()->getType()))
2093 return getWidenedType(
2094 ScalarTy: VectorizableTree.front()->Scalars.front()->getType(),
2095 VF: VectorizableTree.front()->getVectorFactor());
2096 return getWidenedType(
2097 ScalarTy: IntegerType::get(
2098 C&: VectorizableTree.front()->Scalars.front()->getContext(),
2099 NumBits: ReductionBitWidth),
2100 VF: VectorizableTree.front()->getVectorFactor());
2101 }
2102
2103  /// Returns true if the root node is a reduced bitcast (or bitcast + bswap) node.
2104 bool isReducedBitcastRoot() const {
2105 return VectorizableTree.front()->hasState() &&
2106 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2107 VectorizableTree.front()->CombinedOp ==
2108 TreeEntry::ReducedBitcastBSwap) &&
2109 VectorizableTree.front()->State == TreeEntry::Vectorize;
2110 }
2111
2112 /// Builds external uses of the vectorized scalars, i.e. the list of
2113 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2114 /// ExternallyUsedValues contains additional list of external uses to handle
2115 /// vectorization of reductions.
2116 void
2117 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2118
2119 /// Transforms graph nodes to target specific representations, if profitable.
2120 void transformNodes();
2121
2122 /// Clear the internal data structures that are created by 'buildTree'.
2123 void deleteTree() {
2124 VectorizableTree.clear();
2125 ScalarToTreeEntries.clear();
2126 DeletedNodes.clear();
2127 TransformedToGatherNodes.clear();
2128 OperandsToTreeEntry.clear();
2129 ScalarsInSplitNodes.clear();
2130 MustGather.clear();
2131 NonScheduledFirst.clear();
2132 EntryToLastInstruction.clear();
2133 LastInstructionToPos.clear();
2134 LoadEntriesToVectorize.clear();
2135 IsGraphTransformMode = false;
2136 GatheredLoadsEntriesFirst.reset();
2137 CompressEntryToData.clear();
2138 ExternalUses.clear();
2139 ExternalUsesAsOriginalScalar.clear();
2140 ExternalUsesWithNonUsers.clear();
2141 for (auto &Iter : BlocksSchedules) {
2142 BlockScheduling *BS = Iter.second.get();
2143 BS->clear();
2144 }
2145 MinBWs.clear();
2146 ReductionBitWidth = 0;
2147 BaseGraphSize = 1;
2148 CastMaxMinBWSizes.reset();
2149 ExtraBitWidthNodes.clear();
2150 InstrElementSize.clear();
2151 UserIgnoreList = nullptr;
2152 PostponedGathers.clear();
2153 ValueToGatherNodes.clear();
2154 TreeEntryToStridedPtrInfoMap.clear();
2155 }
2156
2157 unsigned getTreeSize() const { return VectorizableTree.size(); }
2158
2159 /// Returns the base graph size, before any transformations.
2160 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2161
2162 /// Perform LICM and CSE on the newly generated gather sequences.
2163 void optimizeGatherSequence();
2164
2165 /// Does this non-empty order represent an identity order? Identity
2166 /// should be represented as an empty order, so this is used to
2167 /// decide if we can canonicalize a computed order. Undef elements
2168 /// (represented as size) are ignored.
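  /// For example, {0, 1, Sz, 3} is treated as an identity order, since the
  /// out-of-bounds element is ignored.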
2169 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2170 assert(!Order.empty() && "expected non-empty order");
2171 const unsigned Sz = Order.size();
2172 return all_of(Range: enumerate(First&: Order), P: [&](const auto &P) {
2173 return P.value() == P.index() || P.value() == Sz;
2174 });
2175 }
2176
2177 /// Checks if the specified gather tree entry \p TE can be represented as a
2178 /// shuffled vector entry + (possibly) permutation with other gathers. It
2179 /// implements the checks only for possibly ordered scalars (Loads,
2180 /// ExtractElement, ExtractValue), which can be part of the graph.
2181 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2182 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2183 /// node might be ignored.
2184 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2185 bool TopToBottom,
2186 bool IgnoreReorder);
2187
2188 /// Sort loads into increasing pointers offsets to allow greater clustering.
2189 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2190
2191 /// Gets reordering data for the given tree entry. If the entry is vectorized
2192 /// - just return ReorderIndices, otherwise check if the scalars can be
2193 /// reordered and return the most optimal order.
2194 /// \return std::nullopt if ordering is not important, empty order, if
2195 /// identity order is important, or the actual order.
2196 /// \param TopToBottom If true, include the order of vectorized stores and
2197 /// insertelement nodes, otherwise skip them.
2198 /// \param IgnoreReorder true, if the root node order can be ignored.
2199 std::optional<OrdersType>
2200 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2201
2202 /// Checks if it is profitable to reorder the current tree.
2203 /// If the tree does not contain many profitable reordable nodes, better to
2204 /// skip it to save compile time.
2205 bool isProfitableToReorder() const;
2206
2207 /// Reorders the current graph to the most profitable order starting from the
2208 /// root node to the leaf nodes. The best order is chosen only from the nodes
2209 /// of the same size (vectorization factor). Smaller nodes are considered
2210 /// parts of subgraph with smaller VF and they are reordered independently. We
2211 /// can make it because we still need to extend smaller nodes to the wider VF
2212 /// and we can merge reordering shuffles with the widening shuffles.
2213 void reorderTopToBottom();
2214
2215 /// Reorders the current graph to the most profitable order starting from
2216 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2217 /// number of reshuffles if the leaf nodes use the same order. In this case we
2218 /// can merge the orders and just shuffle user node instead of shuffling its
2219 /// operands. Plus, even the leaf nodes have different orders, it allows to
2220 /// sink reordering in the graph closer to the root node and merge it later
2221 /// during analysis.
2222 void reorderBottomToTop(bool IgnoreReorder = false);
2223
2224 /// \return The vector element size in bits to use when vectorizing the
2225 /// expression tree ending at \p V. If V is a store, the size is the width of
2226 /// the stored value. Otherwise, the size is the width of the largest loaded
2227 /// value reaching V. This method is used by the vectorizer to calculate
2228 /// vectorization factors.
2229 unsigned getVectorElementSize(Value *V);
2230
2231 /// Compute the minimum type sizes required to represent the entries in a
2232 /// vectorizable tree.
2233 void computeMinimumValueSizes();
2234
2235 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
2236 unsigned getMaxVecRegSize() const {
2237 return MaxVecRegSize;
2238 }
2239
2240 // \returns minimum vector register size as set by cl::opt.
2241 unsigned getMinVecRegSize() const {
2242 return MinVecRegSize;
2243 }
2244
2245 unsigned getMinVF(unsigned Sz) const {
2246 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
2247 }
2248
2249 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2250 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2251 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2252 return MaxVF ? MaxVF : UINT_MAX;
2253 }
2254
2255 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2256 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2257 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2258 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2259 ///
2260 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2261 unsigned canMapToVector(Type *T) const;
2262
2263 /// \returns True if the VectorizableTree is both tiny and not fully
2264 /// vectorizable. We do not vectorize such trees.
2265 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2266
2267 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2268 /// It may happen, if all gather nodes are loads and they cannot be
2269 /// "clusterized". In this case even subgraphs cannot be vectorized more
2270 /// effectively than the base graph.
2271 bool isTreeNotExtendable() const;
2272
2273 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
2274 /// can be load combined in the backend. Load combining may not be allowed in
2275 /// the IR optimizer, so we do not want to alter the pattern. For example,
2276 /// partially transforming a scalar bswap() pattern into vector code is
2277 /// effectively impossible for the backend to undo.
2278 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2279 /// may not be necessary.
2280 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
2281
2282 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
2283 /// can be load combined in the backend. Load combining may not be allowed in
2284 /// the IR optimizer, so we do not want to alter the pattern. For example,
2285 /// partially transforming a scalar bswap() pattern into vector code is
2286 /// effectively impossible for the backend to undo.
2287 /// TODO: If load combining is allowed in the IR optimizer, this analysis
2288 /// may not be necessary.
2289 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
2290 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2291 Align Alignment, const int64_t Diff,
2292 const size_t Sz) const;
2293
2294 /// Return true if an array of scalar loads can be replaced with a strided
2295 /// load (with constant stride).
2296 ///
2297 /// It is possible that the load gets "widened". Suppose that originally each
2298 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2299  /// constant):
  /// %b + 0 * %s + 0
  /// %b + 0 * %s + 1
  /// %b + 0 * %s + 2
2300 /// ...
2301 /// %b + 0 * %s + (w - 1)
2302 ///
2303 /// %b + 1 * %s + 0
2304 /// %b + 1 * %s + 1
2305 /// %b + 1 * %s + 2
2306 /// ...
2307 /// %b + 1 * %s + (w - 1)
2308 /// ...
2309 ///
2310 /// %b + (n - 1) * %s + 0
2311 /// %b + (n - 1) * %s + 1
2312 /// %b + (n - 1) * %s + 2
2313 /// ...
2314 /// %b + (n - 1) * %s + (w - 1)
2315 ///
2316 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2317 ///
2318 /// \param PointerOps list of pointer arguments of loads.
2319 /// \param ElemTy original scalar type of loads.
2320 /// \param Alignment alignment of the first load.
2321 /// \param SortedIndices is the order of PointerOps as returned by
2322 /// `sortPtrAccesses`
2323  /// \param Diff Pointer difference between the lowest and the highest pointer
2324  /// in `PointerOps` as returned by `getPointersDiff`.
2325  /// \param Ptr0 first pointer in `PointerOps`.
2326  /// \param PtrN last pointer in `PointerOps`.
2327 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2328 /// of `SPtrInfo` necessary to generate the strided load later.
2329 bool analyzeConstantStrideCandidate(
2330 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2331 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2332 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2333
2334 /// Return true if an array of scalar loads can be replaced with a strided
2335 /// load (with run-time stride).
2336 /// \param PointerOps list of pointer arguments of loads.
2337 /// \param ScalarTy type of loads.
2338  /// \param CommonAlignment common alignment of loads as computed by
2339  /// `computeCommonAlignment<LoadInst>`.
2340  /// \param SortedIndices is a list of indices computed by this function such
2341  /// that the sequence `PointerOps[SortedIndices[0]],
2342  /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
2343  /// ordered by the coefficient of the stride. For example, if PointerOps is
2344  /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
2345  /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` has to be
2346  /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
2347 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2348 /// of `SPtrInfo` necessary to generate the strided load later.
2349 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2350 Align CommonAlignment,
2351 SmallVectorImpl<unsigned> &SortedIndices,
2352 StridedPtrInfo &SPtrInfo) const;
2353
2354 /// Checks if the given array of loads can be represented as a vectorized,
2355 /// scatter or just simple gather.
2356 /// \param VL list of loads.
2357 /// \param VL0 main load value.
2358 /// \param Order returned order of load instructions.
2359 /// \param PointerOps returned list of pointer operands.
2360 /// \param BestVF return best vector factor, if recursive check found better
2361 /// vectorization sequences rather than masked gather.
2362 /// \param TryRecursiveCheck used to check if long masked gather can be
2363  /// represented as a series of loads/insert subvector, if profitable.
2364 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
2365 SmallVectorImpl<unsigned> &Order,
2366 SmallVectorImpl<Value *> &PointerOps,
2367 StridedPtrInfo &SPtrInfo,
2368 unsigned *BestVF = nullptr,
2369 bool TryRecursiveCheck = true) const;
2370
2371 /// Registers non-vectorizable sequence of loads
2372 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
2373 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
2374 }
2375
2376 /// Checks if the given loads sequence is known as not vectorizable
2377 template <typename T>
2378 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
2379 return ListOfKnonwnNonVectorizableLoads.contains(V: hash_value(VL));
2380 }
2381
2382 OptimizationRemarkEmitter *getORE() { return ORE; }
2383
2384 /// This structure holds any data we need about the edges being traversed
2385 /// during buildTreeRec(). We keep track of:
2386 /// (i) the user TreeEntry index, and
2387 /// (ii) the index of the edge.
2388 struct EdgeInfo {
2389 EdgeInfo() = default;
2390 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
2391 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
2392 /// The user TreeEntry.
2393 TreeEntry *UserTE = nullptr;
2394 /// The operand index of the use.
2395 unsigned EdgeIdx = UINT_MAX;
2396#ifndef NDEBUG
2397 friend inline raw_ostream &operator<<(raw_ostream &OS,
2398 const BoUpSLP::EdgeInfo &EI) {
2399 EI.dump(OS);
2400 return OS;
2401 }
2402 /// Debug print.
2403 void dump(raw_ostream &OS) const {
2404 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
2405 << " EdgeIdx:" << EdgeIdx << "}";
2406 }
2407 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
2408#endif
2409 bool operator == (const EdgeInfo &Other) const {
2410 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
2411 }
2412
2413 operator bool() const { return UserTE != nullptr; }
2414 };
2415 friend struct DenseMapInfo<EdgeInfo>;
2416
2417 /// A helper class used for scoring candidates for two consecutive lanes.
2418 class LookAheadHeuristics {
2419 const TargetLibraryInfo &TLI;
2420 const DataLayout &DL;
2421 ScalarEvolution &SE;
2422 const BoUpSLP &R;
2423 int NumLanes; // Total number of lanes (aka vectorization factor).
2424 int MaxLevel; // The maximum recursion depth for accumulating score.
2425
2426 public:
2427 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
2428 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
2429 int MaxLevel)
2430 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
2431 MaxLevel(MaxLevel) {}
2432
2433 // The hard-coded scores listed here are not very important, though it shall
2434 // be higher for better matches to improve the resulting cost. When
2435 // computing the scores of matching one sub-tree with another, we are
2436 // basically counting the number of values that are matching. So even if all
2437 // scores are set to 1, we would still get a decent matching result.
2438 // However, sometimes we have to break ties. For example we may have to
2439 // choose between matching loads vs matching opcodes. This is what these
2440 // scores are helping us with: they provide the order of preference. Also,
2441 // this is important if the scalar is externally used or used in another
2442 // tree entry node in the different lane.
2443
2444 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
2445 static const int ScoreConsecutiveLoads = 4;
2446 /// The same load multiple times. This should have a better score than
2447    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
2448    /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5 for
2449    /// a vector load plus 1.0 for a broadcast.
2450 static const int ScoreSplatLoads = 3;
2451 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
2452 static const int ScoreReversedLoads = 3;
2453 /// A load candidate for masked gather.
2454 static const int ScoreMaskedGatherCandidate = 1;
2455 /// ExtractElementInst from same vector and consecutive indexes.
2456 static const int ScoreConsecutiveExtracts = 4;
2457 /// ExtractElementInst from same vector and reversed indices.
2458 static const int ScoreReversedExtracts = 3;
2459 /// Constants.
2460 static const int ScoreConstants = 2;
2461 /// Instructions with the same opcode.
2462 static const int ScoreSameOpcode = 2;
2463 /// Instructions with alt opcodes (e.g, add + sub).
2464 static const int ScoreAltOpcodes = 1;
2465 /// Identical instructions (a.k.a. splat or broadcast).
2466 static const int ScoreSplat = 1;
2467 /// Matching with an undef is preferable to failing.
2468 static const int ScoreUndef = 1;
2469 /// Score for failing to find a decent match.
2470 static const int ScoreFail = 0;
2471 /// Score if all users are vectorized.
2472 static const int ScoreAllUserVectorized = 1;
2473
2474 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
2475 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
2476 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
2477 /// MainAltOps.
2478 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
2479 ArrayRef<Value *> MainAltOps) const {
2480 if (!isValidElementType(Ty: V1->getType()) ||
2481 !isValidElementType(Ty: V2->getType()))
2482 return LookAheadHeuristics::ScoreFail;
2483
2484 if (V1 == V2) {
2485 if (isa<LoadInst>(Val: V1)) {
2486        // Returns true if the users of V1 and V2 won't need to be extracted.
2487 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
2488 // Bail out if we have too many uses to save compilation time.
2489 if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
2490 return false;
2491
2492 auto AllUsersVectorized = [U1, U2, this](Value *V) {
2493 return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
2494 return U == U1 || U == U2 || R.isVectorized(V: U);
2495 });
2496 };
2497 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
2498 };
2499 // A broadcast of a load can be cheaper on some targets.
2500 if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
2501 NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
2502 ((int)V1->getNumUses() == NumLanes ||
2503 AllUsersAreInternal(V1, V2)))
2504 return LookAheadHeuristics::ScoreSplatLoads;
2505 }
2506 return LookAheadHeuristics::ScoreSplat;
2507 }
2508
2509 auto CheckSameEntryOrFail = [&]() {
2510 if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V: V1); !TEs1.empty()) {
2511 SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1);
2512 if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V: V2);
2513 !TEs2.empty() &&
2514 any_of(Range&: TEs2, P: [&](TreeEntry *E) { return Set.contains(Ptr: E); }))
2515 return LookAheadHeuristics::ScoreSplatLoads;
2516 }
2517 return LookAheadHeuristics::ScoreFail;
2518 };
2519
2520 auto *LI1 = dyn_cast<LoadInst>(Val: V1);
2521 auto *LI2 = dyn_cast<LoadInst>(Val: V2);
2522 if (LI1 && LI2) {
2523 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
2524 !LI2->isSimple())
2525 return CheckSameEntryOrFail();
2526
2527 std::optional<int64_t> Dist = getPointersDiff(
2528 ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
2529 PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
2530 if (!Dist || *Dist == 0) {
2531 if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
2532 getUnderlyingObject(V: LI2->getPointerOperand()) &&
2533 R.TTI->isLegalMaskedGather(
2534 DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
2535 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2536 return CheckSameEntryOrFail();
2537 }
2538 // The distance is too large - still may be profitable to use masked
2539 // loads/gathers.
2540 if (std::abs(i: *Dist) > NumLanes / 2)
2541 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
2542 // This still will detect consecutive loads, but we might have "holes"
2543 // in some cases. It is ok for non-power-2 vectorization and may produce
2544 // better results. It should not affect current vectorization.
2545 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
2546 : LookAheadHeuristics::ScoreReversedLoads;
2547 }
2548
2549 auto *C1 = dyn_cast<Constant>(Val: V1);
2550 auto *C2 = dyn_cast<Constant>(Val: V2);
2551 if (C1 && C2)
2552 return LookAheadHeuristics::ScoreConstants;
2553
2554 // Consider constants and buildvector compatible.
2555 if ((C1 && isa<InsertElementInst>(Val: V2)) ||
2556 (C2 && isa<InsertElementInst>(Val: V1)))
2557 return LookAheadHeuristics::ScoreConstants;
2558
2559 // Extracts from consecutive indexes of the same vector better score as
2560 // the extracts could be optimized away.
2561 Value *EV1;
2562 ConstantInt *Ex1Idx;
2563 if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
2564 // Undefs are always profitable for extractelements.
2565 // Compiler can easily combine poison and extractelement <non-poison> or
2566 // undef and extractelement <poison>. But combining undef +
2567 // extractelement <non-poison-but-may-produce-poison> requires some
2568 // extra operations.
2569 if (isa<UndefValue>(Val: V2))
2570 return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
2571 ? LookAheadHeuristics::ScoreConsecutiveExtracts
2572 : LookAheadHeuristics::ScoreSameOpcode;
2573 Value *EV2 = nullptr;
2574 ConstantInt *Ex2Idx = nullptr;
2575 if (match(V: V2,
2576 P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
2577 R: m_Undef())))) {
2578 // Undefs are always profitable for extractelements.
2579 if (!Ex2Idx)
2580 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2581 if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
2582 return LookAheadHeuristics::ScoreConsecutiveExtracts;
2583 if (EV2 == EV1) {
2584 int Idx1 = Ex1Idx->getZExtValue();
2585 int Idx2 = Ex2Idx->getZExtValue();
2586 int Dist = Idx2 - Idx1;
2587 // The distance is too large - still may be profitable to use
2588 // shuffles.
2589 if (std::abs(x: Dist) == 0)
2590 return LookAheadHeuristics::ScoreSplat;
2591 if (std::abs(x: Dist) > NumLanes / 2)
2592 return LookAheadHeuristics::ScoreSameOpcode;
2593 return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
2594 : LookAheadHeuristics::ScoreReversedExtracts;
2595 }
2596 return LookAheadHeuristics::ScoreAltOpcodes;
2597 }
2598 return CheckSameEntryOrFail();
2599 }
2600
2601 auto *I1 = dyn_cast<Instruction>(Val: V1);
2602 auto *I2 = dyn_cast<Instruction>(Val: V2);
2603 if (I1 && I2) {
2604 if (I1->getParent() != I2->getParent())
2605 return CheckSameEntryOrFail();
2606 SmallVector<Value *, 4> Ops(MainAltOps);
2607 Ops.push_back(Elt: I1);
2608 Ops.push_back(Elt: I2);
2609 InstructionsState S = getSameOpcode(VL: Ops, TLI);
2610 // Note: Only consider instructions with <= 2 operands to avoid
2611 // complexity explosion.
2612 if (S &&
2613 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
2614 !S.isAltShuffle()) &&
2615 all_of(Range&: Ops, P: [&S](Value *V) {
2616 return isa<PoisonValue>(Val: V) ||
2617 cast<Instruction>(Val: V)->getNumOperands() ==
2618 S.getMainOp()->getNumOperands();
2619 }))
2620 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
2621 : LookAheadHeuristics::ScoreSameOpcode;
2622 }
2623
2624 if (I1 && isa<PoisonValue>(Val: V2))
2625 return LookAheadHeuristics::ScoreSameOpcode;
2626
2627 if (isa<UndefValue>(Val: V2))
2628 return LookAheadHeuristics::ScoreUndef;
2629
2630 return CheckSameEntryOrFail();
2631 }
2632
2633 /// Go through the operands of \p LHS and \p RHS recursively until
2634    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
2635 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
2636 /// of \p U1 and \p U2), except at the beginning of the recursion where
2637 /// these are set to nullptr.
2638 ///
2639 /// For example:
2640 /// \verbatim
2641 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
2642 /// \ / \ / \ / \ /
2643 /// + + + +
2644 /// G1 G2 G3 G4
2645 /// \endverbatim
2646 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
2647 /// each level recursively, accumulating the score. It starts from matching
2648 /// the additions at level 0, then moves on to the loads (level 1). The
2649 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
2650 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
2651 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
2652 /// Please note that the order of the operands does not matter, as we
2653 /// evaluate the score of all profitable combinations of operands. In
2654 /// other words the score of G1 and G4 is the same as G1 and G2. This
2655 /// heuristic is based on ideas described in:
2656 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
2657 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
2658 /// Luís F. W. Góes
2659 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
2660 Instruction *U2, int CurrLevel,
2661 ArrayRef<Value *> MainAltOps) const {
2662
2663 // Get the shallow score of V1 and V2.
2664 int ShallowScoreAtThisLevel =
2665 getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);
2666
2667 // If reached MaxLevel,
2668 // or if V1 and V2 are not instructions,
2669 // or if they are SPLAT,
2670 // or if they are not consecutive,
2671 // or if profitable to vectorize loads or extractelements, early return
2672 // the current cost.
2673 auto *I1 = dyn_cast<Instruction>(Val: LHS);
2674 auto *I2 = dyn_cast<Instruction>(Val: RHS);
2675 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
2676 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
2677 (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
2678 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
2679 (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
2680 ShallowScoreAtThisLevel))
2681 return ShallowScoreAtThisLevel;
2682 assert(I1 && I2 && "Should have early exited.");
2683
2684 // Contains the I2 operand indexes that got matched with I1 operands.
2685 SmallSet<unsigned, 4> Op2Used;
2686
2687 // Recursion towards the operands of I1 and I2. We are trying all possible
2688 // operand pairs, and keeping track of the best score.
2689 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
2690 OpIdx1 != NumOperands1; ++OpIdx1) {
2691 // Try to pair op1I with the best operand of I2.
2692 int MaxTmpScore = 0;
2693 unsigned MaxOpIdx2 = 0;
2694 bool FoundBest = false;
2695 // If I2 is commutative try all combinations.
2696 unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
2697 unsigned ToIdx = isCommutative(I: I2)
2698 ? I2->getNumOperands()
2699 : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
2700 assert(FromIdx <= ToIdx && "Bad index");
2701 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
2702 // Skip operands already paired with OpIdx1.
2703 if (Op2Used.count(V: OpIdx2))
2704 continue;
2705 // Recursively calculate the cost at each level
2706 int TmpScore =
2707 getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
2708 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: {});
2709 // Look for the best score.
2710 if (TmpScore > LookAheadHeuristics::ScoreFail &&
2711 TmpScore > MaxTmpScore) {
2712 MaxTmpScore = TmpScore;
2713 MaxOpIdx2 = OpIdx2;
2714 FoundBest = true;
2715 }
2716 }
2717 if (FoundBest) {
2718 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
2719 Op2Used.insert(V: MaxOpIdx2);
2720 ShallowScoreAtThisLevel += MaxTmpScore;
2721 }
2722 }
2723 return ShallowScoreAtThisLevel;
2724 }
2725 };
2726 /// A helper data structure to hold the operands of a vector of instructions.
2727 /// This supports a fixed vector length for all operand vectors.
2728 class VLOperands {
2729 /// For each operand we need (i) the value, and (ii) the opcode that it
2730 /// would be attached to if the expression was in a left-linearized form.
2731 /// This is required to avoid illegal operand reordering.
2732 /// For example:
2733 /// \verbatim
2734 /// 0 Op1
2735 /// |/
2736 /// Op1 Op2 Linearized + Op2
2737 /// \ / ----------> |/
2738 /// - -
2739 ///
2740 /// Op1 - Op2 (0 + Op1) - Op2
2741 /// \endverbatim
2742 ///
2743 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
2744 ///
2745 /// Another way to think of this is to track all the operations across the
2746 /// path from the operand all the way to the root of the tree and to
2747 /// calculate the operation that corresponds to this path. For example, the
2748 /// path from Op2 to the root crosses the RHS of the '-', therefore the
2749 /// corresponding operation is a '-' (which matches the one in the
2750 /// linearized tree, as shown above).
2751 ///
2752 /// For lack of a better term, we refer to this operation as Accumulated
2753 /// Path Operation (APO).
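    ///
    /// As a concrete (hypothetical) illustration: for the two lanes
    /// \verbatim
    ///   Lane 0: A0 - B0
    ///   Lane 1: A1 - B1
    /// \endverbatim
    /// operand 0 holds {A0, A1} with APO == false (the LHS of a subtraction is
    /// never attached to an inverse operation), while operand 1 holds {B0, B1}
    /// with APO == true (attached to the '-' in the left-linearized form).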
2754 struct OperandData {
2755 OperandData() = default;
2756 OperandData(Value *V, bool APO, bool IsUsed)
2757 : V(V), APO(APO), IsUsed(IsUsed) {}
2758 /// The operand value.
2759 Value *V = nullptr;
2760      /// TreeEntries only allow a single opcode, or an alternating sequence
2761      /// of them (e.g., +, -). Therefore, we can safely use a boolean value
2762      /// for the APO. It is set to 'true' if 'V' is attached to an inverse
2763      /// operation in the left-linearized form (e.g., Sub/Div), and 'false'
2764      /// otherwise (e.g., Add/Mul).
2765 bool APO = false;
2766 /// Helper data for the reordering function.
2767 bool IsUsed = false;
2768 };
2769
2770    /// During operand reordering, we are trying to select the operand at a
2771    /// lane that best matches the operand at the neighboring lane. Our
2772    /// selection is based on the kind of value we are looking for. For
2773    /// example, if the neighboring lane has a load, we need to look for a
2774    /// load that is accessing a consecutive address. These strategies are
2775    /// summarized in the 'ReorderingMode' enumerator.
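    /// For instance (a hypothetical lane): if the operand chosen in the first
    /// visited lane is a load of A[0], the mode for that operand index becomes
    /// Load and the other lanes are searched for loads of consecutive
    /// addresses; if it is a constant, the mode becomes Constant.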
2776 enum class ReorderingMode {
2777 Load, ///< Matching loads to consecutive memory addresses
2778 Opcode, ///< Matching instructions based on opcode (same or alternate)
2779 Constant, ///< Matching constants
2780 Splat, ///< Matching the same instruction multiple times (broadcast)
2781 Failed, ///< We failed to create a vectorizable group
2782 };
2783
2784 using OperandDataVec = SmallVector<OperandData, 2>;
2785
2786 /// A vector of operand vectors.
2787 SmallVector<OperandDataVec, 4> OpsVec;
2788 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2789 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2790 unsigned ArgSize = 0;
2791
2792 const TargetLibraryInfo &TLI;
2793 const DataLayout &DL;
2794 ScalarEvolution &SE;
2795 const BoUpSLP &R;
2796 const Loop *L = nullptr;
2797
2798 /// \returns the operand data at \p OpIdx and \p Lane.
2799 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2800 return OpsVec[OpIdx][Lane];
2801 }
2802
2803 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2804 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2805 return OpsVec[OpIdx][Lane];
2806 }
2807
2808 /// Clears the used flag for all entries.
2809 void clearUsed() {
2810 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2811 OpIdx != NumOperands; ++OpIdx)
2812 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2813 ++Lane)
2814 OpsVec[OpIdx][Lane].IsUsed = false;
2815 }
2816
2817 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2818 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2819 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
2820 }
2821
2822    /// \param Lane lane of the operands under analysis.
2823    /// \param OpIdx operand index in lane \p Lane for which we're looking
2824    /// for the best candidate.
2825    /// \param Idx operand index of the current candidate value.
2826    /// \returns The additional score due to possible broadcasting of the
2827    /// elements in the lane. It is more profitable to have a power-of-2
2828    /// number of unique elements in the lane; such a lane is more likely to
2829    /// be vectorized after removing duplicates. Currently the SLP vectorizer
2830    /// supports only vectorization of a power-of-2 number of unique scalars.
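    ///
    /// Worked (hypothetical) example with 4 lanes: if the other three lanes
    /// already hold 3 distinct instructions at \p OpIdx, keeping the current
    /// value (not seen in the other lanes) gives 4 unique values, a power of
    /// 2, while switching to a candidate that duplicates one of them leaves
    /// only 3. The function then returns 0 - 1 = -1, penalizing the duplicate
    /// candidate; in the mirrored situation it returns +1 as a bonus.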
2831 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2832 const SmallBitVector &UsedLanes) const {
2833 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
2834 if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2835 isa<ExtractElementInst>(Val: IdxLaneV))
2836 return 0;
2837 SmallDenseMap<Value *, unsigned, 4> Uniques;
2838 for (unsigned Ln : seq<unsigned>(Size: getNumLanes())) {
2839 if (Ln == Lane)
2840 continue;
2841 Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
2842 if (!isa<Instruction>(Val: OpIdxLnV))
2843 return 0;
2844 Uniques.try_emplace(Key: OpIdxLnV, Args&: Ln);
2845 }
2846 unsigned UniquesCount = Uniques.size();
2847 auto IdxIt = Uniques.find(Val: IdxLaneV);
2848 unsigned UniquesCntWithIdxLaneV =
2849 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2850 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2851 auto OpIdxIt = Uniques.find(Val: OpIdxLaneV);
2852 unsigned UniquesCntWithOpIdxLaneV =
2853 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2854 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2855 return 0;
2856 return std::min(a: bit_ceil(Value: UniquesCntWithOpIdxLaneV) -
2857 UniquesCntWithOpIdxLaneV,
2858 b: UniquesCntWithOpIdxLaneV -
2859 bit_floor(Value: UniquesCntWithOpIdxLaneV)) -
2860 ((IdxIt != Uniques.end() && UsedLanes.test(Idx: IdxIt->second))
2861 ? UniquesCntWithIdxLaneV - bit_floor(Value: UniquesCntWithIdxLaneV)
2862 : bit_ceil(Value: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2863 }
2864
2865    /// \param Lane lane of the operands under analysis.
2866    /// \param OpIdx operand index in lane \p Lane for which we're looking
2867    /// for the best candidate.
2868    /// \param Idx operand index of the current candidate value.
2869    /// \returns The additional score for the scalar whose users are all
2870    /// vectorized.
2871 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2872 Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
2873 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2874      // We do not care about the number of uses of vector-like instructions
2875      // (extractelement/extractvalue with constant indices); they are
2876      // extracts themselves and are already externally used. Vectorizing
2877      // such instructions does not add an extra extractelement instruction,
2878      // it may only remove one.
2879 if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
2880 isVectorLikeInstWithConstOps(V: OpIdxLaneV))
2881 return LookAheadHeuristics::ScoreAllUserVectorized;
2882 auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
2883 if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
2884 return 0;
2885 return R.areAllUsersVectorized(I: IdxLaneI)
2886 ? LookAheadHeuristics::ScoreAllUserVectorized
2887 : 0;
2888 }
2889
2890    /// Score scaling factor for fully compatible instructions but with a
2891    /// different number of external uses. Allows better selection of the
2892    /// instructions with fewer external uses.
2893 static const int ScoreScaleFactor = 10;
2894
2895    /// \Returns the look-ahead score, which tells us how much the sub-trees
2896    /// rooted at \p LHS and \p RHS match; the more they match, the higher
2897    /// the score. This helps break ties in an informed way when we cannot
2898    /// decide on the order of the operands by just considering the immediate
2899    /// predecessors.
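    ///
    /// Note on how the returned value is composed (a summary of the code
    /// below, not a separate scoring rule): the result is effectively
    /// (look-ahead score + splat score) * ScoreScaleFactor plus the
    /// external-use bonus; assuming that bonus is smaller than
    /// ScoreScaleFactor, it only breaks ties between otherwise equally scored
    /// operands.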
2900 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2901 int Lane, unsigned OpIdx, unsigned Idx,
2902 bool &IsUsed, const SmallBitVector &UsedLanes) {
2903 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2904 LookAheadMaxDepth);
2905 // Keep track of the instruction stack as we recurse into the operands
2906 // during the look-ahead score exploration.
2907 int Score =
2908 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2909 /*CurrLevel=*/1, MainAltOps);
2910 if (Score) {
2911 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2912 if (Score <= -SplatScore) {
2913 // Failed score.
2914 Score = 0;
2915 } else {
2916 Score += SplatScore;
2917        // Scale the score so that we can tell apart operands that are
2918        // merely compatible from operands that are compatible and also have
2919        // all of their uses vectorized. This does not affect the selection
2920        // of the best compatible operand in general; it just lets us prefer
2921        // the operand whose uses are all vectorized.
2922 Score *= ScoreScaleFactor;
2923 Score += getExternalUseScore(Lane, OpIdx, Idx);
2924 IsUsed = true;
2925 }
2926 }
2927 return Score;
2928 }
2929
2930 /// Best defined scores per lanes between the passes. Used to choose the
2931 /// best operand (with the highest score) between the passes.
2932 /// The key - {Operand Index, Lane}.
2933 /// The value - the best score between the passes for the lane and the
2934 /// operand.
2935 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2936 BestScoresPerLanes;
2937
2938    // Search all operands in Ops[*][Lane] for the one that best matches
2939    // Ops[OpIdx][LastLane] and return its operand index.
2940    // If no good match can be found, return std::nullopt.
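    // For example (hypothetical): in Load mode, if Ops[OpIdx][LastLane] is a
    // load of A[0], a candidate in Lane that loads A[1] gets the
    // consecutive-load score and is preferred over unrelated loads or
    // non-loads.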
2941 std::optional<unsigned>
2942 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2943 ArrayRef<ReorderingMode> ReorderingModes,
2944 ArrayRef<Value *> MainAltOps,
2945 const SmallBitVector &UsedLanes) {
2946 unsigned NumOperands = getNumOperands();
2947
2948 // The operand of the previous lane at OpIdx.
2949 Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;
2950
2951 // Our strategy mode for OpIdx.
2952 ReorderingMode RMode = ReorderingModes[OpIdx];
2953 if (RMode == ReorderingMode::Failed)
2954 return std::nullopt;
2955
2956 // The linearized opcode of the operand at OpIdx, Lane.
2957 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2958
2959 // The best operand index and its score.
2960 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2961 // are using the score to differentiate between the two.
2962 struct BestOpData {
2963 std::optional<unsigned> Idx;
2964 unsigned Score = 0;
2965 } BestOp;
2966 BestOp.Score =
2967 BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
2968 .first->second;
2969
2970      // Track if the operand must be marked as used. If the operand is set
2971      // to Score 1 explicitly (because of a non-power-of-2 number of unique
2972      // scalars), we may want to re-estimate the operands on later iterations.
2973 bool IsUsed = RMode == ReorderingMode::Splat ||
2974 RMode == ReorderingMode::Constant ||
2975 RMode == ReorderingMode::Load;
2976 // Iterate through all unused operands and look for the best.
2977 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2978 // Get the operand at Idx and Lane.
2979 OperandData &OpData = getData(OpIdx: Idx, Lane);
2980 Value *Op = OpData.V;
2981 bool OpAPO = OpData.APO;
2982
2983 // Skip already selected operands.
2984 if (OpData.IsUsed)
2985 continue;
2986
2987 // Skip if we are trying to move the operand to a position with a
2988 // different opcode in the linearized tree form. This would break the
2989 // semantics.
2990 if (OpAPO != OpIdxAPO)
2991 continue;
2992
2993 // Look for an operand that matches the current mode.
2994 switch (RMode) {
2995 case ReorderingMode::Load:
2996 case ReorderingMode::Opcode: {
2997 bool LeftToRight = Lane > LastLane;
2998 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2999 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
3000 int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
3001 OpIdx, Idx, IsUsed, UsedLanes);
3002 if (Score > static_cast<int>(BestOp.Score) ||
3003 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
3004 Idx == OpIdx)) {
3005 BestOp.Idx = Idx;
3006 BestOp.Score = Score;
3007 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
3008 }
3009 break;
3010 }
3011 case ReorderingMode::Constant:
3012 if (isa<Constant>(Val: Op) ||
3013 (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
3014 BestOp.Idx = Idx;
3015 if (isa<Constant>(Val: Op)) {
3016 BestOp.Score = LookAheadHeuristics::ScoreConstants;
3017 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
3018 LookAheadHeuristics::ScoreConstants;
3019 }
3020 if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
3021 IsUsed = false;
3022 }
3023 break;
3024 case ReorderingMode::Splat:
3025 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
3026 IsUsed = Op == OpLastLane;
3027 if (Op == OpLastLane) {
3028 BestOp.Score = LookAheadHeuristics::ScoreSplat;
3029 BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
3030 LookAheadHeuristics::ScoreSplat;
3031 }
3032 BestOp.Idx = Idx;
3033 }
3034 break;
3035 case ReorderingMode::Failed:
3036 llvm_unreachable("Not expected Failed reordering mode.");
3037 }
3038 }
3039
3040 if (BestOp.Idx) {
3041 getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
3042 return BestOp.Idx;
3043 }
3044 // If we could not find a good match return std::nullopt.
3045 return std::nullopt;
3046 }
3047
3048    /// Helper for reorder().
3049    /// \returns the lane that we should start reordering from. This is the
3050    /// lane with the fewest operands that can freely move about, or the least
3051    /// profitable lane, because it already has the most optimal set of operands.
3052 unsigned getBestLaneToStartReordering() const {
3053 unsigned Min = UINT_MAX;
3054 unsigned SameOpNumber = 0;
3055      // std::pair<unsigned, unsigned> is used to implement a simple voting
3056      // algorithm that chooses the lane with the fewest operands that can
3057      // freely move about, or the least profitable lane, because it already
3058      // has the most optimal set of operands. The first unsigned is the
3059      // voting counter, the second unsigned is the index of the lane with
3060      // instructions with same/alternate opcodes and same parent basic block.
3061 MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
3062 // Try to be closer to the original results, if we have multiple lanes
3063 // with same cost. If 2 lanes have the same cost, use the one with the
3064 // highest index.
3065 for (int I = getNumLanes(); I > 0; --I) {
3066 unsigned Lane = I - 1;
3067 OperandsOrderData NumFreeOpsHash =
3068 getMaxNumOperandsThatCanBeReordered(Lane);
3069 // Compare the number of operands that can move and choose the one with
3070 // the least number.
3071 if (NumFreeOpsHash.NumOfAPOs < Min) {
3072 Min = NumFreeOpsHash.NumOfAPOs;
3073 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3074 HashMap.clear();
3075 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
3076 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3077 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
3078 // Select the most optimal lane in terms of number of operands that
3079 // should be moved around.
3080 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
3081 HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
3082 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
3083 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
3084 auto [It, Inserted] =
3085 HashMap.try_emplace(Key: NumFreeOpsHash.Hash, Args: 1, Args&: Lane);
3086 if (!Inserted)
3087 ++It->second.first;
3088 }
3089 }
3090 // Select the lane with the minimum counter.
3091 unsigned BestLane = 0;
3092 unsigned CntMin = UINT_MAX;
3093 for (const auto &Data : reverse(C&: HashMap)) {
3094 if (Data.second.first < CntMin) {
3095 CntMin = Data.second.first;
3096 BestLane = Data.second.second;
3097 }
3098 }
3099 return BestLane;
3100 }
3101
3102 /// Data structure that helps to reorder operands.
3103 struct OperandsOrderData {
3104 /// The best number of operands with the same APOs, which can be
3105 /// reordered.
3106 unsigned NumOfAPOs = UINT_MAX;
3107 /// Number of operands with the same/alternate instruction opcode and
3108 /// parent.
3109 unsigned NumOpsWithSameOpcodeParent = 0;
3110 /// Hash for the actual operands ordering.
3111      /// Used to count operands, actually their position id and opcode
3112      /// value. It is used in the voting mechanism to find the lane with the
3113      /// fewest operands that can freely move about, or the least profitable
3114      /// lane, because it already has the most optimal set of operands. It
3115      /// could be replaced with a SmallVector<unsigned>, but a hash code is
3116      /// faster and requires less memory.
3117 unsigned Hash = 0;
3118 };
3119    /// \returns the maximum number of operands that are allowed to be
3120    /// reordered for \p Lane and the number of compatible instructions (with
3121    /// the same parent/opcode). This is used as a heuristic for selecting the
3122    /// first lane to start operand reordering.
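    ///
    /// For example (hypothetical two-operand lanes): a lane computing
    /// C[1] - B[1] has one operand with APO == false and one with APO == true,
    /// so NumOfAPOs == 1, while a lane computing B[0] + C[0] has both APOs
    /// equal to false and NumOfAPOs == 2. The subtraction lane is therefore
    /// the more constrained one and is preferred as the starting lane.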
3123 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3124 unsigned CntTrue = 0;
3125 unsigned NumOperands = getNumOperands();
3126 // Operands with the same APO can be reordered. We therefore need to count
3127 // how many of them we have for each APO, like this: Cnt[APO] = x.
3128 // Since we only have two APOs, namely true and false, we can avoid using
3129 // a map. Instead we can simply count the number of operands that
3130 // correspond to one of them (in this case the 'true' APO), and calculate
3131 // the other by subtracting it from the total number of operands.
3132 // Operands with the same instruction opcode and parent are more
3133 // profitable since we don't need to move them in many cases, with a high
3134 // probability such lane already can be vectorized effectively.
3135 bool AllUndefs = true;
3136 unsigned NumOpsWithSameOpcodeParent = 0;
3137 Instruction *OpcodeI = nullptr;
3138 BasicBlock *Parent = nullptr;
3139 unsigned Hash = 0;
3140 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3141 const OperandData &OpData = getData(OpIdx, Lane);
3142 if (OpData.APO)
3143 ++CntTrue;
3144 // Use Boyer-Moore majority voting for finding the majority opcode and
3145 // the number of times it occurs.
3146 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
3147 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI) ||
3148 I->getParent() != Parent) {
3149 if (NumOpsWithSameOpcodeParent == 0) {
3150 NumOpsWithSameOpcodeParent = 1;
3151 OpcodeI = I;
3152 Parent = I->getParent();
3153 } else {
3154 --NumOpsWithSameOpcodeParent;
3155 }
3156 } else {
3157 ++NumOpsWithSameOpcodeParent;
3158 }
3159 }
3160 Hash = hash_combine(
3161 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
3162 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
3163 }
3164 if (AllUndefs)
3165 return {};
3166 OperandsOrderData Data;
3167 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
3168 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3169 Data.Hash = Hash;
3170 return Data;
3171 }
3172
3173 /// Go through the instructions in VL and append their operands.
3174 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3175 const InstructionsState &S) {
3176 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3177 assert((empty() || all_of(Operands,
3178 [this](const ValueList &VL) {
3179 return VL.size() == getNumLanes();
3180 })) &&
3181 "Expected same number of lanes");
3182 assert(S.valid() && "InstructionsState is invalid.");
3183 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3184 // arguments to the intrinsic produces the same result.
3185 Instruction *MainOp = S.getMainOp();
3186 unsigned NumOperands = MainOp->getNumOperands();
3187 ArgSize = ::getNumberOfPotentiallyCommutativeOps(I: MainOp);
3188 OpsVec.resize(N: ArgSize);
3189 unsigned NumLanes = VL.size();
3190 for (OperandDataVec &Ops : OpsVec)
3191 Ops.resize(N: NumLanes);
3192 for (unsigned Lane : seq<unsigned>(Size: NumLanes)) {
3193        // Our tree has just 3 nodes: the root and two operands.
3194        // It is therefore trivial to get the APO. We only need to check the
3195        // opcode of V and whether the operand at OpIdx is the LHS or RHS
3196        // operand. The LHS operand of both add and sub is never attached to
3197        // an inverse operation in the linearized form, therefore its APO is
3198        // false. The RHS operand's APO is true only if V is an inverse operation.
3199
3200 // Since operand reordering is performed on groups of commutative
3201 // operations or alternating sequences (e.g., +, -), we can safely tell
3202 // the inverse operations by checking commutativity.
3203 auto *I = dyn_cast<Instruction>(Val: VL[Lane]);
3204 if (!I && isa<PoisonValue>(Val: VL[Lane])) {
3205 for (unsigned OpIdx : seq<unsigned>(Size: NumOperands))
3206 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3207 continue;
3208 }
3209 bool IsInverseOperation = false;
3210 if (S.isCopyableElement(V: VL[Lane])) {
3211 // The value is a copyable element.
3212 IsInverseOperation =
3213 !isCommutative(I: MainOp, ValWithUses: VL[Lane], /*IsCopyable=*/true);
3214 } else {
3215 assert(I && "Expected instruction");
3216 auto [SelectedOp, Ops] = convertTo(I, S);
3217 // We cannot check commutativity by the converted instruction
3218 // (SelectedOp) because isCommutative also examines def-use
3219 // relationships.
3220 IsInverseOperation = !isCommutative(I: SelectedOp, ValWithUses: I);
3221 }
3222 for (unsigned OpIdx : seq<unsigned>(Size: ArgSize)) {
3223 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3224 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3225 }
3226 }
3227 }
3228
3229 /// \returns the number of operands.
3230 unsigned getNumOperands() const { return ArgSize; }
3231
3232 /// \returns the number of lanes.
3233 unsigned getNumLanes() const { return OpsVec[0].size(); }
3234
3235 /// \returns the operand value at \p OpIdx and \p Lane.
3236 Value *getValue(unsigned OpIdx, unsigned Lane) const {
3237 return getData(OpIdx, Lane).V;
3238 }
3239
3240 /// \returns true if the data structure is empty.
3241 bool empty() const { return OpsVec.empty(); }
3242
3243 /// Clears the data.
3244 void clear() { OpsVec.clear(); }
3245
3246    /// \Returns true if there are enough operands identical to \p Op to fill
3247    /// the whole vector (possibly mixed with constants or loop-invariant
3248    /// values). Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
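    ///
    /// For example (hypothetical): for the two lanes { x + 1, 2 + x } the
    /// value x appears in both lanes (in different operand positions) and the
    /// remaining slots are constants, so broadcasting x is considered
    /// worthwhile and this returns true.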
3249 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
3250 assert(Op == getValue(OpIdx, Lane) &&
3251 "Op is expected to be getValue(OpIdx, Lane).");
3252 // Small number of loads - try load matching.
3253 if (isa<LoadInst>(Val: Op) && getNumLanes() == 2 && getNumOperands() == 2)
3254 return false;
3255 bool OpAPO = getData(OpIdx, Lane).APO;
3256 bool IsInvariant = L && L->isLoopInvariant(V: Op);
3257 unsigned Cnt = 0;
3258 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3259 if (Ln == Lane)
3260 continue;
3261 // This is set to true if we found a candidate for broadcast at Lane.
3262 bool FoundCandidate = false;
3263 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
3264 OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
3265 if (Data.APO != OpAPO || Data.IsUsed)
3266 continue;
3267 Value *OpILane = getValue(OpIdx: OpI, Lane);
3268 bool IsConstantOp = isa<Constant>(Val: OpILane);
3269 // Consider the broadcast candidate if:
3270 // 1. Same value is found in one of the operands.
3271 if (Data.V == Op ||
3272 // 2. The operand in the given lane is not constant but there is a
3273 // constant operand in another lane (which can be moved to the
3274 // given lane). In this case we can represent it as a simple
3275 // permutation of constant and broadcast.
3276 (!IsConstantOp &&
3277 ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
3278                 // 2.1. If we have only 2 lanes, we need to check that the
3279                 // value in the other lane does not form the same opcode sequence.
3280 (Lns == 2 &&
3281 !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI) &&
3282 isa<Constant>(Val: Data.V)))) ||
3283 // 3. The operand in the current lane is loop invariant (can be
3284 // hoisted out) and another operand is also a loop invariant
3285 // (though not a constant). In this case the whole vector can be
3286 // hoisted out.
3287 // FIXME: need to teach the cost model about this case for better
3288 // estimation.
3289 (IsInvariant && !isa<Constant>(Val: Data.V) &&
3290 !getSameOpcode(VL: {Op, Data.V}, TLI) &&
3291 L->isLoopInvariant(V: Data.V))) {
3292 FoundCandidate = true;
3293 Data.IsUsed = Data.V == Op;
3294 if (Data.V == Op)
3295 ++Cnt;
3296 break;
3297 }
3298 }
3299 if (!FoundCandidate)
3300 return false;
3301 }
3302 return getNumLanes() == 2 || Cnt > 1;
3303 }
3304
3305    /// Checks if there is at least one operand in a lane other than \p Lane
3306    /// that is compatible with the operand \p Op.
3307 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
3308 assert(Op == getValue(OpIdx, Lane) &&
3309 "Op is expected to be getValue(OpIdx, Lane).");
3310 bool OpAPO = getData(OpIdx, Lane).APO;
3311 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
3312 if (Ln == Lane)
3313 continue;
3314 if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
3315 const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
3316 if (Data.APO != OpAPO || Data.IsUsed)
3317 return true;
3318 Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
3319 return (L && L->isLoopInvariant(V: OpILn)) ||
3320 (getSameOpcode(VL: {Op, OpILn}, TLI) &&
3321 allSameBlock(VL: {Op, OpILn}));
3322 }))
3323 return true;
3324 }
3325 return false;
3326 }
3327
3328 public:
3329 /// Initialize with all the operands of the instruction vector \p RootVL.
3330 VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
3331 const InstructionsState &S, const BoUpSLP &R)
3332 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
3333 L(R.LI->getLoopFor(BB: S.getMainOp()->getParent())) {
3334 // Append all the operands of RootVL.
3335 appendOperands(VL: RootVL, Operands, S);
3336 }
3337
3338    /// \Returns a value vector with the operands across all lanes for the
3339    /// operand at \p OpIdx.
3340 ValueList getVL(unsigned OpIdx) const {
3341 ValueList OpVL(OpsVec[OpIdx].size());
3342 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3343 "Expected same num of lanes across all operands");
3344 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3345 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3346 return OpVL;
3347 }
3348
3349    // Performs operand reordering for 2 or more operands.
3350    // The operands are reordered in place in OpsVec[OpIdx][Lane], starting
3351    // from the best lane and proceeding outwards.
3352 void reorder() {
3353 unsigned NumOperands = getNumOperands();
3354 unsigned NumLanes = getNumLanes();
3355 // Each operand has its own mode. We are using this mode to help us select
3356 // the instructions for each lane, so that they match best with the ones
3357 // we have selected so far.
3358 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
3359
3360 // This is a greedy single-pass algorithm. We are going over each lane
3361 // once and deciding on the best order right away with no back-tracking.
3362 // However, in order to increase its effectiveness, we start with the lane
3363 // that has operands that can move the least. For example, given the
3364 // following lanes:
3365 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
3366 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
3367 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
3368 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
3369 // we will start at Lane 1, since the operands of the subtraction cannot
3370 // be reordered. Then we will visit the rest of the lanes in a circular
3371 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
3372
3373 // Find the first lane that we will start our search from.
3374 unsigned FirstLane = getBestLaneToStartReordering();
3375
3376 // Initialize the modes.
3377 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3378 Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
3379 // Keep track if we have instructions with all the same opcode on one
3380 // side.
3381 if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
3382 // Check if OpLane0 should be broadcast.
3383 if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
3384 !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
3385 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3386 else if (isa<LoadInst>(Val: OpILane0))
3387 ReorderingModes[OpIdx] = ReorderingMode::Load;
3388 else
3389 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
3390 } else if (isa<Constant>(Val: OpLane0)) {
3391 ReorderingModes[OpIdx] = ReorderingMode::Constant;
3392 } else if (isa<Argument>(Val: OpLane0)) {
3393 // Our best hope is a Splat. It may save some cost in some cases.
3394 ReorderingModes[OpIdx] = ReorderingMode::Splat;
3395 } else {
3396 llvm_unreachable("Unexpected value kind.");
3397 }
3398 }
3399
3400      // Check that we don't have the same operands. There is no need to
3401      // reorder if the operands are just a perfect or a shuffled diamond
3402      // match. Do not skip reordering for possible broadcasts or a
3403      // non-power-of-2 number of scalars (just for now).
3404 auto &&SkipReordering = [this]() {
3405 SmallPtrSet<Value *, 4> UniqueValues;
3406 ArrayRef<OperandData> Op0 = OpsVec.front();
3407 for (const OperandData &Data : Op0)
3408 UniqueValues.insert(Ptr: Data.V);
3409 for (ArrayRef<OperandData> Op :
3410 ArrayRef(OpsVec).slice(N: 1, M: getNumOperands() - 1)) {
3411 if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
3412 return !UniqueValues.contains(Ptr: Data.V);
3413 }))
3414 return false;
3415 }
3416 // TODO: Check if we can remove a check for non-power-2 number of
3417 // scalars after full support of non-power-2 vectorization.
3418 return UniqueValues.size() != 2 &&
3419 hasFullVectorsOrPowerOf2(TTI: *R.TTI, Ty: Op0.front().V->getType(),
3420 Sz: UniqueValues.size());
3421 };
3422
3423 // If the initial strategy fails for any of the operand indexes, then we
3424 // perform reordering again in a second pass. This helps avoid assigning
3425 // high priority to the failed strategy, and should improve reordering for
3426 // the non-failed operand indexes.
3427 for (int Pass = 0; Pass != 2; ++Pass) {
3428        // Check if there is no need to reorder the operands, since they are
3429        // already a perfect or a shuffled diamond match.
3430 // Need to do it to avoid extra external use cost counting for
3431 // shuffled matches, which may cause regressions.
3432 if (SkipReordering())
3433 break;
3434 // Skip the second pass if the first pass did not fail.
3435 bool StrategyFailed = false;
3436 // Mark all operand data as free to use.
3437 clearUsed();
3438 // We keep the original operand order for the FirstLane, so reorder the
3439 // rest of the lanes. We are visiting the nodes in a circular fashion,
3440 // using FirstLane as the center point and increasing the radius
3441 // distance.
3442 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
3443 for (unsigned I = 0; I < NumOperands; ++I)
3444 MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);
3445
3446 SmallBitVector UsedLanes(NumLanes);
3447 UsedLanes.set(FirstLane);
3448 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
3449 // Visit the lane on the right and then the lane on the left.
3450 for (int Direction : {+1, -1}) {
3451 int Lane = FirstLane + Direction * Distance;
3452 if (Lane < 0 || Lane >= (int)NumLanes)
3453 continue;
3454 UsedLanes.set(Lane);
3455 int LastLane = Lane - Direction;
3456 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
3457 "Out of bounds");
3458 // Look for a good match for each operand.
3459 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3460            // Search for the operand that best matches the operand at OpIdx in the previous lane (LastLane).
3461 std::optional<unsigned> BestIdx =
3462 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
3463 MainAltOps: MainAltOps[OpIdx], UsedLanes);
3464 // By not selecting a value, we allow the operands that follow to
3465 // select a better matching value. We will get a non-null value in
3466 // the next run of getBestOperand().
3467 if (BestIdx) {
3468 // Swap the current operand with the one returned by
3469 // getBestOperand().
3470 swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
3471 } else {
3472 // Enable the second pass.
3473 StrategyFailed = true;
3474 }
3475 // Try to get the alternate opcode and follow it during analysis.
3476 if (MainAltOps[OpIdx].size() != 2) {
3477 OperandData &AltOp = getData(OpIdx, Lane);
3478 InstructionsState OpS =
3479 getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
3480 if (OpS && OpS.isAltShuffle())
3481 MainAltOps[OpIdx].push_back(Elt: AltOp.V);
3482 }
3483 }
3484 }
3485 }
3486 // Skip second pass if the strategy did not fail.
3487 if (!StrategyFailed)
3488 break;
3489 }
3490 }
3491
3492#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3493 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3494 switch (RMode) {
3495 case ReorderingMode::Load:
3496 return "Load";
3497 case ReorderingMode::Opcode:
3498 return "Opcode";
3499 case ReorderingMode::Constant:
3500 return "Constant";
3501 case ReorderingMode::Splat:
3502 return "Splat";
3503 case ReorderingMode::Failed:
3504 return "Failed";
3505 }
3506 llvm_unreachable("Unimplemented Reordering Type");
3507 }
3508
3509 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
3510 raw_ostream &OS) {
3511 return OS << getModeStr(RMode);
3512 }
3513
3514 /// Debug print.
3515 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
3516 printMode(RMode, dbgs());
3517 }
3518
3519 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
3520 return printMode(RMode, OS);
3521 }
3522
3523 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3524 const unsigned Indent = 2;
3525 unsigned Cnt = 0;
3526 for (const OperandDataVec &OpDataVec : OpsVec) {
3527 OS << "Operand " << Cnt++ << "\n";
3528 for (const OperandData &OpData : OpDataVec) {
3529 OS.indent(Indent) << "{";
3530 if (Value *V = OpData.V)
3531 OS << *V;
3532 else
3533 OS << "null";
3534 OS << ", APO:" << OpData.APO << "}\n";
3535 }
3536 OS << "\n";
3537 }
3538 return OS;
3539 }
3540
3541 /// Debug print.
3542 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3543#endif
3544 };
3545
3546  /// Evaluate each pair in \p Candidates and return the index into
3547  /// \p Candidates of the pair with the highest score, i.e. the one deemed
3548  /// to have the best chance to form the root of a profitable tree to
3549  /// vectorize. Return std::nullopt if no candidate scored above \p Limit.
3550  /// \param Limit Lower limit of the score, considered to be a good enough score.
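  /// For example (hypothetical): given Candidates = {{A[0], B[0]}, {A[0],
  /// A[1]}} where A[0] and A[1] are consecutive loads, the second pair scores
  /// higher (consecutive loads), so index 1 is returned.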
3551 std::optional<int>
3552 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3553 int Limit = LookAheadHeuristics::ScoreFail) const {
3554 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3555 RootLookAheadMaxDepth);
3556 int BestScore = Limit;
3557 std::optional<int> Index;
3558 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
3559 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
3560 RHS: Candidates[I].second,
3561 /*U1=*/nullptr, /*U2=*/nullptr,
3562 /*CurrLevel=*/1, MainAltOps: {});
3563 if (Score > BestScore) {
3564 BestScore = Score;
3565 Index = I;
3566 }
3567 }
3568 return Index;
3569 }
3570
3571 /// Checks if the instruction is marked for deletion.
3572 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }
3573
3574 /// Removes an instruction from its block and eventually deletes it.
3575 /// It's like Instruction::eraseFromParent() except that the actual deletion
3576 /// is delayed until BoUpSLP is destructed.
3577 void eraseInstruction(Instruction *I) {
3578 DeletedInstructions.insert(V: I);
3579 }
3580
3581  /// Removes the \p DeadVals instructions from the parent function and clears
3582  /// their operands, marking trivially dead operands for deletion.
3583 template <typename T>
3584 void removeInstructionsAndOperands(
3585 ArrayRef<T *> DeadVals,
3586 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
3587 SmallVector<WeakTrackingVH> DeadInsts;
3588 for (T *V : DeadVals) {
3589 auto *I = cast<Instruction>(V);
3590 eraseInstruction(I);
3591 }
3592 DenseSet<Value *> Processed;
3593 for (T *V : DeadVals) {
3594 if (!V || !Processed.insert(V).second)
3595 continue;
3596 auto *I = cast<Instruction>(V);
3597 salvageDebugInfo(*I);
3598 ArrayRef<TreeEntry *> Entries = getTreeEntries(V: I);
3599 for (Use &U : I->operands()) {
3600 if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
3601 OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
3602 wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
3603 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
3604 return Entry->VectorizedValue == OpI;
3605 })))
3606 DeadInsts.push_back(Elt: OpI);
3607 }
3608 I->dropAllReferences();
3609 }
3610 for (T *V : DeadVals) {
3611 auto *I = cast<Instruction>(V);
3612 if (!I->getParent())
3613 continue;
3614 assert((I->use_empty() || all_of(I->uses(),
3615 [&](Use &U) {
3616 return isDeleted(
3617 cast<Instruction>(U.getUser()));
3618 })) &&
3619 "trying to erase instruction with users.");
3620 I->removeFromParent();
3621 SE->forgetValue(V: I);
3622 }
3623 // Process the dead instruction list until empty.
3624 while (!DeadInsts.empty()) {
3625 Value *V = DeadInsts.pop_back_val();
3626 Instruction *VI = cast_or_null<Instruction>(Val: V);
3627 if (!VI || !VI->getParent())
3628 continue;
3629 assert(isInstructionTriviallyDead(VI, TLI) &&
3630 "Live instruction found in dead worklist!");
3631 assert(VI->use_empty() && "Instructions with uses are not dead.");
3632
3633 // Don't lose the debug info while deleting the instructions.
3634 salvageDebugInfo(I&: *VI);
3635
3636 // Null out all of the instruction's operands to see if any operand
3637 // becomes dead as we go.
3638 for (Use &OpU : VI->operands()) {
3639 Value *OpV = OpU.get();
3640 if (!OpV)
3641 continue;
3642 OpU.set(nullptr);
3643
3644 if (!OpV->use_empty())
3645 continue;
3646
3647 // If the operand is an instruction that became dead as we nulled out
3648 // the operand, and if it is 'trivially' dead, delete it in a future
3649 // loop iteration.
3650 if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
3651 if (!DeletedInstructions.contains(V: OpI) &&
3652 (!OpI->getType()->isVectorTy() ||
3653 none_of(VectorValuesAndScales,
3654 [&](const std::tuple<Value *, unsigned, bool> &V) {
3655 return std::get<0>(t: V) == OpI;
3656 })) &&
3657 isInstructionTriviallyDead(I: OpI, TLI))
3658 DeadInsts.push_back(Elt: OpI);
3659 }
3660
3661 VI->removeFromParent();
3662 eraseInstruction(I: VI);
3663 SE->forgetValue(V: VI);
3664 }
3665 }
3666
3667  /// Checks if the instruction was already analyzed for being a possible
3668  /// reduction root.
3669 bool isAnalyzedReductionRoot(Instruction *I) const {
3670 return AnalyzedReductionsRoots.count(Ptr: I);
3671 }
3672  /// Registers the given instruction as already analyzed for being a
3673  /// possible reduction root.
3674 void analyzedReductionRoot(Instruction *I) {
3675 AnalyzedReductionsRoots.insert(Ptr: I);
3676 }
3677 /// Checks if the provided list of reduced values was checked already for
3678 /// vectorization.
3679 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
3680 return AnalyzedReductionVals.contains(V: hash_value(S: VL));
3681 }
3682  /// Adds the list of reduced values to the list of values already checked
3683  /// for vectorization.
3684 void analyzedReductionVals(ArrayRef<Value *> VL) {
3685 AnalyzedReductionVals.insert(V: hash_value(S: VL));
3686 }
3687 /// Clear the list of the analyzed reduction root instructions.
3688 void clearReductionData() {
3689 AnalyzedReductionsRoots.clear();
3690 AnalyzedReductionVals.clear();
3691 AnalyzedMinBWVals.clear();
3692 }
3693 /// Checks if the given value is gathered in one of the nodes.
3694 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
3695 return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
3696 }
3697 /// Checks if the given value is gathered in one of the nodes.
3698 bool isGathered(const Value *V) const {
3699 return MustGather.contains(Ptr: V);
3700 }
3701  /// Checks if the specified value was not scheduled.
3702 bool isNotScheduled(const Value *V) const {
3703 return NonScheduledFirst.contains(Ptr: V);
3704 }
3705
3706 /// Check if the value is vectorized in the tree.
3707 bool isVectorized(const Value *V) const {
3708 assert(V && "V cannot be nullptr.");
3709 return ScalarToTreeEntries.contains(Val: V);
3710 }
3711
3712 ~BoUpSLP();
3713
3714private:
3715  /// Determine if a node \p E can be demoted to a smaller type with a
3716  /// truncation. We collect the entries that will be demoted in ToDemote.
3717 /// \param E Node for analysis
3718 /// \param ToDemote indices of the nodes to be demoted.
3719 bool collectValuesToDemote(
3720 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3721 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3722 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3723 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3724
3725  /// Builds the list of reorderable operands on the edges \p Edges of the \p
3726  /// UserTE, which allow reordering (i.e. the operands can be reordered
3727  /// because they have only one user and are reorderable).
3728  /// \param ReorderableGathers List of all gather nodes that require reordering
3729  /// (e.g., gathers of extractelements or partially vectorizable loads).
3730 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3731 /// reordering, subset of \p NonVectorized.
3732 void buildReorderableOperands(
3733 TreeEntry *UserTE,
3734 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3735 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3736 SmallVectorImpl<TreeEntry *> &GatherOps);
3737
3738 /// Checks if the given \p TE is a gather node with clustered reused scalars
3739 /// and reorders it per given \p Mask.
3740 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3741
3742 /// Checks if all users of \p I are the part of the vectorization tree.
3743 bool areAllUsersVectorized(
3744 Instruction *I,
3745 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3746
3747 /// Return information about the vector formed for the specified index
3748 /// of a vector of (the same) instruction.
3749 TargetTransformInfo::OperandValueInfo
3750 getOperandInfo(ArrayRef<Value *> Ops) const;
3751
3752 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3753 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3754 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3755 return const_cast<TreeEntry *>(
3756 getOperandEntry(E: const_cast<const TreeEntry *>(E), Idx));
3757 }
3758
3759 /// Gets the root instruction for the given node. If the node is a strided
3760 /// load/store node with the reverse order, the root instruction is the last
3761 /// one.
3762 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3763
3764 /// \returns Cast context for the given graph node.
3765 TargetTransformInfo::CastContextHint
3766 getCastContextHint(const TreeEntry &TE) const;
3767
3768 /// \returns the cost of the vectorizable entry.
3769 InstructionCost getEntryCost(const TreeEntry *E,
3770 ArrayRef<Value *> VectorizedVals,
3771 SmallPtrSetImpl<Value *> &CheckedExtracts);
3772
3773 /// Checks if it is legal and profitable to build SplitVectorize node for the
3774 /// given \p VL.
3775 /// \param Op1 first homogeneous scalars.
3776 /// \param Op2 second homogeneous scalars.
3777 /// \param ReorderIndices indices to reorder the scalars.
3778 /// \returns true if the node was successfully built.
3779 bool canBuildSplitNode(ArrayRef<Value *> VL,
3780 const InstructionsState &LocalState,
3781 SmallVectorImpl<Value *> &Op1,
3782 SmallVectorImpl<Value *> &Op2,
3783 OrdersType &ReorderIndices) const;
3784
3785 /// This is the recursive part of buildTree.
3786 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3787 unsigned InterleaveFactor = 0);
3788
3789 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3790 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3791 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3792 /// returns false, setting \p CurrentOrder to either an empty vector or a
3793  /// non-identity permutation that allows reusing the extract instructions.
3794 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3795 /// extract order.
3796 bool canReuseExtract(ArrayRef<Value *> VL,
3797 SmallVectorImpl<unsigned> &CurrentOrder,
3798 bool ResizeAllowed = false) const;
3799
3800 /// Vectorize a single entry in the tree.
3801 Value *vectorizeTree(TreeEntry *E);
3802
3803 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3804 /// \p E.
3805 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3806
3807 /// Create a new vector from a list of scalar values. Produces a sequence
3808 /// which exploits values reused across lanes, and arranges the inserts
3809 /// for ease of later optimization.
3810 template <typename BVTy, typename ResTy, typename... Args>
3811 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3812
3813 /// Create a new vector from a list of scalar values. Produces a sequence
3814 /// which exploits values reused across lanes, and arranges the inserts
3815 /// for ease of later optimization.
3816 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3817
3818 /// Returns the instruction in the bundle, which can be used as a base point
3819 /// for scheduling. Usually it is the last instruction in the bundle, except
3820 /// for the case when all operands are external (in this case, it is the first
3821 /// instruction in the list).
3822 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3823
3824  /// Tries to find extractelement instructions with constant indices from a
3825  /// fixed vector type and gathers such instructions into a bunch, which is
3826  /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If
3827  /// this attempt was successful, the matched scalars are replaced by poison
3828  /// values in \p VL for future analysis.
3829 std::optional<TargetTransformInfo::ShuffleKind>
3830 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3831 SmallVectorImpl<int> &Mask) const;
3832
3833  /// Tries to find extractelement instructions with constant indices from a
3834  /// fixed vector type and gathers such instructions into a bunch, which is
3835  /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If
3836  /// this attempt was successful, the matched scalars are replaced by poison
3837  /// values in \p VL for future analysis.
3838 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3839 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3840 SmallVectorImpl<int> &Mask,
3841 unsigned NumParts) const;
3842
3843 /// Checks if the gathered \p VL can be represented as a single register
3844 /// shuffle(s) of previous tree entries.
3845 /// \param TE Tree entry checked for permutation.
3846  /// \param VL List of scalars (a subset of the TE scalars), checked for
3847  /// permutations. Must form a single-register vector.
3848  /// \param ForOrder Tries to fetch the best candidates for ordering info.
3849  /// Also directs building the mask from the original vector values, without
3850  /// relying on the potential reordering.
3851 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3852 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3853 std::optional<TargetTransformInfo::ShuffleKind>
3854 isGatherShuffledSingleRegisterEntry(
3855 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3856 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3857 bool ForOrder);
3858
3859 /// Checks if the gathered \p VL can be represented as multi-register
3860 /// shuffle(s) of previous tree entries.
3861 /// \param TE Tree entry checked for permutation.
3862  /// \param VL List of scalars (a subset of the TE scalars), checked for
3863  /// permutations.
3864  /// \param ForOrder Tries to fetch the best candidates for ordering info.
3865  /// Also directs building the mask from the original vector values, without
3866  /// relying on the potential reordering.
3867 /// \returns per-register series of ShuffleKind, if gathered values can be
3868 /// represented as shuffles of previous tree entries. \p Mask is filled with
3869 /// the shuffle mask (also on per-register base).
3870 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3871 isGatherShuffledEntry(
3872 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3873 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3874 unsigned NumParts, bool ForOrder = false);
3875
3876 /// \returns the cost of gathering (inserting) the values in \p VL into a
3877 /// vector.
3878 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3879 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3880 Type *ScalarTy) const;
3881
3882 /// Set the Builder insert point to one after the last instruction in
3883 /// the bundle
3884 void setInsertPointAfterBundle(const TreeEntry *E);
3885
3886  /// \returns a vector from a collection of scalars in \p VL. If \p Root is
3887  /// not specified, the starting vector value is poison.
3888 Value *
3889 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3890 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3891
3892  /// \returns whether the VectorizableTree is fully vectorizable and will
3893  /// be beneficial even if the tree height is tiny.
3894 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3895
3896  /// Run through the list of all gathered loads in the graph and try to find
3897  /// vector loads/masked gathers instead of regular gathers. Later these loads
3898  /// are reshuffled to build the final gathered nodes.
3899 void tryToVectorizeGatheredLoads(
3900 const SmallMapVector<
3901 std::tuple<BasicBlock *, Value *, Type *>,
3902 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3903 &GatheredLoads);
3904
3905  /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3906  /// users of \p TE and collects the stores. It returns the collected stores,
3907  /// grouped by their pointer operands.
3908 SmallVector<SmallVector<StoreInst *>>
3909 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3910
3911 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3912 /// stores in \p StoresVec can form a vector instruction. If so it returns
3913 /// true and populates \p ReorderIndices with the shuffle indices of the
3914 /// stores when compared to the sorted vector.
3915 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3916 OrdersType &ReorderIndices) const;
3917
3918 /// Iterates through the users of \p TE, looking for scalar stores that can be
3919 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3920 /// their order and builds an order index vector for each store bundle. It
3921 /// returns all these order vectors found.
3922 /// We run this after the tree has formed, otherwise we may come across user
3923 /// instructions that are not yet in the tree.
3924 SmallVector<OrdersType, 1>
3925 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3926
3927 /// Tries to reorder the gathering node for better vectorization
3928 /// opportunities.
3929 void reorderGatherNode(TreeEntry &TE);
3930
3931  /// Checks if the tree represents a disjoint-or reduction of a
3932  /// shl(zext, (0, 8, ..., 56))-like pattern.
3933  /// If the shift amounts are unique and strided, but not ordered, sets \p Order.
3934 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
3935 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
3936 bool &IsBSwap) const;
3937
3938 class TreeEntry {
3939 public:
3940 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3941 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3942
3943 /// \returns Common mask for reorder indices and reused scalars.
3944 SmallVector<int> getCommonMask() const {
3945 if (State == TreeEntry::SplitVectorize)
3946 return {};
3947 SmallVector<int> Mask;
3948 inversePermutation(Indices: ReorderIndices, Mask);
3949 ::addMask(Mask, SubMask: ReuseShuffleIndices);
3950 return Mask;
3951 }
3952
3953 /// \returns The mask for split nodes.
3954 SmallVector<int> getSplitMask() const {
3955 assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
3956 "Expected only split vectorize node.");
3957 SmallVector<int> Mask(getVectorFactor(), PoisonMaskElem);
3958 unsigned CommonVF = std::max<unsigned>(
3959 a: CombinedEntriesWithIndices.back().second,
3960 b: Scalars.size() - CombinedEntriesWithIndices.back().second);
3961 for (auto [Idx, I] : enumerate(First: ReorderIndices))
3962 Mask[I] =
3963 Idx + (Idx >= CombinedEntriesWithIndices.back().second
3964 ? CommonVF - CombinedEntriesWithIndices.back().second
3965 : 0);
3966 return Mask;
3967 }
3968
3969 /// Updates (reorders) SplitVectorize node according to the given mask \p
3970 /// Mask and order \p MaskOrder.
3971 void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
3972 ArrayRef<int> MaskOrder);
3973
3974 /// \returns true if the scalars in VL are equal to this entry.
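    /// For example (hypothetical): with Scalars == {A, B}, empty
    /// ReorderIndices and ReuseShuffleIndices == {0, 1, 0, 1},
    /// isSame({A, B, A, B}) returns true.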
3975 bool isSame(ArrayRef<Value *> VL) const {
3976 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3977 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3978 return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
3979 return VL.size() == Mask.size() &&
3980 std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
3981 binary_pred: [Scalars](Value *V, int Idx) {
3982 return (isa<UndefValue>(Val: V) &&
3983 Idx == PoisonMaskElem) ||
3984 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3985 });
3986 };
3987 if (!ReorderIndices.empty()) {
3988 // TODO: implement matching if the nodes are just reordered, still can
3989 // treat the vector as the same if the list of scalars matches VL
3990 // directly, without reordering.
3991 SmallVector<int> Mask;
3992 inversePermutation(Indices: ReorderIndices, Mask);
3993 if (VL.size() == Scalars.size())
3994 return IsSame(Scalars, Mask);
3995 if (VL.size() == ReuseShuffleIndices.size()) {
3996 ::addMask(Mask, SubMask: ReuseShuffleIndices);
3997 return IsSame(Scalars, Mask);
3998 }
3999 return false;
4000 }
4001 return IsSame(Scalars, ReuseShuffleIndices);
4002 }
4003
4004 /// \returns true if current entry has same operands as \p TE.
4005 bool hasEqualOperands(const TreeEntry &TE) const {
4006 if (TE.getNumOperands() != getNumOperands())
4007 return false;
4008 SmallBitVector Used(getNumOperands());
4009 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
4010 unsigned PrevCount = Used.count();
4011 for (unsigned K = 0; K < E; ++K) {
4012 if (Used.test(Idx: K))
4013 continue;
4014 if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
4015 Used.set(K);
4016 break;
4017 }
4018 }
4019 // Check if we actually found the matching operand.
4020 if (PrevCount == Used.count())
4021 return false;
4022 }
4023 return true;
4024 }
4025
4026    /// \return Final vectorization factor for the node. Defined by the total
4027    /// number of vectorized scalars, including those used several times in the
4028    /// entry and counted in the \a ReuseShuffleIndices, if any.
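    /// For example (hypothetical): a node with Scalars == {A, B} and
    /// ReuseShuffleIndices == {0, 1, 0, 1} has a vector factor of 4.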
4029 unsigned getVectorFactor() const {
4030 if (!ReuseShuffleIndices.empty())
4031 return ReuseShuffleIndices.size();
4032 return Scalars.size();
4033 };
4034
4035 /// Checks if the current node is a gather node.
4036 bool isGather() const { return State == NeedToGather; }
4037
4038 /// A vector of scalars.
4039 ValueList Scalars;
4040
4041 /// The Scalars are vectorized into this value. It is initialized to Null.
4042 WeakTrackingVH VectorizedValue = nullptr;
4043
4044 /// Do we need to gather this sequence or vectorize it
4045 /// (either with vector instruction or with scatter/gather
4046 /// intrinsics for store/load)?
4047 enum EntryState {
4048 Vectorize, ///< The node is regularly vectorized.
4049 ScatterVectorize, ///< Masked scatter/gather node.
4050 StridedVectorize, ///< Strided loads (and stores)
4051 CompressVectorize, ///< (Masked) load with compress.
4052 NeedToGather, ///< Gather/buildvector node.
4053 CombinedVectorize, ///< Vectorized node, combined with its user into more
4054 ///< complex node like select/cmp to minmax, mul/add to
4055 ///< fma, etc. Must be used for the following nodes in
4056 ///< the pattern, not the very first one.
4057 SplitVectorize, ///< Splits the node into 2 subnodes, vectorizes them
4058 ///< independently and then combines back.
4059 };
4060 EntryState State;
4061
4062 /// List of combined opcodes supported by the vectorizer.
4063 enum CombinedOpcode {
4064 NotCombinedOp = -1,
4065 MinMax = Instruction::OtherOpsEnd + 1,
4066 FMulAdd,
4067 ReducedBitcast,
4068 ReducedBitcastBSwap,
4069 };
4070 CombinedOpcode CombinedOp = NotCombinedOp;
4071
4072 /// Does this sequence require some shuffling?
4073 SmallVector<int, 4> ReuseShuffleIndices;
4074
4075 /// Does this entry require reordering?
4076 SmallVector<unsigned, 4> ReorderIndices;
4077
4078 /// Points back to the VectorizableTree.
4079 ///
4080 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
4081 /// to be a pointer and needs to be able to initialize the child iterator.
4082 /// Thus we need a reference back to the container to translate the indices
4083 /// to entries.
4084 VecTreeTy &Container;
4085
4086 /// The TreeEntry index containing the user of this entry.
4087 EdgeInfo UserTreeIndex;
4088
4089 /// The index of this treeEntry in VectorizableTree.
4090 unsigned Idx = 0;
4091
4092 /// For gather/buildvector/alt-opcode nodes which are combined from
4093 /// other nodes as a series of insertvector instructions.
4094 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
4095
4096 private:
4097 /// The operands of each instruction in each lane Operands[op_index][lane].
4098 /// Note: This helps avoid the replication of the code that performs the
4099 /// reordering of operands during buildTreeRec() and vectorizeTree().
4100 SmallVector<ValueList, 2> Operands;
4101
4102 /// Copyable elements of the entry node.
4103 SmallPtrSet<const Value *, 4> CopyableElements;
4104
4105 /// MainOp and AltOp are recorded inside. S should be obtained from
4106 /// newTreeEntry.
4107 InstructionsState S = InstructionsState::invalid();
4108
4109 /// Interleaving factor for interleaved loads Vectorize nodes.
4110 unsigned InterleaveFactor = 0;
4111
4112 /// True if the node does not require scheduling.
4113 bool DoesNotNeedToSchedule = false;
4114
4115 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
4116 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
4117 if (Operands.size() < OpIdx + 1)
4118 Operands.resize(N: OpIdx + 1);
4119 assert(Operands[OpIdx].empty() && "Already resized?");
4120 assert(OpVL.size() <= Scalars.size() &&
4121 "Number of operands is greater than the number of scalars.");
4122 Operands[OpIdx].resize(N: OpVL.size());
4123 copy(Range&: OpVL, Out: Operands[OpIdx].begin());
4124 }
4125
4126 public:
4127 /// Returns interleave factor for interleave nodes.
4128 unsigned getInterleaveFactor() const { return InterleaveFactor; }
4129 /// Sets interleaving factor for the interleaving nodes.
4130 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
4131
4132 /// Marks the node as one that does not require scheduling.
4133 void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
4134 /// Returns true if the node is marked as one that does not require
4135 /// scheduling.
4136 bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }
4137
4138 /// Set this bundle's operands from \p Operands.
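/// For a node of binary operators this is typically a list of the per-lane
/// first operands followed by a list of the per-lane second operands.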
4139 void setOperands(ArrayRef<ValueList> Operands) {
4140 for (unsigned I : seq<unsigned>(Size: Operands.size()))
4141 setOperand(OpIdx: I, OpVL: Operands[I]);
4142 }
4143
4144 /// Reorders operands of the node to the given mask \p Mask.
4145 void reorderOperands(ArrayRef<int> Mask) {
4146 for (ValueList &Operand : Operands)
4147 reorderScalars(Scalars&: Operand, Mask);
4148 }
4149
4150 /// \returns the \p OpIdx operand of this TreeEntry.
4151 ValueList &getOperand(unsigned OpIdx) {
4152 assert(OpIdx < Operands.size() && "Off bounds");
4153 return Operands[OpIdx];
4154 }
4155
4156 /// \returns the \p OpIdx operand of this TreeEntry.
4157 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
4158 assert(OpIdx < Operands.size() && "Off bounds");
4159 return Operands[OpIdx];
4160 }
4161
4162 /// \returns the number of operands.
4163 unsigned getNumOperands() const { return Operands.size(); }
4164
4165 /// \return the single \p OpIdx operand.
4166 Value *getSingleOperand(unsigned OpIdx) const {
4167 assert(OpIdx < Operands.size() && "Off bounds");
4168 assert(!Operands[OpIdx].empty() && "No operand available");
4169 return Operands[OpIdx][0];
4170 }
4171
4172 /// Some of the instructions in the list have alternate opcodes.
4173 bool isAltShuffle() const { return S.isAltShuffle(); }
4174
4175 Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
4176 return S.getMatchingMainOpOrAltOp(I);
4177 }
4178
4179 /// Chooses the correct key for scheduling data. If \p Op has the same (or
4180 /// alternate) opcode as the main operation of this entry, the key is \p Op.
4181 /// Otherwise the key is the main operation.
4182 Value *isOneOf(Value *Op) const {
4183 auto *I = dyn_cast<Instruction>(Val: Op);
4184 if (I && getMatchingMainOpOrAltOp(I))
4185 return Op;
4186 return S.getMainOp();
4187 }
4188
4189 void setOperations(const InstructionsState &S) {
4190 assert(S && "InstructionsState is invalid.");
4191 this->S = S;
4192 }
4193
4194 Instruction *getMainOp() const { return S.getMainOp(); }
4195
4196 Instruction *getAltOp() const { return S.getAltOp(); }
4197
4198 /// The main/alternate opcodes for the list of instructions.
4199 unsigned getOpcode() const { return S.getOpcode(); }
4200
4201 unsigned getAltOpcode() const { return S.getAltOpcode(); }
4202
4203 bool hasState() const { return S.valid(); }
4204
4205 /// Add \p V to the list of copyable elements.
4206 void addCopyableElement(Value *V) {
4207 assert(S.isCopyableElement(V) && "Not a copyable element.");
4208 CopyableElements.insert(Ptr: V);
4209 }
4210
4211 /// Returns true if \p V is a copyable element.
4212 bool isCopyableElement(Value *V) const {
4213 return CopyableElements.contains(Ptr: V);
4214 }
4215
4216 /// Returns true if any scalar in the list is a copyable element.
4217 bool hasCopyableElements() const { return !CopyableElements.empty(); }
4218
4219 /// Returns the state of the operations.
4220 const InstructionsState &getOperations() const { return S; }
4221
4222 /// Returns the lane of \p V within Scalars, remapped through
4223 /// ReorderIndices and, when non-empty, through ReuseShuffleIndices.
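/// For example, with Scalars == {%a, %b, %c, %d}, empty ReorderIndices and
/// ReuseShuffleIndices == {1, 0, 3, 2}, findLaneForValue(%c) first finds %c
/// at position 2 in Scalars and then returns 3, the position of value 2
/// within ReuseShuffleIndices.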
4224 unsigned findLaneForValue(Value *V) const {
4225 unsigned FoundLane = getVectorFactor();
4226 for (auto *It = find(Range: Scalars, Val: V), *End = Scalars.end(); It != End;
4227 std::advance(i&: It, n: 1)) {
4228 if (*It != V)
4229 continue;
4230 FoundLane = std::distance(first: Scalars.begin(), last: It);
4231 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4232 if (!ReorderIndices.empty())
4233 FoundLane = ReorderIndices[FoundLane];
4234 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
4235 if (ReuseShuffleIndices.empty())
4236 break;
4237 if (auto *RIt = find(Range: ReuseShuffleIndices, Val: FoundLane);
4238 RIt != ReuseShuffleIndices.end()) {
4239 FoundLane = std::distance(first: ReuseShuffleIndices.begin(), last: RIt);
4240 break;
4241 }
4242 }
4243 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
4244 return FoundLane;
4245 }
4246
4247 /// Build a shuffle mask for graph entry which represents a merge of main
4248 /// and alternate operations.
4249 void
4250 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
4251 SmallVectorImpl<int> &Mask,
4252 SmallVectorImpl<Value *> *OpScalars = nullptr,
4253 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
4254
4255 /// Return true if this is a non-power-of-2 node.
4256 bool isNonPowOf2Vec() const {
4257 bool IsNonPowerOf2 = !has_single_bit(Value: Scalars.size());
4258 return IsNonPowerOf2;
4259 }
4260
4261 /// Return true if this node vectorizes a number of elements that neither
4262 /// fills whole vector registers nor is a power of 2.
4263 bool
4264 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
4265 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
4266 TTI, Ty: getValueType(V: Scalars.front()), Sz: Scalars.size());
4267 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
4268 "Reshuffling not supported with non-power-of-2 vectors yet.");
4269 return IsNonPowerOf2;
4270 }
4271
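/// \returns the scalar that ends up at lane \p Idx when each Scalars[I] is
/// moved to lane ReorderIndices[I]. For instance, with Scalars == {%a, %b}
/// and ReorderIndices == {1, 0}, getOrdered(0) yields %b and getOrdered(1)
/// yields %a.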
4272 Value *getOrdered(unsigned Idx) const {
4273 if (ReorderIndices.empty())
4274 return Scalars[Idx];
4275 SmallVector<int> Mask;
4276 inversePermutation(Indices: ReorderIndices, Mask);
4277 return Scalars[Mask[Idx]];
4278 }
4279
4280#ifndef NDEBUG
4281 /// Debug printer.
4282 LLVM_DUMP_METHOD void dump() const {
4283 dbgs() << Idx << ".\n";
4284 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
4285 dbgs() << "Operand " << OpI << ":\n";
4286 for (const Value *V : Operands[OpI])
4287 dbgs().indent(2) << *V << "\n";
4288 }
4289 dbgs() << "Scalars: \n";
4290 for (Value *V : Scalars)
4291 dbgs().indent(2) << *V << "\n";
4292 dbgs() << "State: ";
4293 if (S && hasCopyableElements())
4294 dbgs() << "[[Copyable]] ";
4295 switch (State) {
4296 case Vectorize:
4297 if (InterleaveFactor > 0) {
4298 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
4299 << "\n";
4300 } else {
4301 dbgs() << "Vectorize\n";
4302 }
4303 break;
4304 case ScatterVectorize:
4305 dbgs() << "ScatterVectorize\n";
4306 break;
4307 case StridedVectorize:
4308 dbgs() << "StridedVectorize\n";
4309 break;
4310 case CompressVectorize:
4311 dbgs() << "CompressVectorize\n";
4312 break;
4313 case NeedToGather:
4314 dbgs() << "NeedToGather\n";
4315 break;
4316 case CombinedVectorize:
4317 dbgs() << "CombinedVectorize\n";
4318 break;
4319 case SplitVectorize:
4320 dbgs() << "SplitVectorize\n";
4321 break;
4322 }
4323 if (S) {
4324 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
4325 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
4326 } else {
4327 dbgs() << "MainOp: NULL\n";
4328 dbgs() << "AltOp: NULL\n";
4329 }
4330 dbgs() << "VectorizedValue: ";
4331 if (VectorizedValue)
4332 dbgs() << *VectorizedValue << "\n";
4333 else
4334 dbgs() << "NULL\n";
4335 dbgs() << "ReuseShuffleIndices: ";
4336 if (ReuseShuffleIndices.empty())
4337 dbgs() << "Empty";
4338 else
4339 for (int ReuseIdx : ReuseShuffleIndices)
4340 dbgs() << ReuseIdx << ", ";
4341 dbgs() << "\n";
4342 dbgs() << "ReorderIndices: ";
4343 for (unsigned ReorderIdx : ReorderIndices)
4344 dbgs() << ReorderIdx << ", ";
4345 dbgs() << "\n";
4346 dbgs() << "UserTreeIndex: ";
4347 if (UserTreeIndex)
4348 dbgs() << UserTreeIndex;
4349 else
4350 dbgs() << "<invalid>";
4351 dbgs() << "\n";
4352 if (!CombinedEntriesWithIndices.empty()) {
4353 dbgs() << "Combined entries: ";
4354 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
4355 dbgs() << "Entry index " << P.first << " with offset " << P.second;
4356 });
4357 dbgs() << "\n";
4358 }
4359 }
4360#endif
4361 };
4362
4363#ifndef NDEBUG
4364 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4365 InstructionCost VecCost, InstructionCost ScalarCost,
4366 StringRef Banner) const {
4367 dbgs() << "SLP: " << Banner << ":\n";
4368 E->dump();
4369 dbgs() << "SLP: Costs:\n";
4370 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4371 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4372 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4373 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4374 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4375 }
4376#endif
4377
4378 /// Create a new gather TreeEntry
4379 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4380 const InstructionsState &S,
4381 const EdgeInfo &UserTreeIdx,
4382 ArrayRef<int> ReuseShuffleIndices = {}) {
4383 auto Invalid = ScheduleBundle::invalid();
4384 return newTreeEntry(VL, Bundle&: Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4385 }
4386
4387 /// Create a new VectorizableTree entry.
4388 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4389 const InstructionsState &S,
4390 const EdgeInfo &UserTreeIdx,
4391 ArrayRef<int> ReuseShuffleIndices = {},
4392 ArrayRef<unsigned> ReorderIndices = {},
4393 unsigned InterleaveFactor = 0) {
4394 TreeEntry::EntryState EntryState =
4395 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4396 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4397 ReuseShuffleIndices, ReorderIndices);
4398 if (E && InterleaveFactor > 0)
4399 E->setInterleave(InterleaveFactor);
4400 return E;
4401 }
4402
4403 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
4404 TreeEntry::EntryState EntryState,
4405 ScheduleBundle &Bundle, const InstructionsState &S,
4406 const EdgeInfo &UserTreeIdx,
4407 ArrayRef<int> ReuseShuffleIndices = {},
4408 ArrayRef<unsigned> ReorderIndices = {}) {
4409 assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
4410 EntryState == TreeEntry::SplitVectorize)) ||
4411 (Bundle && EntryState != TreeEntry::NeedToGather &&
4412 EntryState != TreeEntry::SplitVectorize)) &&
4413 "Need to vectorize gather entry?");
4414 // Gathered loads still gathered? Do not create entry, use the original one.
4415 if (GatheredLoadsEntriesFirst.has_value() &&
4416 EntryState == TreeEntry::NeedToGather && S &&
4417 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
4418 !UserTreeIdx.UserTE)
4419 return nullptr;
4420 VectorizableTree.push_back(Elt: std::make_unique<TreeEntry>(args&: VectorizableTree));
4421 TreeEntry *Last = VectorizableTree.back().get();
4422 Last->Idx = VectorizableTree.size() - 1;
4423 Last->State = EntryState;
4424 if (UserTreeIdx.UserTE)
4425 OperandsToTreeEntry.try_emplace(
4426 Key: std::make_pair(x: UserTreeIdx.UserTE, y: UserTreeIdx.EdgeIdx), Args&: Last);
4427 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
4428 // for non-power-of-two vectors.
4429 assert(
4430 (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
4431 ReuseShuffleIndices.empty()) &&
4432 "Reshuffling scalars not yet supported for nodes with padding");
4433 Last->ReuseShuffleIndices.append(in_start: ReuseShuffleIndices.begin(),
4434 in_end: ReuseShuffleIndices.end());
4435 if (ReorderIndices.empty()) {
4436 Last->Scalars.assign(in_start: VL.begin(), in_end: VL.end());
4437 if (S)
4438 Last->setOperations(S);
4439 } else {
4440 // Reorder scalars and build final mask.
4441 Last->Scalars.assign(NumElts: VL.size(), Elt: nullptr);
4442 transform(Range&: ReorderIndices, d_first: Last->Scalars.begin(),
4443 F: [VL](unsigned Idx) -> Value * {
4444 if (Idx >= VL.size())
4445 return UndefValue::get(T: VL.front()->getType());
4446 return VL[Idx];
4447 });
4448 InstructionsState S = getSameOpcode(VL: Last->Scalars, TLI: *TLI);
4449 if (S)
4450 Last->setOperations(S);
4451 Last->ReorderIndices.append(in_start: ReorderIndices.begin(), in_end: ReorderIndices.end());
4452 }
4453 if (EntryState == TreeEntry::SplitVectorize) {
4454 assert(S && "Split nodes must have operations.");
4455 Last->setOperations(S);
4456 SmallPtrSet<Value *, 4> Processed;
4457 for (Value *V : VL) {
4458 auto *I = dyn_cast<Instruction>(Val: V);
4459 if (!I)
4460 continue;
4461 auto It = ScalarsInSplitNodes.find(Val: V);
4462 if (It == ScalarsInSplitNodes.end()) {
4463 ScalarsInSplitNodes.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
4464 (void)Processed.insert(Ptr: V);
4465 } else if (Processed.insert(Ptr: V).second) {
4466 assert(!is_contained(It->getSecond(), Last) &&
4467 "Value already associated with the node.");
4468 It->getSecond().push_back(Elt: Last);
4469 }
4470 }
4471 } else if (!Last->isGather()) {
4472 if (isa<PHINode>(Val: S.getMainOp()) ||
4473 isVectorLikeInstWithConstOps(V: S.getMainOp()) ||
4474 (!S.areInstructionsWithCopyableElements() &&
4475 doesNotNeedToSchedule(VL)) ||
4476 all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); }))
4477 Last->setDoesNotNeedToSchedule();
4478 SmallPtrSet<Value *, 4> Processed;
4479 for (Value *V : VL) {
4480 if (isa<PoisonValue>(Val: V))
4481 continue;
4482 if (S.isCopyableElement(V)) {
4483 Last->addCopyableElement(V);
4484 continue;
4485 }
4486 auto It = ScalarToTreeEntries.find(Val: V);
4487 if (It == ScalarToTreeEntries.end()) {
4488 ScalarToTreeEntries.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
4489 (void)Processed.insert(Ptr: V);
4490 } else if (Processed.insert(Ptr: V).second) {
4491 assert(!is_contained(It->getSecond(), Last) &&
4492 "Value already associated with the node.");
4493 It->getSecond().push_back(Elt: Last);
4494 }
4495 }
4496 // Update the scheduler bundle to point to this TreeEntry.
4497 assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
4498 "Bundle and VL out of sync");
4499 if (!Bundle.getBundle().empty()) {
4500#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
4501 auto *BundleMember = Bundle.getBundle().begin();
4502 SmallPtrSet<Value *, 4> Processed;
4503 for (Value *V : VL) {
4504 if (S.isNonSchedulable(V) || !Processed.insert(V).second)
4505 continue;
4506 ++BundleMember;
4507 }
4508 assert(BundleMember == Bundle.getBundle().end() &&
4509 "Bundle and VL out of sync");
4510#endif
4511 Bundle.setTreeEntry(Last);
4512 }
4513 } else {
4514 // Build a map for gathered scalars to the nodes where they are used.
4515 bool AllConstsOrCasts = true;
4516 for (Value *V : VL) {
4517 if (S && S.areInstructionsWithCopyableElements() &&
4518 S.isCopyableElement(V))
4519 Last->addCopyableElement(V);
4520 if (!isConstant(V)) {
4521 auto *I = dyn_cast<CastInst>(Val: V);
4522 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
4523 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
4524 !UserTreeIdx.UserTE->isGather())
4525 ValueToGatherNodes.try_emplace(Key: V).first->getSecond().insert(X: Last);
4526 }
4527 }
4528 if (AllConstsOrCasts)
4529 CastMaxMinBWSizes =
4530 std::make_pair(x: std::numeric_limits<unsigned>::max(), y: 1);
4531 MustGather.insert_range(R&: VL);
4532 }
4533
4534 if (UserTreeIdx.UserTE)
4535 Last->UserTreeIndex = UserTreeIdx;
4536 return Last;
4537 }
4538
4539 /// -- Vectorization State --
4540 /// Holds all of the tree entries.
4541 TreeEntry::VecTreeTy VectorizableTree;
4542
4543#ifndef NDEBUG
4544 /// Debug printer.
4545 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4546 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4547 VectorizableTree[Id]->dump();
4548 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4549 dbgs() << "[[TRANSFORMED TO GATHER]]";
4550 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4551 dbgs() << "[[DELETED NODE]]";
4552 dbgs() << "\n";
4553 }
4554 }
4555#endif
4556
4557 /// Get list of vector entries, associated with the value \p V.
4558 ArrayRef<TreeEntry *> getTreeEntries(Value *V) const {
4559 assert(V && "V cannot be nullptr.");
4560 auto It = ScalarToTreeEntries.find(Val: V);
4561 if (It == ScalarToTreeEntries.end())
4562 return {};
4563 return It->getSecond();
4564 }
4565
4566 /// Get list of split vector entries, associated with the value \p V.
4567 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4568 assert(V && "V cannot be nullptr.");
4569 auto It = ScalarsInSplitNodes.find(Val: V);
4570 if (It == ScalarsInSplitNodes.end())
4571 return {};
4572 return It->getSecond();
4573 }
4574
4575 /// Returns first vector node for value \p V, matching values \p VL.
4576 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4577 bool SameVF = false) const {
4578 assert(V && "V cannot be nullptr.");
4579 for (TreeEntry *TE : ScalarToTreeEntries.lookup(Val: V))
4580 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4581 return TE;
4582 return nullptr;
4583 }
4584
4585 /// Check that the operand node of an alternate node does not generate a
4586 /// buildvector sequence. If it does, it is probably not worth building an
4587 /// alternate shuffle when the number of buildvector operands plus the
4588 /// alternate instruction exceeds the number of buildvector instructions.
4589 /// \param S the instructions state of the analyzed values.
4590 /// \param VL list of the instructions with alternate opcodes.
4591 bool areAltOperandsProfitable(const InstructionsState &S,
4592 ArrayRef<Value *> VL) const;
4593
4594 /// Contains all the outputs of legality analysis for a list of values to
4595 /// vectorize.
4596 class ScalarsVectorizationLegality {
4597 InstructionsState S;
4598 bool IsLegal;
4599 bool TryToFindDuplicates;
4600 bool TrySplitVectorize;
4601
4602 public:
4603 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4604 bool TryToFindDuplicates = true,
4605 bool TrySplitVectorize = false)
4606 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4607 TrySplitVectorize(TrySplitVectorize) {
4608 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4609 "Inconsistent state");
4610 }
4611 const InstructionsState &getInstructionsState() const { return S; };
4612 bool isLegal() const { return IsLegal; }
4613 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4614 bool trySplitVectorize() const { return TrySplitVectorize; }
4615 };
4616
4617 /// Checks if the specified list of the instructions/values can be vectorized
4618 /// in general.
4619 ScalarsVectorizationLegality
4620 getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
4621 const EdgeInfo &UserTreeIdx,
4622 bool TryCopyableElementsVectorization) const;
4623
4624 /// Checks if the specified list of the instructions/values can be vectorized
4625 /// and fills required data before actual scheduling of the instructions.
4626 TreeEntry::EntryState getScalarsVectorizationState(
4627 const InstructionsState &S, ArrayRef<Value *> VL,
4628 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
4629 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);
4630
4631 /// Maps a specific scalar to its tree entry(ies).
4632 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;
4633
4634 /// List of deleted non-profitable nodes.
4635 SmallPtrSet<const TreeEntry *, 8> DeletedNodes;
4636
4637 /// List of nodes transformed to gather nodes, with their conservative
4638 /// gather/buildvector cost estimation.
4639 SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;
4640
4641 /// Maps the operand index and entry to the corresponding tree entry.
4642 SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
4643 OperandsToTreeEntry;
4644
4645 /// Scalars, used in split vectorize nodes.
4646 SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;
4647
4648 /// Maps a value to the proposed vectorizable size.
4649 SmallDenseMap<Value *, unsigned> InstrElementSize;
4650
4651 /// A list of scalars that we found that we need to keep as scalars.
4652 ValueSet MustGather;
4653
4654 /// A set of first non-schedulable values.
4655 ValueSet NonScheduledFirst;
4656
4657 /// A map between the vectorized entries and the last instructions in the
4658 /// bundles. The bundles are built in use order, not in the def order of the
4659 /// instructions. So we cannot rely directly on the last instruction in the
4660 /// bundle being the last instruction in program order during the
4661 /// vectorization process, since the basic blocks are modified; the last
4662 /// instructions need to be pre-gathered beforehand.
4663 SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;
4664
4665 /// Keeps the mapping between the last instructions and their insertion
4666 /// points, which is an instruction-after-the-last-instruction.
4667 SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;
4668
4669 /// List of gather nodes that depend on other gather/vector nodes and should
4670 /// be emitted after the vector instruction emission process to correctly
4671 /// handle the order of the vector instructions and shuffles.
4672 SetVector<const TreeEntry *> PostponedGathers;
4673
4674 using ValueToGatherNodesMap =
4675 DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
4676 ValueToGatherNodesMap ValueToGatherNodes;
4677
4678 /// A list of the load entries (node indices) that can be vectorized using a
4679 /// strided or masked gather approach, but which we first attempt to
4680 /// represent as contiguous loads.
4681 SetVector<unsigned> LoadEntriesToVectorize;
4682
4683 /// true if graph nodes transforming mode is on.
4684 bool IsGraphTransformMode = false;
4685
4686 /// The index of the first gathered load entry in the VectorizeTree.
4687 std::optional<unsigned> GatheredLoadsEntriesFirst;
4688
4689 /// Maps compress entries to their mask data for the final codegen.
4690 SmallDenseMap<const TreeEntry *,
4691 std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
4692 CompressEntryToData;
4693
4694 /// This POD struct describes one external user in the vectorized tree.
4695 struct ExternalUser {
4696 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4697 : Scalar(S), User(U), E(E), Lane(L) {}
4698
4699 /// Which scalar in our function.
4700 Value *Scalar = nullptr;
4701
4702 /// Which user that uses the scalar.
4703 llvm::User *User = nullptr;
4704
4705 /// Vector node, the value is part of.
4706 const TreeEntry &E;
4707
4708 /// Which lane does the scalar belong to.
4709 unsigned Lane;
4710 };
4711 using UserList = SmallVector<ExternalUser, 16>;
4712
4713 /// Checks if two instructions may access the same memory.
4714 ///
4715 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4716 /// is invariant in the calling loop.
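/// The result is cached under the (Inst1, Inst2) key, which is why \p Loc1
/// must always be the memory location of \p Inst1.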
4717 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4718 Instruction *Inst2) {
4719 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4720 // First check if the result is already in the cache.
4721 AliasCacheKey Key = std::make_pair(x&: Inst1, y&: Inst2);
4722 auto Res = AliasCache.try_emplace(Key);
4723 if (!Res.second)
4724 return Res.first->second;
4725 bool Aliased = isModOrRefSet(MRI: BatchAA.getModRefInfo(I: Inst2, OptLoc: Loc1));
4726 // Store the result in the cache.
4727 Res.first->getSecond() = Aliased;
4728 return Aliased;
4729 }
4730
4731 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
4732
4733 /// Cache for alias results.
4734 /// TODO: consider moving this to the AliasAnalysis itself.
4735 SmallDenseMap<AliasCacheKey, bool> AliasCache;
4736
4737 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
4738 // globally through SLP because we don't perform any action which
4739 // invalidates capture results.
4740 BatchAAResults BatchAA;
4741
4742 /// Temporary store for deleted instructions. Instructions will be deleted
4743 /// eventually when the BoUpSLP is destructed. The deferral is required to
4744 /// ensure that there are no incorrect collisions in the AliasCache, which
4745 /// can happen if a new instruction is allocated at the same address as a
4746 /// previously deleted instruction.
4747 DenseSet<Instruction *> DeletedInstructions;
4748
4749 /// Set of instructions already analyzed for reductions.
4750 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
4751
4752 /// Set of hashes for the list of reduction values already being analyzed.
4753 DenseSet<size_t> AnalyzedReductionVals;
4754
4755 /// Values that have already been analyzed for minimal bitwidth and found to
4756 /// be non-profitable.
4757 DenseSet<Value *> AnalyzedMinBWVals;
4758
4759 /// A list of values that need to extracted out of the tree.
4760 /// This list holds pairs of (Internal Scalar : External User). External User
4761 /// can be nullptr, it means that this Internal Scalar will be used later,
4762 /// after vectorization.
4763 UserList ExternalUses;
4764
4765 /// A list of GEPs which can be replaced by scalar GEPs instead of
4766 /// extractelement instructions.
4767 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
4768
4769 /// A list of scalars to be extracted without a specific user because of too
4770 /// many uses.
4771 SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;
4772
4773 /// Values used only by @llvm.assume calls.
4774 SmallPtrSet<const Value *, 32> EphValues;
4775
4776 /// Holds all of the instructions that we gathered, shuffle instructions and
4777 /// extractelements.
4778 SetVector<Instruction *> GatherShuffleExtractSeq;
4779
4780 /// A list of blocks that we are going to CSE.
4781 DenseSet<BasicBlock *> CSEBlocks;
4782
4783 /// List of hashes of load vectors which are known to be non-vectorizable.
4784 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4785
4786 /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
4787 /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
4788 /// single instruction, while ScheduleBundle represents a batch of
4789 /// instructions going to be grouped together. ScheduleCopyableData models an
4790 /// extra user for "copyable" instructions.
4791 class ScheduleEntity {
4792 friend class ScheduleBundle;
4793 friend class ScheduleData;
4794 friend class ScheduleCopyableData;
4795
4796 protected:
4797 enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
4798 Kind getKind() const { return K; }
4799 ScheduleEntity(Kind K) : K(K) {}
4800
4801 private:
4802 /// Used for getting a "good" final ordering of instructions.
4803 int SchedulingPriority = 0;
4804 /// True if this instruction (or bundle) is scheduled (or considered as
4805 /// scheduled in the dry-run).
4806 bool IsScheduled = false;
4807 /// The kind of the ScheduleEntity.
4808 const Kind K = Kind::ScheduleData;
4809
4810 public:
4811 ScheduleEntity() = delete;
4812 /// Gets/sets the scheduling priority.
4813 void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
4814 int getSchedulingPriority() const { return SchedulingPriority; }
4815 bool isReady() const {
4816 if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
4817 return SD->isReady();
4818 if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
4819 return CD->isReady();
4820 return cast<ScheduleBundle>(Val: this)->isReady();
4821 }
4822 /// Returns true if the dependency information has been calculated.
4823 /// Note that dependency validity can vary between instructions within
4824 /// a single bundle.
4825 bool hasValidDependencies() const {
4826 if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
4827 return SD->hasValidDependencies();
4828 if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
4829 return CD->hasValidDependencies();
4830 return cast<ScheduleBundle>(Val: this)->hasValidDependencies();
4831 }
4832 /// Gets the number of unscheduled dependencies.
4833 int getUnscheduledDeps() const {
4834 if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
4835 return SD->getUnscheduledDeps();
4836 if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
4837 return CD->getUnscheduledDeps();
4838 return cast<ScheduleBundle>(Val: this)->unscheduledDepsInBundle();
4839 }
4840 /// Increments the number of unscheduled dependencies.
4841 int incrementUnscheduledDeps(int Incr) {
4842 if (auto *SD = dyn_cast<ScheduleData>(Val: this))
4843 return SD->incrementUnscheduledDeps(Incr);
4844 return cast<ScheduleCopyableData>(Val: this)->incrementUnscheduledDeps(Incr);
4845 }
4846 /// Gets the number of dependencies.
4847 int getDependencies() const {
4848 if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
4849 return SD->getDependencies();
4850 return cast<ScheduleCopyableData>(Val: this)->getDependencies();
4851 }
4852 /// Gets the instruction.
4853 Instruction *getInst() const {
4854 if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
4855 return SD->getInst();
4856 return cast<ScheduleCopyableData>(Val: this)->getInst();
4857 }
4858
4859 /// Gets/sets if the bundle is scheduled.
4860 bool isScheduled() const { return IsScheduled; }
4861 void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }
4862
4863 static bool classof(const ScheduleEntity *) { return true; }
4864
4865#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4866 void dump(raw_ostream &OS) const {
4867 if (const auto *SD = dyn_cast<ScheduleData>(this))
4868 return SD->dump(OS);
4869 if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
4870 return CD->dump(OS);
4871 return cast<ScheduleBundle>(this)->dump(OS);
4872 }
4873
4874 LLVM_DUMP_METHOD void dump() const {
4875 dump(dbgs());
4876 dbgs() << '\n';
4877 }
4878#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4879 };
4880
4881#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
4882 friend inline raw_ostream &operator<<(raw_ostream &OS,
4883 const BoUpSLP::ScheduleEntity &SE) {
4884 SE.dump(OS);
4885 return OS;
4886 }
4887#endif
4888
4889 /// Contains all scheduling relevant data for an instruction.
4890 /// A ScheduleData either represents a single instruction or a member of an
4891 /// instruction bundle (= a group of instructions which is combined into a
4892 /// vector instruction).
4893 class ScheduleData final : public ScheduleEntity {
4894 public:
4895 // The initial value for the dependency counters. It means that the
4896 // dependencies are not calculated yet.
4897 enum { InvalidDeps = -1 };
4898
4899 ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
4900 static bool classof(const ScheduleEntity *Entity) {
4901 return Entity->getKind() == Kind::ScheduleData;
4902 }
4903
4904 void init(int BlockSchedulingRegionID, Instruction *I) {
4905 NextLoadStore = nullptr;
4906 IsScheduled = false;
4907 SchedulingRegionID = BlockSchedulingRegionID;
4908 clearDependencies();
4909 Inst = I;
4910 }
4911
4912 /// Verify basic self consistency properties
4913 void verify() {
4914 if (hasValidDependencies()) {
4915 assert(UnscheduledDeps <= Dependencies && "invariant");
4916 } else {
4917 assert(UnscheduledDeps == Dependencies && "invariant");
4918 }
4919
4920 if (IsScheduled) {
4921 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
4922 "unexpected scheduled state");
4923 }
4924 }
4925
4926 /// Returns true if the dependency information has been calculated.
4927 /// Note that dependency validity can vary between instructions within
4928 /// a single bundle.
4929 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
4930
4931 /// Returns true if it is ready for scheduling, i.e. it has no more
4932 /// unscheduled depending instructions/bundles.
4933 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
4934
4935 /// Modifies the number of unscheduled dependencies for this instruction
4936 /// and returns the remaining number of unscheduled dependencies for this
4937 /// instruction.
4938 int incrementUnscheduledDeps(int Incr) {
4939 assert(hasValidDependencies() &&
4940 "increment of unscheduled deps would be meaningless");
4941 UnscheduledDeps += Incr;
4942 assert(UnscheduledDeps >= 0 &&
4943 "Expected valid number of unscheduled deps");
4944 return UnscheduledDeps;
4945 }
4946
4947 /// Sets the number of unscheduled dependencies to the number of
4948 /// dependencies.
4949 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
4950
4951 /// Clears all dependency information.
4952 void clearDependencies() {
4953 clearDirectDependencies();
4954 MemoryDependencies.clear();
4955 ControlDependencies.clear();
4956 }
4957
4958 /// Clears only the direct dependencies, keeping the control and memory
4959 /// dependencies.
4960 /// Required for copyable elements to correctly handle control/memory deps
4961 /// and avoid extra recalculation of such deps.
4962 void clearDirectDependencies() {
4963 Dependencies = InvalidDeps;
4964 resetUnscheduledDeps();
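// This copies InvalidDeps into UnscheduledDeps, so hasValidDependencies()
// becomes false and the verify() invariant UnscheduledDeps == Dependencies
// still holds.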
4965 IsScheduled = false;
4966 }
4967
4968 /// Gets the number of unscheduled dependencies.
4969 int getUnscheduledDeps() const { return UnscheduledDeps; }
4970 /// Gets the number of dependencies.
4971 int getDependencies() const { return Dependencies; }
4972 /// Initializes the number of dependencies.
4973 void initDependencies() { Dependencies = 0; }
4974 /// Increments the number of dependencies.
4975 void incDependencies() { Dependencies++; }
4976
4977 /// Gets scheduling region ID.
4978 int getSchedulingRegionID() const { return SchedulingRegionID; }
4979
4980 /// Gets the instruction.
4981 Instruction *getInst() const { return Inst; }
4982
4983 /// Gets the list of memory dependencies.
4984 ArrayRef<ScheduleData *> getMemoryDependencies() const {
4985 return MemoryDependencies;
4986 }
4987 /// Adds a memory dependency.
4988 void addMemoryDependency(ScheduleData *Dep) {
4989 MemoryDependencies.push_back(Elt: Dep);
4990 }
4991 /// Gets the list of control dependencies.
4992 ArrayRef<ScheduleData *> getControlDependencies() const {
4993 return ControlDependencies;
4994 }
4995 /// Adds a control dependency.
4996 void addControlDependency(ScheduleData *Dep) {
4997 ControlDependencies.push_back(Elt: Dep);
4998 }
4999 /// Gets/sets the next load/store instruction in the block.
5000 ScheduleData *getNextLoadStore() const { return NextLoadStore; }
5001 void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }
5002
5003 void dump(raw_ostream &OS) const { OS << *Inst; }
5004
5005 LLVM_DUMP_METHOD void dump() const {
5006 dump(OS&: dbgs());
5007 dbgs() << '\n';
5008 }
5009
5010 private:
5011 Instruction *Inst = nullptr;
5012
5013 /// Singly linked list of all memory instructions (e.g. load, store, call)
5014 /// in the block - until the end of the scheduling region.
5015 ScheduleData *NextLoadStore = nullptr;
5016
5017 /// The dependent memory instructions.
5018 /// This list is derived on demand in calculateDependencies().
5019 SmallVector<ScheduleData *> MemoryDependencies;
5020
5021 /// List of instructions which this instruction could be control dependent
5022 /// on. Allowing such nodes to be scheduled below this one could introduce
5023 /// a runtime fault which didn't exist in the original program.
5024 /// e.g. this is a load or udiv following a readonly call which infinitely loops
5025 SmallVector<ScheduleData *> ControlDependencies;
5026
5027 /// This ScheduleData is in the current scheduling region if this matches
5028 /// the current SchedulingRegionID of BlockScheduling.
5029 int SchedulingRegionID = 0;
5030
5031 /// The number of dependencies. Consists of the number of users of the
5032 /// instruction plus the number of dependent memory instructions (if any).
5033 /// This value is calculated on demand.
5034 /// If InvalidDeps, the number of dependencies is not calculated yet.
5035 int Dependencies = InvalidDeps;
5036
5037 /// The number of dependencies minus the number of dependencies of scheduled
5038 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5039 /// for scheduling.
5040 /// Note that this is negative as long as Dependencies is not calculated.
5041 int UnscheduledDeps = InvalidDeps;
5042 };
5043
5044#ifndef NDEBUG
5045 friend inline raw_ostream &operator<<(raw_ostream &OS,
5046 const BoUpSLP::ScheduleData &SD) {
5047 SD.dump(OS);
5048 return OS;
5049 }
5050#endif
5051
5052 class ScheduleBundle final : public ScheduleEntity {
5053 /// The schedule data for the instructions in the bundle.
5054 SmallVector<ScheduleEntity *> Bundle;
5055 /// True if this bundle is valid.
5056 bool IsValid = true;
5057 /// The TreeEntry that this bundle corresponds to.
5058 TreeEntry *TE = nullptr;
5059 ScheduleBundle(bool IsValid)
5060 : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}
5061
5062 public:
5063 ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
5064 static bool classof(const ScheduleEntity *Entity) {
5065 return Entity->getKind() == Kind::ScheduleBundle;
5066 }
5067
5068 /// Verify basic self consistency properties
5069 void verify() const {
5070 for (const ScheduleEntity *SD : Bundle) {
5071 if (SD->hasValidDependencies()) {
5072 assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
5073 "invariant");
5074 } else {
5075 assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
5076 "invariant");
5077 }
5078
5079 if (isScheduled()) {
5080 assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
5081 "unexpected scheduled state");
5082 }
5083 }
5084 }
5085
5086 /// Returns the number of unscheduled dependencies in the bundle.
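/// E.g., a bundle whose members report 1 and 2 unscheduled dependencies
/// reports 3; if any member still reports ScheduleData::InvalidDeps, the
/// whole bundle reports InvalidDeps.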
5087 int unscheduledDepsInBundle() const {
5088 assert(*this && "bundle must not be empty");
5089 int Sum = 0;
5090 for (const ScheduleEntity *BundleMember : Bundle) {
5091 if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
5092 return ScheduleData::InvalidDeps;
5093 Sum += BundleMember->getUnscheduledDeps();
5094 }
5095 return Sum;
5096 }
5097
5098 /// Returns true if the dependency information has been calculated.
5099 /// Note that dependency validity can vary between instructions within
5100 /// a single bundle.
5101 bool hasValidDependencies() const {
5102 return all_of(Range: Bundle, P: [](const ScheduleEntity *SD) {
5103 return SD->hasValidDependencies();
5104 });
5105 }
5106
5107 /// Returns true if it is ready for scheduling, i.e. it has no more
5108 /// unscheduled depending instructions/bundles.
5109 bool isReady() const {
5110 assert(*this && "bundle must not be empty");
5111 return unscheduledDepsInBundle() == 0 && !isScheduled();
5112 }
5113
5114 /// Returns the scheduling data entities, associated with the current
5115 /// bundle.
5116 ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
5117 ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
5118 /// Adds an instruction to the bundle.
5119 void add(ScheduleEntity *SD) { Bundle.push_back(Elt: SD); }
5120
5121 /// Gets/sets the associated tree entry.
5122 void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
5123 TreeEntry *getTreeEntry() const { return TE; }
5124
5125 static ScheduleBundle invalid() { return {false}; }
5126
5127 operator bool() const { return IsValid; }
5128
5129#ifndef NDEBUG
5130 void dump(raw_ostream &OS) const {
5131 if (!*this) {
5132 OS << "[]";
5133 return;
5134 }
5135 OS << '[';
5136 interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
5137 if (isa<ScheduleCopyableData>(SD))
5138 OS << "<Copyable>";
5139 OS << *SD->getInst();
5140 });
5141 OS << ']';
5142 }
5143
5144 LLVM_DUMP_METHOD void dump() const {
5145 dump(dbgs());
5146 dbgs() << '\n';
5147 }
5148#endif // NDEBUG
5149 };
5150
5151#ifndef NDEBUG
5152 friend inline raw_ostream &operator<<(raw_ostream &OS,
5153 const BoUpSLP::ScheduleBundle &Bundle) {
5154 Bundle.dump(OS);
5155 return OS;
5156 }
5157#endif
5158
5159 /// Contains all scheduling-relevant data for a copyable instruction.
5160 /// It models the virtual instruction that is supposed to replace the
5161 /// original instruction. E.g., if instruction %0 = load is part of the
5162 /// bundle [%0, %1], where %1 = add, then the ScheduleCopyableData models the
5163 /// virtual instruction %virt = add %0, 0.
5164 class ScheduleCopyableData final : public ScheduleEntity {
5165 /// The original instruction modeled by this copyable data.
5166 Instruction *Inst = nullptr;
5167 /// The edge information for the instruction.
5168 const EdgeInfo EI;
5169 /// This ScheduleData is in the current scheduling region if this matches
5170 /// the current SchedulingRegionID of BlockScheduling.
5171 int SchedulingRegionID = 0;
5172 /// Bundle, this data is part of.
5173 ScheduleBundle &Bundle;
5174
5175 public:
5176 ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
5177 const EdgeInfo &EI, ScheduleBundle &Bundle)
5178 : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
5179 SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
5180 static bool classof(const ScheduleEntity *Entity) {
5181 return Entity->getKind() == Kind::ScheduleCopyableData;
5182 }
5183
5184 /// Verify basic self consistency properties
5185 void verify() {
5186 if (hasValidDependencies()) {
5187 assert(UnscheduledDeps <= Dependencies && "invariant");
5188 } else {
5189 assert(UnscheduledDeps == Dependencies && "invariant");
5190 }
5191
5192 if (IsScheduled) {
5193 assert(hasValidDependencies() && UnscheduledDeps == 0 &&
5194 "unexpected scheduled state");
5195 }
5196 }
5197
5198 /// Returns true if the dependency information has been calculated.
5199 /// Note that dependency validity can vary between instructions within
5200 /// a single bundle.
5201 bool hasValidDependencies() const {
5202 return Dependencies != ScheduleData::InvalidDeps;
5203 }
5204
5205 /// Returns true if it is ready for scheduling, i.e. it has no more
5206 /// unscheduled depending instructions/bundles.
5207 bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }
5208
5209 /// Modifies the number of unscheduled dependencies for this instruction
5210 /// and returns the remaining number of unscheduled dependencies for this
5211 /// instruction.
5212 int incrementUnscheduledDeps(int Incr) {
5213 assert(hasValidDependencies() &&
5214 "increment of unscheduled deps would be meaningless");
5215 UnscheduledDeps += Incr;
5216 assert(UnscheduledDeps >= 0 && "invariant");
5217 return UnscheduledDeps;
5218 }
5219
5220 /// Sets the number of unscheduled dependencies to the number of
5221 /// dependencies.
5222 void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }
5223
5224 /// Gets the number of unscheduled dependencies.
5225 int getUnscheduledDeps() const { return UnscheduledDeps; }
5226 /// Gets the number of dependencies.
5227 int getDependencies() const { return Dependencies; }
5228 /// Initializes the number of dependencies.
5229 void initDependencies() { Dependencies = 0; }
5230 /// Increments the number of dependencies.
5231 void incDependencies() { Dependencies++; }
5232
5233 /// Gets scheduling region ID.
5234 int getSchedulingRegionID() const { return SchedulingRegionID; }
5235
5236 /// Gets the instruction.
5237 Instruction *getInst() const { return Inst; }
5238
5239 /// Clears all dependency information.
5240 void clearDependencies() {
5241 Dependencies = ScheduleData::InvalidDeps;
5242 UnscheduledDeps = ScheduleData::InvalidDeps;
5243 IsScheduled = false;
5244 }
5245
5246 /// Gets the edge information.
5247 const EdgeInfo &getEdgeInfo() const { return EI; }
5248
5249 /// Gets the bundle.
5250 ScheduleBundle &getBundle() { return Bundle; }
5251 const ScheduleBundle &getBundle() const { return Bundle; }
5252
5253#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5254 void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }
5255
5256 LLVM_DUMP_METHOD void dump() const {
5257 dump(dbgs());
5258 dbgs() << '\n';
5259 }
5260#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
5261
5262 private:
5263 /// The number of dependencies, or ScheduleData::InvalidDeps if it has not
5264 /// been calculated yet. These nodes always have only a single dependency.
5265 int Dependencies = ScheduleData::InvalidDeps;
5266
5267 /// The number of dependencies minus the number of dependencies of scheduled
5268 /// instructions. As soon as this is zero, the instruction/bundle gets ready
5269 /// for scheduling.
5270 /// Note that this is negative as long as Dependencies is not calculated.
5271 int UnscheduledDeps = ScheduleData::InvalidDeps;
5272 };
5273
5274#ifndef NDEBUG
5275 friend inline raw_ostream &
5276 operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
5277 SD.dump(OS);
5278 return OS;
5279 }
5280#endif
5281
5282 friend struct GraphTraits<BoUpSLP *>;
5283 friend struct DOTGraphTraits<BoUpSLP *>;
5284
5285 /// Contains all scheduling data for a basic block.
5286 /// It does not schedule instructions which are not memory read/write
5287 /// instructions and whose operands are either constants, or arguments, or
5288 /// phis, or instructions from other blocks, or whose users are phis or from
5289 /// other blocks. The resulting vector instructions can be placed at the
5290 /// beginning of the basic block without scheduling (if the operands do not
5291 /// need to be scheduled) or at the end of the block (if the users are
5292 /// outside of the block). This allows saving some compile time and memory
5293 /// used by the compiler.
5294 /// ScheduleData is assigned for each instruction in between the boundaries
5295 /// of the tree entry, even for those which are not part of the graph. It is
5296 /// required to correctly follow the dependencies between the instructions
5297 /// and ensure their correct scheduling. ScheduleData is not allocated for
5298 /// instructions which do not require scheduling, like phis, nodes with only
5299 /// extractelements/insertelements, or nodes whose instructions have
5300 /// uses/operands outside of the block.
5301 struct BlockScheduling {
5302 BlockScheduling(BasicBlock *BB)
5303 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5304
5305 void clear() {
5306 ScheduledBundles.clear();
5307 ScheduledBundlesList.clear();
5308 ScheduleCopyableDataMap.clear();
5309 ScheduleCopyableDataMapByInst.clear();
5310 ScheduleCopyableDataMapByInstUser.clear();
5311 ScheduleCopyableDataMapByUsers.clear();
5312 ReadyInsts.clear();
5313 ScheduleStart = nullptr;
5314 ScheduleEnd = nullptr;
5315 FirstLoadStoreInRegion = nullptr;
5316 LastLoadStoreInRegion = nullptr;
5317 RegionHasStackSave = false;
5318
5319 // Reduce the maximum schedule region size by the size of the
5320 // previous scheduling run.
5321 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5322 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5323 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5324 ScheduleRegionSize = 0;
5325
5326 // Make a new scheduling region, i.e. all existing ScheduleData is not
5327 // in the new region yet.
5328 ++SchedulingRegionID;
5329 }
5330
5331 ScheduleData *getScheduleData(Instruction *I) {
5332 if (!I)
5333 return nullptr;
5334 if (BB != I->getParent())
5335 // Avoid lookup if can't possibly be in map.
5336 return nullptr;
5337 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
5338 if (SD && isInSchedulingRegion(SD: *SD))
5339 return SD;
5340 return nullptr;
5341 }
5342
5343 ScheduleData *getScheduleData(Value *V) {
5344 return getScheduleData(I: dyn_cast<Instruction>(Val: V));
5345 }
5346
5347 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5348 /// operand number) and value.
5349 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5350 const Value *V) const {
5351 if (ScheduleCopyableDataMap.empty())
5352 return nullptr;
5353 auto It = ScheduleCopyableDataMap.find(Val: std::make_pair(x: EI, y&: V));
5354 if (It == ScheduleCopyableDataMap.end())
5355 return nullptr;
5356 ScheduleCopyableData *SD = It->getSecond().get();
5357 if (!isInSchedulingRegion(SD: *SD))
5358 return nullptr;
5359 return SD;
5360 }
5361
5362 /// Returns all ScheduleCopyableData entries for the given user \p User,
5363 /// operand number \p OperandIdx and operand \p V.
5364 SmallVector<ScheduleCopyableData *>
5365 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5366 const Value *V) {
5367 if (ScheduleCopyableDataMapByInstUser.empty())
5368 return {};
5369 const auto It = ScheduleCopyableDataMapByInstUser.find(
5370 Val: std::make_pair(x: std::make_pair(x&: User, y&: OperandIdx), y&: V));
5371 if (It == ScheduleCopyableDataMapByInstUser.end())
5372 return {};
5373 SmallVector<ScheduleCopyableData *> Res;
5374 for (ScheduleCopyableData *SD : It->getSecond()) {
5375 if (isInSchedulingRegion(SD: *SD))
5376 Res.push_back(Elt: SD);
5377 }
5378 return Res;
5379 }
5380
5381 /// Returns true if all operands of the given instruction \p User are
5382 /// replaced by copyable data.
5383 /// \param User The user instruction.
5384 /// \param Op The operand, which might be replaced by the copyable data.
5385 /// \param SLP The SLP tree.
5386 /// \param NumOps The number of operands used. If the instruction uses the
5387 /// same operand several times, check for the first use, then the second,
5388 /// etc.
5389 bool areAllOperandsReplacedByCopyableData(Instruction *User,
5390 Instruction *Op, BoUpSLP &SLP,
5391 unsigned NumOps) const {
5392 assert(NumOps > 0 && "No operands");
5393 if (ScheduleCopyableDataMap.empty())
5394 return false;
5395 SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
5396 ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(V: User);
5397 if (Entries.empty())
5398 return false;
5399 unsigned CurNumOps = 0;
5400 for (const Use &U : User->operands()) {
5401 if (U.get() != Op)
5402 continue;
5403 ++CurNumOps;
5404 // Check all tree entries, if they have operands replaced by copyable
5405 // data.
5406 for (TreeEntry *TE : Entries) {
5407 unsigned Inc = 0;
5408 bool IsNonSchedulableWithParentPhiNode =
5409 TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
5410 TE->UserTreeIndex.UserTE->hasState() &&
5411 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5412 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5413 // Count the number of unique phi nodes in the parent entry and exit
5414 // once all the unique phis have been processed.
5415 if (IsNonSchedulableWithParentPhiNode) {
5416 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5417 const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
5418 for (Value *V : ParentTE->Scalars) {
5419 auto *PHI = dyn_cast<PHINode>(Val: V);
5420 if (!PHI)
5421 continue;
5422 if (ParentsUniqueUsers.insert(Ptr: PHI).second &&
5423 is_contained(Range: PHI->incoming_values(), Element: User))
5424 ++Inc;
5425 }
5426 } else {
5427 Inc = count(Range&: TE->Scalars, Element: User);
5428 }
5429
5430 // Check if the user is commutative.
5431 // The commutatives are handled later, as their operands can be
5432 // reordered.
5433 // Same applies even for non-commutative cmps, because we can invert
5434 // their predicate potentially and, thus, reorder the operands.
5435 bool IsCommutativeUser =
5436 ::isCommutative(I: User) &&
5437 ::isCommutableOperand(I: User, ValWithUses: User, Op: U.getOperandNo());
5438 if (!IsCommutativeUser) {
5439 Instruction *MainOp = TE->getMatchingMainOpOrAltOp(I: User);
5440 IsCommutativeUser =
5441 ::isCommutative(I: MainOp, ValWithUses: User) &&
5442 ::isCommutableOperand(I: MainOp, ValWithUses: User, Op: U.getOperandNo());
5443 }
5444 // A commutative user with identical operands can be safely treated
5445 // as non-commutative, since reordering its operands does not change
5446 // the semantics.
5447 assert(
5448 (!IsCommutativeUser ||
5449 (((::isCommutative(User) &&
5450 ::isCommutableOperand(User, User, 0) &&
5451 ::isCommutableOperand(User, User, 1)) ||
5452 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
5453 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5454 User, 0) &&
5455 ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
5456 User, 1))))) &&
5457 "Expected commutative user with 2 first commutable operands");
5458 bool IsCommutativeWithSameOps =
5459 IsCommutativeUser && User->getOperand(i: 0) == User->getOperand(i: 1);
5460 if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
5461 !isa<CmpInst>(Val: User)) {
5462 EdgeInfo EI(TE, U.getOperandNo());
5463 if (CurNumOps != NumOps || getScheduleCopyableData(EI, V: Op))
5464 continue;
5465 return false;
5466 }
5467 PotentiallyReorderedEntriesCount.try_emplace(Key: TE, Args: 0)
5468 .first->getSecond() += Inc;
5469 }
5470 }
5471 if (PotentiallyReorderedEntriesCount.empty())
5472 return true;
5473 // Check the commutative/cmp entries.
5474 for (auto &P : PotentiallyReorderedEntriesCount) {
5475 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5476 bool IsNonSchedulableWithParentPhiNode =
5477 P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
5478 P.first->UserTreeIndex.UserTE->hasState() &&
5479 P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
5480 P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
5481 auto *It = find(Range&: P.first->Scalars, Val: User);
5482 do {
5483 assert(It != P.first->Scalars.end() &&
5484 "User is not in the tree entry");
5485 int Lane = std::distance(first: P.first->Scalars.begin(), last: It);
5486 assert(Lane >= 0 && "Lane is not found");
5487 if (isa<StoreInst>(Val: User) && !P.first->ReorderIndices.empty())
5488 Lane = P.first->ReorderIndices[Lane];
5489 assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
5490 "Couldn't find extract lane");
5491 // Count the number of unique phi nodes in the parent entry and exit
5492 // once all the unique phis have been processed.
5493 if (IsNonSchedulableWithParentPhiNode) {
5494 const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
5495 Value *User = ParentTE->Scalars[Lane];
5496 if (!ParentsUniqueUsers.insert(Ptr: User).second) {
5497 It =
5498 find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
5499 continue;
5500 }
5501 }
5502 for (unsigned OpIdx :
5503 seq<unsigned>(Size: ::getNumberOfPotentiallyCommutativeOps(
5504 I: P.first->getMainOp()))) {
5505 if (P.first->getOperand(OpIdx)[Lane] == Op &&
5506 getScheduleCopyableData(EI: EdgeInfo(P.first, OpIdx), V: Op))
5507 --P.getSecond();
5508 }
5509 // If parent node is schedulable, it will be handled correctly.
5510 It = find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
5511 } while (It != P.first->Scalars.end());
5512 }
5513 return all_of(Range&: PotentiallyReorderedEntriesCount,
5514 P: [&](const std::pair<const TreeEntry *, unsigned> &P) {
5515 return P.second == NumOps - 1;
5516 });
5517 }
5518
5519 SmallVector<ScheduleCopyableData *>
5520 getScheduleCopyableData(const Instruction *I) const {
5521 if (ScheduleCopyableDataMapByInst.empty())
5522 return {};
5523 const auto It = ScheduleCopyableDataMapByInst.find(Val: I);
5524 if (It == ScheduleCopyableDataMapByInst.end())
5525 return {};
5526 SmallVector<ScheduleCopyableData *> Res;
5527 for (ScheduleCopyableData *SD : It->getSecond()) {
5528 if (isInSchedulingRegion(SD: *SD))
5529 Res.push_back(Elt: SD);
5530 }
5531 return Res;
5532 }
5533
5534 SmallVector<ScheduleCopyableData *>
5535 getScheduleCopyableDataUsers(const Instruction *User) const {
5536 if (ScheduleCopyableDataMapByUsers.empty())
5537 return {};
5538 const auto It = ScheduleCopyableDataMapByUsers.find(Val: User);
5539 if (It == ScheduleCopyableDataMapByUsers.end())
5540 return {};
5541 SmallVector<ScheduleCopyableData *> Res;
5542 for (ScheduleCopyableData *SD : It->getSecond()) {
5543 if (isInSchedulingRegion(SD: *SD))
5544 Res.push_back(Elt: SD);
5545 }
5546 return Res;
5547 }
5548
5549 ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
5550 Instruction *I,
5551 int SchedulingRegionID,
5552 ScheduleBundle &Bundle) {
5553 assert(!getScheduleCopyableData(EI, I) && "already in the map");
5554 ScheduleCopyableData *CD =
5555 ScheduleCopyableDataMap
5556 .try_emplace(Key: std::make_pair(x: EI, y&: I),
5557 Args: std::make_unique<ScheduleCopyableData>(
5558 args&: SchedulingRegionID, args&: I, args: EI, args&: Bundle))
5559 .first->getSecond()
5560 .get();
5561 ScheduleCopyableDataMapByInst[I].push_back(Elt: CD);
5562 if (EI.UserTE) {
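        // Register the new copyable data against every user lane that
        // references I, so that readiness bookkeeping can later find it either
        // through the (user instruction, operand index) pair or through the
        // instruction itself.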
5563 ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
5564 const auto *It = find(Range&: Op, Val: I);
5565 assert(It != Op.end() && "Lane not set");
5566 SmallPtrSet<Instruction *, 4> Visited;
5567 do {
5568 int Lane = std::distance(first: Op.begin(), last: It);
5569 assert(Lane >= 0 && "Lane not set");
5570 if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
5571 !EI.UserTE->ReorderIndices.empty())
5572 Lane = EI.UserTE->ReorderIndices[Lane];
5573 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
5574 "Couldn't find extract lane");
5575 auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
5576 if (!Visited.insert(Ptr: In).second) {
5577 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
5578 continue;
5579 }
5580 ScheduleCopyableDataMapByInstUser
5581 .try_emplace(Key: std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I))
5582 .first->getSecond()
5583 .push_back(Elt: CD);
5584 ScheduleCopyableDataMapByUsers.try_emplace(Key: I)
5585 .first->getSecond()
5586 .insert(X: CD);
5587           // Remove extra deps for users that become non-immediate users of
5588           // the instruction. This may happen if a chain of identical
5589           // copyable elements appears in the tree.
5590 if (In == I) {
5591 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
5592 if (ScheduleCopyableData *UserCD =
5593 getScheduleCopyableData(EI: UserEI, V: In))
5594 ScheduleCopyableDataMapByUsers[I].remove(X: UserCD);
5595 }
5596 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
5597 } while (It != Op.end());
5598 } else {
5599 ScheduleCopyableDataMapByUsers.try_emplace(Key: I).first->getSecond().insert(
5600 X: CD);
5601 }
5602 return *CD;
5603 }
5604
5605 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5606 auto *I = dyn_cast<Instruction>(Val: V);
5607 if (!I)
5608 return {};
5609 auto It = ScheduledBundles.find(Val: I);
5610 if (It == ScheduledBundles.end())
5611 return {};
5612 return It->getSecond();
5613 }
5614
5615 /// Returns true if the entity is in the scheduling region.
5616 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5617 if (const auto *Data = dyn_cast<ScheduleData>(Val: &SD))
5618 return Data->getSchedulingRegionID() == SchedulingRegionID;
5619 if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: &SD))
5620 return CD->getSchedulingRegionID() == SchedulingRegionID;
5621 return all_of(Range: cast<ScheduleBundle>(Val: SD).getBundle(),
5622 P: [&](const ScheduleEntity *BundleMember) {
5623 return isInSchedulingRegion(SD: *BundleMember);
5624 });
5625 }
5626
5627 /// Marks an instruction as scheduled and puts all dependent ready
5628 /// instructions into the ready-list.
5629 template <typename ReadyListType>
5630 void schedule(const BoUpSLP &R, const InstructionsState &S,
5631 const EdgeInfo &EI, ScheduleEntity *Data,
5632 ReadyListType &ReadyList) {
5633 auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
5634 ArrayRef<ScheduleBundle *> Bundles) {
5635 // Handle the def-use chain dependencies.
5636
5637 // Decrement the unscheduled counter and insert to ready list if ready.
5638 auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
5639 if ((IsControl || Data->hasValidDependencies()) &&
5640 Data->incrementUnscheduledDeps(-1) == 0) {
5641 // There are no more unscheduled dependencies after
5642 // decrementing, so we can put the dependent instruction
5643 // into the ready list.
5644 SmallVector<ScheduleBundle *, 1> CopyableBundle;
5645 ArrayRef<ScheduleBundle *> Bundles;
5646 if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
5647 CopyableBundle.push_back(Elt: &CD->getBundle());
5648 Bundles = CopyableBundle;
5649 } else {
5650 Bundles = getScheduleBundles(V: Data->getInst());
5651 }
5652 if (!Bundles.empty()) {
5653 for (ScheduleBundle *Bundle : Bundles) {
5654 if (Bundle->unscheduledDepsInBundle() == 0) {
5655 assert(!Bundle->isScheduled() &&
5656 "already scheduled bundle gets ready");
5657 ReadyList.insert(Bundle);
5658 LLVM_DEBUG(dbgs()
5659 << "SLP: gets ready: " << *Bundle << "\n");
5660 }
5661 }
5662 return;
5663 }
5664 assert(!Data->isScheduled() &&
5665 "already scheduled bundle gets ready");
5666 assert(!isa<ScheduleCopyableData>(Data) &&
5667 "Expected non-copyable data");
5668 ReadyList.insert(Data);
5669 LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
5670 }
5671 };
5672
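          // Decrement unscheduled deps for the instruction I used by User at
          // operand OpIdx, preferring its copyable models (if any) over the
          // plain ScheduleData.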
5673 auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
5674 Instruction *I) {
5675 if (!ScheduleCopyableDataMap.empty()) {
5676 SmallVector<ScheduleCopyableData *> CopyableData =
5677 getScheduleCopyableData(User, OperandIdx: OpIdx, V: I);
5678 for (ScheduleCopyableData *CD : CopyableData)
5679 DecrUnsched(CD, /*IsControl=*/false);
5680 if (!CopyableData.empty())
5681 return;
5682 }
5683 if (ScheduleData *OpSD = getScheduleData(I))
5684 DecrUnsched(OpSD, /*IsControl=*/false);
5685 };
5686
5687 // If BundleMember is a vector bundle, its operands may have been
5688 // reordered during buildTree(). We therefore need to get its operands
5689 // through the TreeEntry.
5690 if (!Bundles.empty()) {
5691 auto *In = BundleMember->getInst();
5692 // Count uses of each instruction operand.
5693 SmallDenseMap<const Instruction *, unsigned> OperandsUses;
5694 unsigned TotalOpCount = 0;
5695 if (isa<ScheduleCopyableData>(Val: BundleMember)) {
5696 // Copyable data is used only once (uses itself).
5697 TotalOpCount = OperandsUses[In] = 1;
5698 } else {
5699 for (const Use &U : In->operands()) {
5700 if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
5701 auto Res = OperandsUses.try_emplace(Key: I, Args: 0);
5702 ++Res.first->getSecond();
5703 ++TotalOpCount;
5704 }
5705 }
5706 }
5707 // Decrement the unscheduled counter and insert to ready list if
5708 // ready.
5709 auto DecrUnschedForInst =
5710 [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
5711 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
5712 &Checked) {
5713 if (!ScheduleCopyableDataMap.empty()) {
5714 const EdgeInfo EI = {UserTE, OpIdx};
5715 if (ScheduleCopyableData *CD =
5716 getScheduleCopyableData(EI, V: I)) {
5717 if (!Checked.insert(V: std::make_pair(x&: CD, y&: OpIdx)).second)
5718 return;
5719 DecrUnsched(CD, /*IsControl=*/false);
5720 return;
5721 }
5722 }
5723 auto It = OperandsUses.find(Val: I);
5724 assert(It != OperandsUses.end() && "Operand not found");
5725 if (It->second > 0) {
5726 if (ScheduleData *OpSD = getScheduleData(I)) {
5727 if (!Checked.insert(V: std::make_pair(x&: OpSD, y&: OpIdx)).second)
5728 return;
5729 --It->getSecond();
5730 assert(TotalOpCount > 0 && "No more operands to decrement");
5731 --TotalOpCount;
5732 DecrUnsched(OpSD, /*IsControl=*/false);
5733 } else {
5734 --It->getSecond();
5735 assert(TotalOpCount > 0 && "No more operands to decrement");
5736 --TotalOpCount;
5737 }
5738 }
5739 };
5740
5741 SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
5742 for (ScheduleBundle *Bundle : Bundles) {
5743 if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
5744 break;
5745 SmallPtrSet<Value *, 4> ParentsUniqueUsers;
5746 // Need to search for the lane since the tree entry can be
5747 // reordered.
5748 auto *It = find(Range&: Bundle->getTreeEntry()->Scalars, Val: In);
5749 bool IsNonSchedulableWithParentPhiNode =
5750 Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
5751 Bundle->getTreeEntry()->UserTreeIndex &&
5752 Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
5753 Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
5754 TreeEntry::SplitVectorize &&
5755 Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
5756 Instruction::PHI;
5757 do {
5758 int Lane =
5759 std::distance(first: Bundle->getTreeEntry()->Scalars.begin(), last: It);
5760 assert(Lane >= 0 && "Lane not set");
5761 if (isa<StoreInst>(Val: In) &&
5762 !Bundle->getTreeEntry()->ReorderIndices.empty())
5763 Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
5764 assert(Lane < static_cast<int>(
5765 Bundle->getTreeEntry()->Scalars.size()) &&
5766 "Couldn't find extract lane");
5767
5768               // Since the vectorization tree is built recursively, this
5769               // assertion ensures that the tree entry has all operands set
5770               // before reaching this code. A couple of known exceptions are
5771               // extracts, whose second (immediate) operand is not added.
5772               // Since immediates do not affect scheduler behavior, this is
5773               // considered okay.
5774 assert(In &&
5775 (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
5776 In->getNumOperands() ==
5777 Bundle->getTreeEntry()->getNumOperands() ||
5778 Bundle->getTreeEntry()->isCopyableElement(In)) &&
5779 "Missed TreeEntry operands?");
5780
5781               // Count the number of unique phi nodes forming the parent
5782               // entry and exit once all the unique phis have been processed.
5783 if (IsNonSchedulableWithParentPhiNode) {
5784 const TreeEntry *ParentTE =
5785 Bundle->getTreeEntry()->UserTreeIndex.UserTE;
5786 Value *User = ParentTE->Scalars[Lane];
5787 if (!ParentsUniqueUsers.insert(Ptr: User).second) {
5788 It = std::find(first: std::next(x: It),
5789 last: Bundle->getTreeEntry()->Scalars.end(), val: In);
5790 continue;
5791 }
5792 }
5793
5794 for (unsigned OpIdx :
5795 seq<unsigned>(Size: Bundle->getTreeEntry()->getNumOperands()))
5796 if (auto *I = dyn_cast<Instruction>(
5797 Val: Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
5798 LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
5799 << *I << "\n");
5800 DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
5801 }
5802 // If parent node is schedulable, it will be handled correctly.
5803 if (Bundle->getTreeEntry()->isCopyableElement(V: In))
5804 break;
5805 It = std::find(first: std::next(x: It),
5806 last: Bundle->getTreeEntry()->Scalars.end(), val: In);
5807 } while (It != Bundle->getTreeEntry()->Scalars.end());
5808 }
5809 } else {
5810 // If BundleMember is a stand-alone instruction, no operand reordering
5811 // has taken place, so we directly access its operands.
5812 for (Use &U : BundleMember->getInst()->operands()) {
5813 if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
5814 LLVM_DEBUG(dbgs()
5815 << "SLP: check for readiness (def): " << *I << "\n");
5816 DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
5817 }
5818 }
5819 }
5820 // Handle the memory dependencies.
5821 auto *SD = dyn_cast<ScheduleData>(Val: BundleMember);
5822 if (!SD)
5823 return;
5824 SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
5825 for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
5826 if (!VisitedMemory.insert(Ptr: MemoryDep).second)
5827 continue;
5828 // There are no more unscheduled dependencies after decrementing,
5829 // so we can put the dependent instruction into the ready list.
5830 LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
5831 << *MemoryDep << "\n");
5832 DecrUnsched(MemoryDep);
5833 }
5834 // Handle the control dependencies.
5835 SmallPtrSet<const ScheduleData *, 4> VisitedControl;
5836 for (ScheduleData *Dep : SD->getControlDependencies()) {
5837 if (!VisitedControl.insert(Ptr: Dep).second)
5838 continue;
5839 // There are no more unscheduled dependencies after decrementing,
5840 // so we can put the dependent instruction into the ready list.
5841 LLVM_DEBUG(dbgs()
5842 << "SLP: check for readiness (ctrl): " << *Dep << "\n");
5843 DecrUnsched(Dep, /*IsControl=*/true);
5844 }
5845 };
5846 if (auto *SD = dyn_cast<ScheduleData>(Val: Data)) {
5847 SD->setScheduled(/*Scheduled=*/true);
5848 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
5849 SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
5850 SmallVector<ScheduleBundle *> Bundles;
5851 Instruction *In = SD->getInst();
5852 if (R.isVectorized(V: In)) {
5853 ArrayRef<TreeEntry *> Entries = R.getTreeEntries(V: In);
5854 for (TreeEntry *TE : Entries) {
5855 if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(Val: In) &&
5856 In->getNumOperands() != TE->getNumOperands())
5857 continue;
5858 auto &BundlePtr =
5859 PseudoBundles.emplace_back(Args: std::make_unique<ScheduleBundle>());
5860 BundlePtr->setTreeEntry(TE);
5861 BundlePtr->add(SD);
5862 Bundles.push_back(Elt: BundlePtr.get());
5863 }
5864 }
5865 ProcessBundleMember(SD, Bundles);
5866 } else {
5867 ScheduleBundle &Bundle = *cast<ScheduleBundle>(Val: Data);
5868 Bundle.setScheduled(/*Scheduled=*/true);
5869 LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
5870 auto AreAllBundlesScheduled =
5871 [&](const ScheduleEntity *SD,
5872 ArrayRef<ScheduleBundle *> SDBundles) {
5873 if (isa<ScheduleCopyableData>(Val: SD))
5874 return true;
5875 return !SDBundles.empty() &&
5876 all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
5877 return SDBundle->isScheduled();
5878 });
5879 };
5880 for (ScheduleEntity *SD : Bundle.getBundle()) {
5881 ArrayRef<ScheduleBundle *> SDBundles;
5882 if (!isa<ScheduleCopyableData>(Val: SD))
5883 SDBundles = getScheduleBundles(V: SD->getInst());
5884 if (AreAllBundlesScheduled(SD, SDBundles)) {
5885 SD->setScheduled(/*Scheduled=*/true);
5886 ProcessBundleMember(SD, isa<ScheduleCopyableData>(Val: SD) ? &Bundle
5887 : SDBundles);
5888 }
5889 }
5890 }
5891 }
5892
5893 /// Verify basic self consistency properties of the data structure.
5894 void verify() {
5895 if (!ScheduleStart)
5896 return;
5897
5898 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5899 ScheduleStart->comesBefore(ScheduleEnd) &&
5900 "Not a valid scheduling region?");
5901
5902 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5903 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
5904 if (!Bundles.empty()) {
5905 for (ScheduleBundle *Bundle : Bundles) {
5906 assert(isInSchedulingRegion(*Bundle) &&
5907 "primary schedule data not in window?");
5908 Bundle->verify();
5909 }
5910 continue;
5911 }
5912 auto *SD = getScheduleData(I);
5913 if (!SD)
5914 continue;
5915 assert(isInSchedulingRegion(*SD) &&
5916 "primary schedule data not in window?");
5917 SD->verify();
5918 }
5919
5920 assert(all_of(ReadyInsts,
5921 [](const ScheduleEntity *Bundle) {
5922 return Bundle->isReady();
5923 }) &&
5924 "item in ready list not ready?");
5925 }
5926
5927 /// Put all instructions into the ReadyList which are ready for scheduling.
5928 template <typename ReadyListType>
5929 void initialFillReadyList(ReadyListType &ReadyList) {
5930 SmallPtrSet<ScheduleBundle *, 16> Visited;
5931 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5932 ScheduleData *SD = getScheduleData(I);
5933 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5934 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
5935 !Bundles.empty()) {
5936 for (ScheduleBundle *Bundle : Bundles) {
5937 if (!Visited.insert(Ptr: Bundle).second)
5938 continue;
5939 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5940 ReadyList.insert(Bundle);
5941 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5942 << *Bundle << "\n");
5943 }
5944 }
5945 continue;
5946 }
5947 ReadyList.insert(SD);
5948 LLVM_DEBUG(dbgs()
5949 << "SLP: initially in ready list: " << *SD << "\n");
5950 }
5951 }
5952 }
5953
5954 /// Build a bundle from the ScheduleData nodes corresponding to the
5955 /// scalar instruction for each lane.
5956 /// \param VL The list of scalar instructions.
5957 /// \param S The state of the instructions.
5958 /// \param EI The edge in the SLP graph or the user node/operand number.
5959 ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
5960 const InstructionsState &S, const EdgeInfo &EI);
5961
5962 /// Checks if a bundle of instructions can be scheduled, i.e. has no
5963     /// cyclic dependencies. This is only a dry run; no instructions are
5964 /// actually moved at this stage.
5965 /// \returns the scheduling bundle. The returned Optional value is not
5966 /// std::nullopt if \p VL is allowed to be scheduled.
5967 std::optional<ScheduleBundle *>
5968 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
5969 const InstructionsState &S, const EdgeInfo &EI);
5970
5971 /// Allocates schedule data chunk.
5972 ScheduleData *allocateScheduleDataChunks();
5973
5974 /// Extends the scheduling region so that V is inside the region.
5975 /// \returns true if the region size is within the limit.
5976 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
5977
5978 /// Initialize the ScheduleData structures for new instructions in the
5979 /// scheduling region.
5980 void initScheduleData(Instruction *FromI, Instruction *ToI,
5981 ScheduleData *PrevLoadStore,
5982 ScheduleData *NextLoadStore);
5983
5984 /// Updates the dependency information of a bundle and of all instructions/
5985 /// bundles which depend on the original bundle.
5986 void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
5987 BoUpSLP *SLP,
5988 ArrayRef<ScheduleData *> ControlDeps = {});
5989
5990     /// Sets all instructions in the scheduling region to un-scheduled.
5991 void resetSchedule();
5992
5993 BasicBlock *BB;
5994
5995 /// Simple memory allocation for ScheduleData.
5996 SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
5997
5998 /// The size of a ScheduleData array in ScheduleDataChunks.
5999 int ChunkSize;
6000
6001 /// The allocator position in the current chunk, which is the last entry
6002 /// of ScheduleDataChunks.
6003 int ChunkPos;
6004
6005 /// Attaches ScheduleData to Instruction.
6006 /// Note that the mapping survives during all vectorization iterations, i.e.
6007 /// ScheduleData structures are recycled.
6008 SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
6009
6010 /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
6011 /// number) and the operand instruction, represented as copyable element.
6012 SmallDenseMap<std::pair<EdgeInfo, const Value *>,
6013 std::unique_ptr<ScheduleCopyableData>>
6014 ScheduleCopyableDataMap;
6015
6016 /// Represents mapping between instruction and all related
6017     /// ScheduleCopyableData (for all uses in the tree, represented as a copyable
6018 /// element). The SLP tree may contain several representations of the same
6019 /// instruction.
6020 SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
6021 ScheduleCopyableDataMapByInst;
6022
6023 /// Represents mapping between user value and operand number, the operand
6024 /// value and all related ScheduleCopyableData. The relation is 1:n, because
6025   /// the same user may reference the same operand in different tree entries
6026   /// and the operand may be modelled by different copyable data elements.
6027 SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
6028 SmallVector<ScheduleCopyableData *>>
6029 ScheduleCopyableDataMapByInstUser;
6030
6031 /// Represents mapping between instruction and all related
6032 /// ScheduleCopyableData. It represents the mapping between the actual
6033 /// instruction and the last copyable data element in the chain. E.g., if
6034 /// the graph models the following instructions:
6035 /// %0 = non-add instruction ...
6036 /// ...
6037 /// %4 = add %3, 1
6038 /// %5 = add %4, 1
6039 /// %6 = insertelement poison, %0, 0
6040 /// %7 = insertelement %6, %5, 1
6041 /// And the graph is modeled as:
6042 /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
6043 /// -> [1, 0] -> [%1, 0]
6044 ///
6045   /// this map will map %0 only to the copyable element <1>, which is the last
6046   /// user (the direct user of the actual instruction). <0> uses <1>, so <1>
6047   /// will keep the mapping to <0>, not %0.
6048 SmallDenseMap<const Instruction *,
6049 SmallSetVector<ScheduleCopyableData *, 4>>
6050 ScheduleCopyableDataMapByUsers;
6051
6052 /// Attaches ScheduleBundle to Instruction.
6053 SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
6054 ScheduledBundles;
6055 /// The list of ScheduleBundles.
6056 SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;
6057
6058 /// The ready-list for scheduling (only used for the dry-run).
6059 SetVector<ScheduleEntity *> ReadyInsts;
6060
6061 /// The first instruction of the scheduling region.
6062 Instruction *ScheduleStart = nullptr;
6063
6064 /// The first instruction _after_ the scheduling region.
6065 Instruction *ScheduleEnd = nullptr;
6066
6067 /// The first memory accessing instruction in the scheduling region
6068 /// (can be null).
6069 ScheduleData *FirstLoadStoreInRegion = nullptr;
6070
6071 /// The last memory accessing instruction in the scheduling region
6072 /// (can be null).
6073 ScheduleData *LastLoadStoreInRegion = nullptr;
6074
6075 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
6076 /// region? Used to optimize the dependence calculation for the
6077 /// common case where there isn't.
6078 bool RegionHasStackSave = false;
6079
6080 /// The current size of the scheduling region.
6081 int ScheduleRegionSize = 0;
6082
6083 /// The maximum size allowed for the scheduling region.
6084 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
6085
6086 /// The ID of the scheduling region. For a new vectorization iteration this
6087 /// is incremented which "removes" all ScheduleData from the region.
6088 /// Make sure that the initial SchedulingRegionID is greater than the
6089 /// initial SchedulingRegionID in ScheduleData (which is 0).
6090 int SchedulingRegionID = 1;
6091 };
6092
6093 /// Attaches the BlockScheduling structures to basic blocks.
6094 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
6095
6096 /// Performs the "real" scheduling. Done before vectorization is actually
6097 /// performed in a basic block.
6098 void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);
6099
6100 /// List of users to ignore during scheduling and that don't need extracting.
6101 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6102
6103 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6104 /// sorted SmallVectors of unsigned.
6105 struct OrdersTypeDenseMapInfo {
6106 static OrdersType getEmptyKey() {
6107 OrdersType V;
6108 V.push_back(Elt: ~1U);
6109 return V;
6110 }
6111
6112 static OrdersType getTombstoneKey() {
6113 OrdersType V;
6114 V.push_back(Elt: ~2U);
6115 return V;
6116 }
6117
6118 static unsigned getHashValue(const OrdersType &V) {
6119 return static_cast<unsigned>(hash_combine_range(R: V));
6120 }
6121
6122 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6123 return LHS == RHS;
6124 }
6125 };
6126
6127 // Analysis and block reference.
6128 Function *F;
6129 ScalarEvolution *SE;
6130 TargetTransformInfo *TTI;
6131 TargetLibraryInfo *TLI;
6132 LoopInfo *LI;
6133 DominatorTree *DT;
6134 AssumptionCache *AC;
6135 DemandedBits *DB;
6136 const DataLayout *DL;
6137 OptimizationRemarkEmitter *ORE;
6138
6139 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
6140 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
6141
6142 /// Instruction builder to construct the vectorized tree.
6143 IRBuilder<TargetFolder> Builder;
6144
6145   /// A map of tree entries of scalar integer values to the smallest bit width
6146   /// with which they can legally be represented. The entries map to (width,
6147   /// signed) pairs, where "width" indicates the minimum bit width and "signed"
6148   /// is True if the values must be sign-extended, rather than zero-extended,
6149   /// back to their original width.
6150 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
6151
6152 /// Final size of the reduced vector, if the current graph represents the
6153 /// input for the reduction and it was possible to narrow the size of the
6154 /// reduction.
6155 unsigned ReductionBitWidth = 0;
6156
6157 /// Canonical graph size before the transformations.
6158 unsigned BaseGraphSize = 1;
6159
6160 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
6161 /// type sizes, used in the tree.
6162 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
6163
6164   /// Indices of the vectorized nodes, which are supposed to be the roots of a
6165   /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
6166 DenseSet<unsigned> ExtraBitWidthNodes;
6167};
6168
6169template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6170 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6171 using SecondInfo = DenseMapInfo<unsigned>;
6172 static BoUpSLP::EdgeInfo getEmptyKey() {
6173 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6174 SecondInfo::getEmptyKey());
6175 }
6176
6177 static BoUpSLP::EdgeInfo getTombstoneKey() {
6178 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6179 SecondInfo::getTombstoneKey());
6180 }
6181
6182 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6183 return detail::combineHashValue(a: FirstInfo::getHashValue(PtrVal: Val.UserTE),
6184 b: SecondInfo::getHashValue(Val: Val.EdgeIdx));
6185 }
6186
6187 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6188 const BoUpSLP::EdgeInfo &RHS) {
6189 return LHS == RHS;
6190 }
6191};
6192
6193template <> struct llvm::GraphTraits<BoUpSLP *> {
6194 using TreeEntry = BoUpSLP::TreeEntry;
6195
6196 /// NodeRef has to be a pointer per the GraphWriter.
6197 using NodeRef = TreeEntry *;
6198
6199 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
6200
6201 /// Add the VectorizableTree to the index iterator to be able to return
6202 /// TreeEntry pointers.
6203 struct ChildIteratorType
6204 : public iterator_adaptor_base<
6205 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
6206 ContainerTy &VectorizableTree;
6207
6208 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
6209 ContainerTy &VT)
6210 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
6211
6212 NodeRef operator*() { return I->UserTE; }
6213 };
6214
6215 static NodeRef getEntryNode(BoUpSLP &R) {
6216 return R.VectorizableTree[0].get();
6217 }
6218
6219 static ChildIteratorType child_begin(NodeRef N) {
6220 return {&N->UserTreeIndex, N->Container};
6221 }
6222
6223 static ChildIteratorType child_end(NodeRef N) {
6224 return {&N->UserTreeIndex + 1, N->Container};
6225 }
6226
6227 /// For the node iterator we just need to turn the TreeEntry iterator into a
6228 /// TreeEntry* iterator so that it dereferences to NodeRef.
6229 class nodes_iterator {
6230 using ItTy = ContainerTy::iterator;
6231 ItTy It;
6232
6233 public:
6234 nodes_iterator(const ItTy &It2) : It(It2) {}
6235 NodeRef operator*() { return It->get(); }
6236 nodes_iterator operator++() {
6237 ++It;
6238 return *this;
6239 }
6240 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
6241 };
6242
6243 static nodes_iterator nodes_begin(BoUpSLP *R) {
6244 return nodes_iterator(R->VectorizableTree.begin());
6245 }
6246
6247 static nodes_iterator nodes_end(BoUpSLP *R) {
6248 return nodes_iterator(R->VectorizableTree.end());
6249 }
6250
6251 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
6252};
6253
6254template <>
6255struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6256 using TreeEntry = BoUpSLP::TreeEntry;
6257
6258 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6259
6260 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6261 std::string Str;
6262 raw_string_ostream OS(Str);
6263 OS << Entry->Idx << ".\n";
6264 if (isSplat(VL: Entry->Scalars))
6265 OS << "<splat> ";
6266 for (auto *V : Entry->Scalars) {
6267 OS << *V;
6268 if (llvm::any_of(Range: R->ExternalUses, P: [&](const BoUpSLP::ExternalUser &EU) {
6269 return EU.Scalar == V;
6270 }))
6271 OS << " <extract>";
6272 OS << "\n";
6273 }
6274 return Str;
6275 }
6276
6277 static std::string getNodeAttributes(const TreeEntry *Entry,
6278 const BoUpSLP *) {
6279 if (Entry->isGather())
6280 return "color=red";
6281 if (Entry->State == TreeEntry::ScatterVectorize ||
6282 Entry->State == TreeEntry::StridedVectorize ||
6283 Entry->State == TreeEntry::CompressVectorize)
6284 return "color=blue";
6285 return "";
6286 }
6287};
6288
6289BoUpSLP::~BoUpSLP() {
6290 SmallVector<WeakTrackingVH> DeadInsts;
6291 for (auto *I : DeletedInstructions) {
6292 if (!I->getParent()) {
6293       // Temporarily insert the instruction back so that it can be erased from
6294       // its parent and from memory later.
6295 if (isa<PHINode>(Val: I))
6296 // Phi nodes must be the very first instructions in the block.
6297 I->insertBefore(BB&: F->getEntryBlock(),
6298 InsertPos: F->getEntryBlock().getFirstNonPHIIt());
6299 else
6300 I->insertBefore(InsertPos: F->getEntryBlock().getTerminator()->getIterator());
6301 continue;
6302 }
6303 for (Use &U : I->operands()) {
6304 auto *Op = dyn_cast<Instruction>(Val: U.get());
6305 if (Op && !DeletedInstructions.count(V: Op) && Op->hasOneUser() &&
6306 wouldInstructionBeTriviallyDead(I: Op, TLI))
6307 DeadInsts.emplace_back(Args&: Op);
6308 }
6309 I->dropAllReferences();
6310 }
6311 for (auto *I : DeletedInstructions) {
6312 assert(I->use_empty() &&
6313 "trying to erase instruction with users.");
6314 I->eraseFromParent();
6315 }
6316
6317 // Cleanup any dead scalar code feeding the vectorized instructions
6318 RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
6319
6320#ifdef EXPENSIVE_CHECKS
6321 // If we could guarantee that this call is not extremely slow, we could
6322 // remove the ifdef limitation (see PR47712).
6323 assert(!verifyFunction(*F, &dbgs()));
6324#endif
6325}
6326
6327/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6328/// contains the original mask for the scalars reused in the node. The
6329/// procedure transforms this mask in accordance with the given \p Mask.
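/// For example, \p Reuses = {1, 0, 2} with \p Mask = {2, 0, 1} produces
/// {0, 2, 1}: each original element Reuses[I] moves to position Mask[I].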
6330static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6331 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6332 "Expected non-empty mask.");
6333 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6334 Prev.swap(RHS&: Reuses);
6335 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6336 if (Mask[I] != PoisonMaskElem)
6337 Reuses[Mask[I]] = Prev[I];
6338}
6339
6340/// Reorders the given \p Order according to the given \p Mask. \p Order is
6341/// the original order of the scalars. The procedure transforms this order
6342/// in accordance with the given \p Mask. If the resulting \p Order is just an
6343/// identity order, \p Order is cleared.
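/// For example, an empty (identity) \p Order combined with
/// \p Mask = {1, 0, 3, 2} yields \p Order = {1, 0, 3, 2}.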
6344static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
6345 bool BottomOrder = false) {
6346 assert(!Mask.empty() && "Expected non-empty mask.");
6347 unsigned Sz = Mask.size();
6348 if (BottomOrder) {
6349 SmallVector<unsigned> PrevOrder;
6350 if (Order.empty()) {
6351 PrevOrder.resize(N: Sz);
6352 std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
6353 } else {
6354 PrevOrder.swap(RHS&: Order);
6355 }
6356 Order.assign(NumElts: Sz, Elt: Sz);
6357 for (unsigned I = 0; I < Sz; ++I)
6358 if (Mask[I] != PoisonMaskElem)
6359 Order[I] = PrevOrder[Mask[I]];
6360 if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
6361 return Data.value() == Sz || Data.index() == Data.value();
6362 })) {
6363 Order.clear();
6364 return;
6365 }
6366 fixupOrderingIndices(Order);
6367 return;
6368 }
6369 SmallVector<int> MaskOrder;
6370 if (Order.empty()) {
6371 MaskOrder.resize(N: Sz);
6372 std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
6373 } else {
6374 inversePermutation(Indices: Order, Mask&: MaskOrder);
6375 }
6376 reorderReuses(Reuses&: MaskOrder, Mask);
6377 if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
6378 Order.clear();
6379 return;
6380 }
6381 Order.assign(NumElts: Sz, Elt: Sz);
6382 for (unsigned I = 0; I < Sz; ++I)
6383 if (MaskOrder[I] != PoisonMaskElem)
6384 Order[MaskOrder[I]] = I;
6385 fixupOrderingIndices(Order);
6386}
6387
6388std::optional<BoUpSLP::OrdersType>
6389BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
6390 bool TopToBottom, bool IgnoreReorder) {
6391 assert(TE.isGather() && "Expected gather node only.");
6392 // Try to find subvector extract/insert patterns and reorder only such
6393 // patterns.
6394 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
6395 Type *ScalarTy = GatheredScalars.front()->getType();
6396 size_t NumScalars = GatheredScalars.size();
6397 if (!isValidElementType(Ty: ScalarTy))
6398 return std::nullopt;
6399 auto *VecTy = getWidenedType(ScalarTy, VF: NumScalars);
6400 unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: NumScalars);
6401 SmallVector<int> ExtractMask;
6402 SmallVector<int> Mask;
6403 SmallVector<SmallVector<const TreeEntry *>> Entries;
6404 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
6405 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
6406 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
6407 isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
6408 /*ForOrder=*/true);
6409 // No shuffled operands - ignore.
6410 if (GatherShuffles.empty() && ExtractShuffles.empty())
6411 return std::nullopt;
6412 OrdersType CurrentOrder(NumScalars, NumScalars);
6413 if (GatherShuffles.size() == 1 &&
6414 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
6415 Entries.front().front()->isSame(VL: TE.Scalars)) {
6416    // If the fully matched node is used in a whole-tree rotation, there is no
6417    // need to consider the matching order; the whole tree is rotated instead.
6418 if (TopToBottom)
6419 return std::nullopt;
6420 // No need to keep the order for the same user node.
6421 if (Entries.front().front()->UserTreeIndex.UserTE ==
6422 TE.UserTreeIndex.UserTE)
6423 return std::nullopt;
6424 // No need to keep the order for the matched root node, if it can be freely
6425 // reordered.
6426 if (!IgnoreReorder && Entries.front().front()->Idx == 0)
6427 return std::nullopt;
6428    // If only 2 elements are shuffled and the matching node has reversed
6429    // reuses, there is no need to track the order; both orders work fine.
6430 if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
6431 TE.getVectorFactor() == 2 && Mask.size() == 2 &&
6432 any_of(Range: enumerate(First: Entries.front().front()->ReuseShuffleIndices),
6433 P: [](const auto &P) {
6434 return P.value() % 2 != static_cast<int>(P.index()) % 2;
6435 }))
6436 return std::nullopt;
6437
6438 // Perfect match in the graph, will reuse the previously vectorized
6439 // node. Cost is 0.
6440 std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
6441 return CurrentOrder;
6442 }
6443 auto IsSplatMask = [](ArrayRef<int> Mask) {
6444 int SingleElt = PoisonMaskElem;
6445 return all_of(Range&: Mask, P: [&](int I) {
6446 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
6447 SingleElt = I;
6448 return I == PoisonMaskElem || I == SingleElt;
6449 });
6450 };
6451 // Exclusive broadcast mask - ignore.
6452 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
6453 (Entries.size() != 1 ||
6454 Entries.front().front()->ReorderIndices.empty())) ||
6455 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
6456 return std::nullopt;
6457 SmallBitVector ShuffledSubMasks(NumParts);
6458 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
6459 ArrayRef<int> Mask, int PartSz, int NumParts,
6460 function_ref<unsigned(unsigned)> GetVF) {
6461 for (int I : seq<int>(Begin: 0, End: NumParts)) {
6462 if (ShuffledSubMasks.test(Idx: I))
6463 continue;
6464 const int VF = GetVF(I);
6465 if (VF == 0)
6466 continue;
6467 unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
6468 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
6469 // Shuffle of at least 2 vectors - ignore.
6470 if (any_of(Range&: Slice, P: not_equal_to(Arg&: NumScalars))) {
6471 llvm::fill(Range&: Slice, Value&: NumScalars);
6472 ShuffledSubMasks.set(I);
6473 continue;
6474 }
6475      // Try to include as many elements from the mask as possible.
6476 int FirstMin = INT_MAX;
6477      bool SecondVecFound = false;
6478 for (int K : seq<int>(Size: Limit)) {
6479 int Idx = Mask[I * PartSz + K];
6480 if (Idx == PoisonMaskElem) {
6481 Value *V = GatheredScalars[I * PartSz + K];
6482 if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
6483 SecondVecFound = true;
6484 break;
6485 }
6486 continue;
6487 }
6488 if (Idx < VF) {
6489 if (FirstMin > Idx)
6490 FirstMin = Idx;
6491 } else {
6492 SecondVecFound = true;
6493 break;
6494 }
6495 }
6496 FirstMin = (FirstMin / PartSz) * PartSz;
6497 // Shuffle of at least 2 vectors - ignore.
6498 if (SecondVecFound) {
6499 llvm::fill(Range&: Slice, Value&: NumScalars);
6500 ShuffledSubMasks.set(I);
6501 continue;
6502 }
6503 for (int K : seq<int>(Size: Limit)) {
6504 int Idx = Mask[I * PartSz + K];
6505 if (Idx == PoisonMaskElem)
6506 continue;
6507 Idx -= FirstMin;
6508 if (Idx >= PartSz) {
6509 SecondVecFound = true;
6510 break;
6511 }
6512 if (CurrentOrder[I * PartSz + Idx] >
6513 static_cast<unsigned>(I * PartSz + K) &&
6514 CurrentOrder[I * PartSz + Idx] !=
6515 static_cast<unsigned>(I * PartSz + Idx))
6516 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
6517 }
6518 // Shuffle of at least 2 vectors - ignore.
6519 if (SecondVecFound) {
6520 llvm::fill(Range&: Slice, Value&: NumScalars);
6521 ShuffledSubMasks.set(I);
6522 continue;
6523 }
6524 }
6525 };
6526 int PartSz = getPartNumElems(Size: NumScalars, NumParts);
6527 if (!ExtractShuffles.empty())
6528 TransformMaskToOrder(
6529 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
6530 if (!ExtractShuffles[I])
6531 return 0U;
6532 unsigned VF = 0;
6533 unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
6534 for (unsigned Idx : seq<unsigned>(Size: Sz)) {
6535 int K = I * PartSz + Idx;
6536 if (ExtractMask[K] == PoisonMaskElem)
6537 continue;
6538 if (!TE.ReuseShuffleIndices.empty())
6539 K = TE.ReuseShuffleIndices[K];
6540 if (K == PoisonMaskElem)
6541 continue;
6542 if (!TE.ReorderIndices.empty())
6543 K = std::distance(first: TE.ReorderIndices.begin(),
6544 last: find(Range: TE.ReorderIndices, Val: K));
6545 auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
6546 if (!EI)
6547 continue;
6548 VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
6549 ->getElementCount()
6550 .getKnownMinValue());
6551 }
6552 return VF;
6553 });
6554 // Check special corner case - single shuffle of the same entry.
6555 if (GatherShuffles.size() == 1 && NumParts != 1) {
6556 if (ShuffledSubMasks.any())
6557 return std::nullopt;
6558 PartSz = NumScalars;
6559 NumParts = 1;
6560 }
6561 if (!Entries.empty())
6562 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
6563 if (!GatherShuffles[I])
6564 return 0U;
6565 return std::max(a: Entries[I].front()->getVectorFactor(),
6566 b: Entries[I].back()->getVectorFactor());
6567 });
6568 unsigned NumUndefs = count(Range&: CurrentOrder, Element: NumScalars);
6569 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
6570 return std::nullopt;
6571 return std::move(CurrentOrder);
6572}
6573
6574static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6575 const TargetLibraryInfo &TLI,
6576 bool CompareOpcodes = true) {
6577 if (getUnderlyingObject(V: Ptr1, MaxLookup: RecursionMaxDepth) !=
6578 getUnderlyingObject(V: Ptr2, MaxLookup: RecursionMaxDepth))
6579 return false;
6580 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
6581 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
6582 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6583 (!GEP2 || GEP2->getNumOperands() == 2) &&
6584 (((!GEP1 || isConstant(V: GEP1->getOperand(i_nocapture: 1))) &&
6585 (!GEP2 || isConstant(V: GEP2->getOperand(i_nocapture: 1)))) ||
6586 !CompareOpcodes ||
6587 (GEP1 && GEP2 &&
6588 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)));
6589}
6590
6591/// Calculates minimal alignment as a common alignment.
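/// For example, for accesses with alignments {8, 4, 16} the result is Align(4).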
6592template <typename T>
6593static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6594 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6595 for (Value *V : VL)
6596 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6597 return CommonAlignment;
6598}
6599
6600/// Check if \p Order represents reverse order.
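/// For example, {3, 2, 1, 0} is a reverse order; entries equal to the order
/// size (i.e. undefined positions) are ignored.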
6601static bool isReverseOrder(ArrayRef<unsigned> Order) {
6602 assert(!Order.empty() &&
6603 "Order is empty. Please check it before using isReverseOrder.");
6604 unsigned Sz = Order.size();
6605 return all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
6606 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6607 });
6608}
6609
6610/// Checks if the provided list of pointers \p PointerOps represents strided
6611/// pointers for type ElemTy. If they are not, nullptr is returned.
6612/// Otherwise, the SCEV of the stride value is returned.
6613/// If `PointerOps` can be rearranged into the following sequence:
6614/// ```
6615/// %x + c_0 * stride,
6616/// %x + c_1 * stride,
6617/// %x + c_2 * stride
6618/// ...
6619/// ```
6620/// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
6621/// and the SCEV of the `stride` will be returned.
6622static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
6623 const DataLayout &DL, ScalarEvolution &SE,
6624 SmallVectorImpl<unsigned> &SortedIndices,
6625 SmallVectorImpl<int64_t> &Coeffs) {
6626 assert(Coeffs.size() == PointerOps.size() &&
6627 "Coeffs vector needs to be of correct size");
6628 SmallVector<const SCEV *> SCEVs;
6629 const SCEV *PtrSCEVLowest = nullptr;
6630 const SCEV *PtrSCEVHighest = nullptr;
6631 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
6632 // addresses).
6633 for (Value *Ptr : PointerOps) {
6634 const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
6635 if (!PtrSCEV)
6636 return nullptr;
6637 SCEVs.push_back(Elt: PtrSCEV);
6638 if (!PtrSCEVLowest && !PtrSCEVHighest) {
6639 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
6640 continue;
6641 }
6642 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
6643 if (isa<SCEVCouldNotCompute>(Val: Diff))
6644 return nullptr;
6645 if (Diff->isNonConstantNegative()) {
6646 PtrSCEVLowest = PtrSCEV;
6647 continue;
6648 }
6649 const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
6650 if (isa<SCEVCouldNotCompute>(Val: Diff1))
6651 return nullptr;
6652 if (Diff1->isNonConstantNegative()) {
6653 PtrSCEVHighest = PtrSCEV;
6654 continue;
6655 }
6656 }
6657 // Dist = PtrSCEVHighest - PtrSCEVLowest;
6658 const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
6659 if (isa<SCEVCouldNotCompute>(Val: Dist))
6660 return nullptr;
6661 int Size = DL.getTypeStoreSize(Ty: ElemTy);
6662 auto TryGetStride = [&](const SCEV *Dist,
6663 const SCEV *Multiplier) -> const SCEV * {
6664 if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
6665 if (M->getOperand(i: 0) == Multiplier)
6666 return M->getOperand(i: 1);
6667 if (M->getOperand(i: 1) == Multiplier)
6668 return M->getOperand(i: 0);
6669 return nullptr;
6670 }
6671 if (Multiplier == Dist)
6672 return SE.getConstant(Ty: Dist->getType(), V: 1);
6673 return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
6674 };
6675  // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
6676 const SCEV *Stride = nullptr;
6677 if (Size != 1 || SCEVs.size() > 2) {
6678 const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
6679 Stride = TryGetStride(Dist, Sz);
6680 if (!Stride)
6681 return nullptr;
6682 }
6683 if (!Stride || isa<SCEVConstant>(Val: Stride))
6684 return nullptr;
6685 // Iterate through all pointers and check if all distances are
6686  // unique multiples of Stride.
6687 using DistOrdPair = std::pair<int64_t, int>;
6688 auto Compare = llvm::less_first();
6689 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
6690 int Cnt = 0;
6691 bool IsConsecutive = true;
6692 for (const auto [Idx, PtrSCEV] : enumerate(First&: SCEVs)) {
6693 unsigned Dist = 0;
6694 if (PtrSCEV != PtrSCEVLowest) {
6695 const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
6696 const SCEV *Coeff = TryGetStride(Diff, Stride);
6697 if (!Coeff)
6698 return nullptr;
6699 const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
6700 if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
6701 return nullptr;
6702 Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
6703 if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
6704 RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
6705 ->isZero())
6706 return nullptr;
6707 Dist = SC->getAPInt().getZExtValue();
6708 } else {
6709 Coeffs[Idx] = 0;
6710 }
6711    // The offset must be a multiple of Size and map to a lane within the vector.
6712 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
6713 return nullptr;
6714 auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
6715 if (!Res.second)
6716 return nullptr;
6717 // Consecutive order if the inserted element is the last one.
6718 IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
6719 ++Cnt;
6720 }
6721 if (Offsets.size() != SCEVs.size())
6722 return nullptr;
6723 SortedIndices.clear();
6724 if (!IsConsecutive) {
6725 // Fill SortedIndices array only if it is non-consecutive.
6726 SortedIndices.resize(N: PointerOps.size());
6727 Cnt = 0;
6728 for (const std::pair<int64_t, int> &Pair : Offsets) {
6729 SortedIndices[Cnt] = Pair.second;
6730 ++Cnt;
6731 }
6732 }
6733 return Stride;
6734}
6735
6736static std::pair<InstructionCost, InstructionCost>
6737getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6738 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6739 Type *ScalarTy, VectorType *VecTy);
6740
6741/// Returns the cost of the shuffle instructions with the given \p Kind, vector
6742/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
6743/// insert-subvector pattern.
6744static InstructionCost
6745getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
6746 VectorType *Tp, ArrayRef<int> Mask = {},
6747 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
6748 int Index = 0, VectorType *SubTp = nullptr,
6749 ArrayRef<const Value *> Args = {}) {
6750 VectorType *DstTy = Tp;
6751 if (!Mask.empty())
6752 DstTy = FixedVectorType::get(ElementType: Tp->getScalarType(), NumElts: Mask.size());
6753
6754 if (Kind != TTI::SK_PermuteTwoSrc)
6755 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
6756 Args);
6757 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
6758 int NumSubElts;
6759 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
6760 Mask, NumSrcElts, NumSubElts, Index)) {
6761 if (Index + NumSubElts > NumSrcElts &&
6762 Index + NumSrcElts <= static_cast<int>(Mask.size()))
6763 return TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy, SrcTy: Tp, Mask,
6764 CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
6765 }
6766 return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
6767 Args);
6768}
6769
6770/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
6771/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
6772/// instead of a scalar.
6773static InstructionCost
6774getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
6775 VectorType *Ty, const APInt &DemandedElts, bool Insert,
6776 bool Extract, TTI::TargetCostKind CostKind,
6777 bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
6778 assert(!isa<ScalableVectorType>(Ty) &&
6779 "ScalableVectorType is not supported.");
6780 assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
6781 getNumElements(Ty) &&
6782 "Incorrect usage.");
6783 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
6784 assert(SLPReVec && "Only supported by REVEC.");
6785 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
6786 // of CreateInsertElement.
6787 unsigned ScalarTyNumElements = VecTy->getNumElements();
6788 InstructionCost Cost = 0;
6789 for (unsigned I : seq(Size: DemandedElts.getBitWidth())) {
6790 if (!DemandedElts[I])
6791 continue;
6792 if (Insert)
6793 Cost += getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: Ty, Mask: {}, CostKind,
6794 Index: I * ScalarTyNumElements, SubTp: VecTy);
6795 if (Extract)
6796 Cost += getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: Ty, Mask: {}, CostKind,
6797 Index: I * ScalarTyNumElements, SubTp: VecTy);
6798 }
6799 return Cost;
6800 }
6801 return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
6802 CostKind, ForPoisonSrc, VL);
6803}
6804
6805/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6806/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6807static InstructionCost getVectorInstrCost(
6808 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6809 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6810 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6811 if (Opcode == Instruction::ExtractElement) {
6812 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
6813 assert(SLPReVec && "Only supported by REVEC.");
6814 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6815 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
6816 Tp: cast<VectorType>(Val), Mask: {}, CostKind,
6817 Index: Index * VecTy->getNumElements(), SubTp: VecTy);
6818 }
6819 }
6820 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6821 ScalarUserAndIdx);
6822}
6823
6824/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6825/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6826static InstructionCost getExtractWithExtendCost(
6827 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6828 VectorType *VecTy, unsigned Index,
6829 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
6830 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Val: Dst)) {
6831 assert(SLPReVec && "Only supported by REVEC.");
6832 auto *SubTp =
6833 getWidenedType(ScalarTy: VecTy->getElementType(), VF: ScalarTy->getNumElements());
6834 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: VecTy, Mask: {}, CostKind,
6835 Index: Index * ScalarTy->getNumElements(), SubTp) +
6836 TTI.getCastInstrCost(Opcode, Dst, Src: SubTp, CCH: TTI::CastContextHint::None,
6837 CostKind);
6838 }
6839 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6840}
6841
6842/// Creates a subvector insert. Generates the shuffle using \p Generator or,
6843/// if no generator is provided, using a default shuffle.
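/// For example, with the default shuffle, inserting a 4-element subvector into
/// a non-poison 8-element vector at index 4 uses the two-source mask
/// {0, 1, 2, 3, 8, 9, 10, 11}.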
6844static Value *createInsertVector(
6845 IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
6846 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
6847 if (isa<PoisonValue>(Val: Vec) && isa<PoisonValue>(Val: V))
6848 return Vec;
6849 const unsigned SubVecVF = getNumElements(Ty: V->getType());
6850  // Create a shuffle; insertvector requires the index to be a multiple of
6851  // the subvector length.
6852 const unsigned VecVF = getNumElements(Ty: Vec->getType());
6853 SmallVector<int> Mask(VecVF, PoisonMaskElem);
6854 if (isa<PoisonValue>(Val: Vec)) {
6855 auto *Begin = std::next(x: Mask.begin(), n: Index);
6856 std::iota(first: Begin, last: std::next(x: Begin, n: SubVecVF), value: 0);
6857 Vec = Builder.CreateShuffleVector(V, Mask);
6858 return Vec;
6859 }
6860 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
6861 std::iota(first: std::next(x: Mask.begin(), n: Index),
6862 last: std::next(x: Mask.begin(), n: Index + SubVecVF), value: VecVF);
6863 if (Generator)
6864 return Generator(Vec, V, Mask);
6865 // 1. Resize V to the size of Vec.
6866 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
6867 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: SubVecVF), value: 0);
6868 V = Builder.CreateShuffleVector(V, Mask: ResizeMask);
6869 // 2. Insert V into Vec.
6870 return Builder.CreateShuffleVector(V1: Vec, V2: V, Mask);
6871}
6872
6873/// Generates a subvector extract using a default shuffle.
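/// For example, extracting 4 elements starting at index 2 uses the mask
/// {2, 3, 4, 5}.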
6874static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6875 unsigned SubVecVF, unsigned Index) {
6876 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6877 std::iota(first: Mask.begin(), last: Mask.end(), value: Index);
6878 return Builder.CreateShuffleVector(V: Vec, Mask);
6879}
6880
6881/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
6882/// with \p Order.
6883/// \return true if the mask represents strided access, false - otherwise.
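/// For example, pointers at element offsets {0, 2, 4, 6} produce the mask
/// {0, 2, 4, 6} and are recognized as strided (stride 2), while offsets
/// {0, 1, 3, 6} produce the mask {0, 1, 3, 6} and are not strided.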
6884static bool buildCompressMask(ArrayRef<Value *> PointerOps,
6885 ArrayRef<unsigned> Order, Type *ScalarTy,
6886 const DataLayout &DL, ScalarEvolution &SE,
6887 SmallVectorImpl<int> &CompressMask) {
6888 const unsigned Sz = PointerOps.size();
6889 CompressMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
6890  // The first element is always set.
6891 CompressMask[0] = 0;
6892 // Check if the mask represents strided access.
6893 std::optional<unsigned> Stride = 0;
6894 Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
6895 for (unsigned I : seq<unsigned>(Begin: 1, End: Sz)) {
6896 Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
6897 std::optional<int64_t> OptPos =
6898 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
6899 if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
6900 return false;
6901 unsigned Pos = static_cast<unsigned>(*OptPos);
6902 CompressMask[I] = Pos;
6903 if (!Stride)
6904 continue;
6905 if (*Stride == 0) {
6906 *Stride = Pos;
6907 continue;
6908 }
6909 if (Pos != *Stride * I)
6910 Stride.reset();
6911 }
6912 return Stride.has_value();
6913}
6914
6915/// Checks if the \p VL can be transformed to a (masked)load + compress or
6916/// (masked) interleaved load.
6917static bool isMaskedLoadCompress(
6918 ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
6919 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
6920 const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
6921 const DominatorTree &DT, const TargetLibraryInfo &TLI,
6922 const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
6923 unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
6924 VectorType *&LoadVecTy) {
6925 InterleaveFactor = 0;
6926 Type *ScalarTy = VL.front()->getType();
6927 const size_t Sz = VL.size();
6928 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
6929 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6930 SmallVector<int> Mask;
6931 if (!Order.empty())
6932 inversePermutation(Indices: Order, Mask);
6933 // Check external uses.
6934 for (const auto [I, V] : enumerate(First&: VL)) {
6935 if (AreAllUsersVectorized(V))
6936 continue;
6937 InstructionCost ExtractCost =
6938 TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
6939 Index: Mask.empty() ? I : Mask[I]);
6940 InstructionCost ScalarCost =
6941 TTI.getInstructionCost(U: cast<Instruction>(Val: V), CostKind);
6942 if (ExtractCost <= ScalarCost)
6943 return false;
6944 }
6945 Value *Ptr0;
6946 Value *PtrN;
6947 if (Order.empty()) {
6948 Ptr0 = PointerOps.front();
6949 PtrN = PointerOps.back();
6950 } else {
6951 Ptr0 = PointerOps[Order.front()];
6952 PtrN = PointerOps[Order.back()];
6953 }
6954 std::optional<int64_t> Diff =
6955 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL, SE);
6956 if (!Diff)
6957 return false;
6958 const size_t MaxRegSize =
6959 TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
6960 .getFixedValue();
6961 // Check for very large distances between elements.
6962 if (*Diff / Sz >= MaxRegSize / 8)
6963 return false;
6964 LoadVecTy = getWidenedType(ScalarTy, VF: *Diff + 1);
6965 auto *LI = cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]);
6966 Align CommonAlignment = LI->getAlign();
6967 IsMasked = !isSafeToLoadUnconditionally(
6968 V: Ptr0, Ty: LoadVecTy, Alignment: CommonAlignment, DL,
6969 ScanFrom: cast<LoadInst>(Val: Order.empty() ? VL.back() : VL[Order.back()]), AC: &AC, DT: &DT,
6970 TLI: &TLI);
6971 if (IsMasked && !TTI.isLegalMaskedLoad(DataType: LoadVecTy, Alignment: CommonAlignment,
6972 AddressSpace: LI->getPointerAddressSpace()))
6973 return false;
6974 // TODO: perform the analysis of each scalar load for better
6975 // safe-load-unconditionally analysis.
6976 bool IsStrided =
6977 buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
6978 assert(CompressMask.size() >= 2 && "At least two elements are required");
6979 SmallVector<Value *> OrderedPointerOps(PointerOps);
6980 if (!Order.empty())
6981 reorderScalars(Scalars&: OrderedPointerOps, Mask);
6982 auto [ScalarGEPCost, VectorGEPCost] =
6983 getGEPCosts(TTI, Ptrs: OrderedPointerOps, BasePtr: OrderedPointerOps.front(),
6984 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy: LoadVecTy);
6985 // The cost of scalar loads.
6986 InstructionCost ScalarLoadsCost =
6987 std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
6988 binary_op: [&](InstructionCost C, Value *V) {
6989 return C + TTI.getInstructionCost(U: cast<Instruction>(Val: V),
6990 CostKind);
6991 }) +
6992 ScalarGEPCost;
6993 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
6994 InstructionCost GatherCost =
6995 getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
6996 /*Insert=*/true,
6997 /*Extract=*/false, CostKind) +
6998 ScalarLoadsCost;
6999 InstructionCost LoadCost = 0;
7000 if (IsMasked) {
7001 LoadCost = TTI.getMemIntrinsicInstrCost(
7002 MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
7003 CommonAlignment,
7004 LI->getPointerAddressSpace()),
7005 CostKind);
7006 } else {
7007 LoadCost =
7008 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
7009 AddressSpace: LI->getPointerAddressSpace(), CostKind);
7010 }
7011 if (IsStrided && !IsMasked && Order.empty()) {
7012     // Check for potential segmented (interleaved) loads.
7013 VectorType *AlignedLoadVecTy = getWidenedType(
7014 ScalarTy, VF: getFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: *Diff + 1));
7015 if (!isSafeToLoadUnconditionally(V: Ptr0, Ty: AlignedLoadVecTy, Alignment: CommonAlignment,
7016 DL, ScanFrom: cast<LoadInst>(Val: VL.back()), AC: &AC, DT: &DT,
7017 TLI: &TLI))
7018 AlignedLoadVecTy = LoadVecTy;
7019 if (TTI.isLegalInterleavedAccessType(VTy: AlignedLoadVecTy, Factor: CompressMask[1],
7020 Alignment: CommonAlignment,
7021 AddrSpace: LI->getPointerAddressSpace())) {
7022 InstructionCost InterleavedCost =
7023 VectorGEPCost + TTI.getInterleavedMemoryOpCost(
7024 Opcode: Instruction::Load, VecTy: AlignedLoadVecTy,
7025 Factor: CompressMask[1], Indices: {}, Alignment: CommonAlignment,
7026 AddressSpace: LI->getPointerAddressSpace(), CostKind, UseMaskForCond: IsMasked);
7027 if (InterleavedCost < GatherCost) {
7028 InterleaveFactor = CompressMask[1];
7029 LoadVecTy = AlignedLoadVecTy;
7030 return true;
7031 }
7032 }
7033 }
7034 InstructionCost CompressCost = ::getShuffleCost(
7035 TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: LoadVecTy, Mask: CompressMask, CostKind);
7036 if (!Order.empty()) {
7037 SmallVector<int> NewMask(Sz, PoisonMaskElem);
7038 for (unsigned I : seq<unsigned>(Size: Sz)) {
7039 NewMask[I] = CompressMask[Mask[I]];
7040 }
7041 CompressMask.swap(RHS&: NewMask);
7042 }
7043 InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
7044 return TotalVecCost < GatherCost;
7045}
7046
7047 /// Checks if the \p VL can be transformed to a (masked) load + compress or a
7048 /// (masked) interleaved load.
7049static bool
7050isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
7051 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
7052 const DataLayout &DL, ScalarEvolution &SE,
7053 AssumptionCache &AC, const DominatorTree &DT,
7054 const TargetLibraryInfo &TLI,
7055 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7056 bool IsMasked;
7057 unsigned InterleaveFactor;
7058 SmallVector<int> CompressMask;
7059 VectorType *LoadVecTy;
7060 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7061 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7062 CompressMask, LoadVecTy);
7063}
7064
7065 /// Checks if a strided load can be generated for the loads with pointer
7066 /// operands \p PointerOps:
7067/// 1. Target with strided load support is detected.
7068/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7069/// potential stride <= MaxProfitableLoadStride and the potential stride is
7070/// power-of-2 (to avoid perf regressions for the very small number of loads)
7071/// and max distance > number of loads, or potential stride is -1.
7072/// 3. The loads are ordered, or number of unordered loads <=
7073/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7074/// to avoid extra costs for very expensive shuffles).
7075/// 4. Any pointer operand is an instruction with the users outside of the
7076/// current graph (for masked gathers extra extractelement instructions
7077/// might be required).
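/// For example, 4 loads at element offsets {0, 3, 6, 9} (Diff == 9, Sz == 4)
/// satisfy Diff % (Sz - 1) == 0 and can form a strided load with stride 3,
/// subject to the profitability heuristics above and target legality.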
7078bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
7079 Align Alignment, const int64_t Diff,
7080 const size_t Sz) const {
7081 if (Diff % (Sz - 1) != 0)
7082 return false;
7083
7084 // Try to generate strided load node.
7085 auto IsAnyPointerUsedOutGraph = any_of(Range&: PointerOps, P: [&](Value *V) {
7086 return isa<Instruction>(Val: V) && any_of(Range: V->users(), P: [&](User *U) {
7087 return !isVectorized(V: U) && !MustGather.contains(Ptr: U);
7088 });
7089 });
7090
7091 const uint64_t AbsoluteDiff = std::abs(i: Diff);
7092 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
7093 if (IsAnyPointerUsedOutGraph ||
7094 (AbsoluteDiff > Sz &&
7095 (Sz > MinProfitableStridedLoads ||
7096 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7097 AbsoluteDiff % Sz == 0 && has_single_bit(Value: AbsoluteDiff / Sz)))) ||
7098 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7099 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7100 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7101 return false;
7102 if (!TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment))
7103 return false;
7104 return true;
7105 }
7106 return false;
7107}
7108
7109bool BoUpSLP::analyzeConstantStrideCandidate(
7110 const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
7111 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
7112 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
7113 const size_t Sz = PointerOps.size();
7114 SmallVector<int64_t> SortedOffsetsFromBase(Sz);
7115 // Go through `PointerOps` in sorted order and record offsets from
7116 // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
7117 // sortPtrAccesses only validates getPointersDiff for pairs relative to
7118 // PointerOps[0]. This is safe since only offset differences are used below.
7119 for (unsigned I : seq<unsigned>(Size: Sz)) {
7120 Value *Ptr =
7121 SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
7122 std::optional<int64_t> Offset =
7123 getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
7124 assert(Offset && "sortPtrAccesses should have validated this pointer");
7125 SortedOffsetsFromBase[I] = *Offset;
7126 }
7127
7128 // The code below checks that `SortedOffsetsFromBase` looks as follows:
7129 // ```
7130 // [
7131 // (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
7132   //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
7133 // ...
7134 // (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
7135 // GroupSize - 1}), // last group
7136 // ]
7137 // ```
7138 // The distance between consecutive elements within each group should all be
7139 // the same `StrideWithinGroup`. The distance between the first elements of
7140 // consecutive groups should all be the same `StrideBetweenGroups`.
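  // For example, sorted offsets {0, 1, 8, 9, 16, 17} form three groups of size
  // 2 with StrideWithinGroup == 1 and StrideBetweenGroups == 8; each group is
  // then widened to a single element of twice the scalar width and the whole
  // sequence becomes a 3-element strided load with a stride of 8 (in units of
  // the original scalar type) between groups.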
7141
7142 int64_t StrideWithinGroup =
7143 SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
7144 // Determine size of the first group. Later we will check that all other
7145 // groups have the same size.
7146 auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
7147 return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
7148 StrideWithinGroup;
7149 };
7150 auto Indices = seq<unsigned>(Begin: 1, End: Sz);
7151 auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
7152 unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;
7153
7154 unsigned VecSz = Sz;
7155 Type *NewScalarTy = ScalarTy;
7156
7157 // Quick detour: at this point we can say what the type of strided load would
7158 // be if all the checks pass. Check if this type is legal for the target.
7159 bool NeedsWidening = Sz != GroupSize;
7160 if (NeedsWidening) {
7161 if (Sz % GroupSize != 0)
7162 return false;
7163
7164 if (StrideWithinGroup != 1)
7165 return false;
7166 VecSz = Sz / GroupSize;
7167 NewScalarTy = Type::getIntNTy(
7168 C&: SE->getContext(),
7169 N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * GroupSize);
7170 }
7171
7172 if (!isStridedLoad(PointerOps, ScalarTy: NewScalarTy, Alignment, Diff, Sz: VecSz))
7173 return false;
7174
7175 int64_t StrideIntVal = StrideWithinGroup;
7176 if (NeedsWidening) {
7177 // Continue with checking the "shape" of `SortedOffsetsFromBase`.
7178 // Check that the strides between groups are all the same.
7179 unsigned CurrentGroupStartIdx = GroupSize;
7180 int64_t StrideBetweenGroups =
7181 SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
7182 StrideIntVal = StrideBetweenGroups;
7183 for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
7184 if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
7185 SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
7186 StrideBetweenGroups)
7187 return false;
7188 }
7189
7190 auto CheckGroup = [=](const unsigned StartIdx) -> bool {
7191 auto Indices = seq<unsigned>(Begin: StartIdx + 1, End: Sz);
7192 auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
7193 unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
7194 return GroupEndIdx - StartIdx == GroupSize;
7195 };
7196 for (unsigned I = 0; I < Sz; I += GroupSize) {
7197 if (!CheckGroup(I))
7198 return false;
7199 }
7200 }
7201
7202 Type *StrideTy = DL->getIndexType(PtrTy: Ptr0->getType());
7203 SPtrInfo.StrideVal = ConstantInt::getSigned(Ty: StrideTy, V: StrideIntVal);
7204 SPtrInfo.Ty = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
7205 return true;
7206}
7207
7208bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
7209 Type *ScalarTy, Align CommonAlignment,
7210 SmallVectorImpl<unsigned> &SortedIndices,
7211 StridedPtrInfo &SPtrInfo) const {
7212 // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
7213   // is constant, we partition the `PointerOps` sequence into subsequences of
7214   // pointers with the same offset. For each offset we record the values from
7215   // `PointerOps` and their indices in `PointerOps`.
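  // For example, with a runtime value %n, the pointers
  //   {%p, %p+1, %p+%n, %p+%n+1, %p+2*%n, %p+2*%n+1}
  // split into two offset groups {0, 1}; both groups share the same runtime
  // stride, so the sequence can be widened and loaded as a 3-element strided
  // load with a run-time stride, subject to the legality checks below.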
7216 SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
7217 OffsetToPointerOpIdxMap;
7218 for (auto [Idx, Ptr] : enumerate(First&: PointerOps)) {
7219 const SCEV *PtrSCEV = SE->getSCEV(V: Ptr);
7220 if (!PtrSCEV)
7221 return false;
7222
7223 const auto *Add = dyn_cast<SCEVAddExpr>(Val: PtrSCEV);
7224 int64_t Offset = 0;
7225 if (Add) {
7226       // The pointer may have a non-zero constant offset; extract it.
7227 for (int I : seq<int>(Size: Add->getNumOperands())) {
7228 const auto *SC = dyn_cast<SCEVConstant>(Val: Add->getOperand(i: I));
7229 if (!SC)
7230 continue;
7231 Offset = SC->getAPInt().getSExtValue();
7232 break;
7233 }
7234 }
7235 OffsetToPointerOpIdxMap[Offset].first.push_back(Elt: Ptr);
7236 OffsetToPointerOpIdxMap[Offset].second.push_back(Elt: Idx);
7237 }
7238 unsigned NumOffsets = OffsetToPointerOpIdxMap.size();
7239
7240 // Quick detour: at this point we can say what the type of strided load would
7241 // be if all the checks pass. Check if this type is legal for the target.
7242 const unsigned Sz = PointerOps.size();
7243 unsigned VecSz = Sz;
7244 Type *NewScalarTy = ScalarTy;
7245 if (NumOffsets > 1) {
7246 if (Sz % NumOffsets != 0)
7247 return false;
7248 VecSz = Sz / NumOffsets;
7249 NewScalarTy = Type::getIntNTy(
7250 C&: SE->getContext(),
7251 N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * NumOffsets);
7252 }
7253 FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
7254 if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(Ty: StridedLoadTy) ||
7255 !TTI->isLegalStridedLoadStore(DataType: StridedLoadTy, Alignment: CommonAlignment))
7256 return false;
7257
7258 // Check if the offsets are contiguous and that each group has the required
7259 // size.
7260 SmallVector<int64_t> SortedOffsetsV(NumOffsets);
7261 for (auto [Idx, MapPair] : enumerate(First&: OffsetToPointerOpIdxMap)) {
7262 if (MapPair.second.first.size() != VecSz)
7263 return false;
7264 SortedOffsetsV[Idx] = MapPair.first;
7265 }
7266 sort(C&: SortedOffsetsV);
7267
7268 if (NumOffsets > 1) {
7269 for (int I : seq<int>(Begin: 1, End: SortedOffsetsV.size())) {
7270 if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
7271 return false;
7272 }
7273 }
7274
7275 // Introduce some notation for the explanations below. Let `PointerOps_j`
7276 // denote the subsequence of `PointerOps` with offsets equal to
7277   // `SortedOffsetsV[j]`. Let `SortedIndices_j` be such that the sequence
7278 // ```
7279 // PointerOps_j[SortedIndices_j[0]],
7280 // PointerOps_j[SortedIndices_j[1]],
7281 // PointerOps_j[SortedIndices_j[2]],
7282 // ...
7283 // ```
7284 // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
7285 // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
7286 // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
7287 // The entire sorted `PointerOps` looks like this:
7288 // ```
7289 // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
7290 // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
7291 // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
7292 // ...
7293 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
7294 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
7295 //
7296 // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
7297 // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
7298 // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
7299 // ...
7300 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
7301 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
7302 //
7303 // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
7304 // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
7305 // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
7306 // ...
7307 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
7308 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
7309 // ...
7310 // ...
7311 // ...
7312 // PointerOps_0[SortedIndices_0[VecSz - 1]] =
7313 // PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
7314 // PointerOps_1[SortedIndices_1[VecSz - 1]] =
7315 // PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
7316 // PointerOps_2[SortedIndices_2[VecSz - 1]] =
7317 // PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
7318 // ...
7319 // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
7320 // PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
7321 // ```
7322 // In order to be able to generate a strided load, we need the following
7323 // checks to pass:
7324 //
7325 // (1) for each `PointerOps_j` check that the distance
7326 // between adjacent pointers are all equal to the same value (stride).
7327 // (2) for each `PointerOps_j` check that coefficients calculated by
7328 // `calculateRtStride` are all the same.
7329 //
7330 // As we do that, also calculate SortedIndices. Since we should not modify
7331 // `SortedIndices` unless we know that all the checks succeed, record the
7332   // indices into `SortedIndicesDraft`.
7333 SmallVector<unsigned> SortedIndicesDraft(Sz);
7334
7335 // Given sorted indices for a particular offset (as calculated by
7336 // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
7337 // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
7338 // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
7339 // \param `IndicesInAllPointerOps` vector of indices of the
7340 // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
7341 // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
7342 // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
7343 auto UpdateSortedIndices =
7344 [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
7345 ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
7346 if (SortedIndicesForOffset.empty()) {
7347 SortedIndicesForOffset.resize(N: IndicesInAllPointerOps.size());
7348 std::iota(first: SortedIndicesForOffset.begin(),
7349 last: SortedIndicesForOffset.end(), value: 0);
7350 }
7351 for (const auto [Num, Idx] : enumerate(First&: SortedIndicesForOffset)) {
7352 SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
7353 IndicesInAllPointerOps[Idx];
7354 }
7355 };
7356
7357 int64_t LowestOffset = SortedOffsetsV[0];
7358 ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;
7359
7360 SmallVector<int64_t> Coeffs0(VecSz);
7361 SmallVector<unsigned> SortedIndicesForOffset0;
7362 const SCEV *Stride0 = calculateRtStride(PointerOps: PointerOps0, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
7363 SortedIndices&: SortedIndicesForOffset0, Coeffs&: Coeffs0);
7364 if (!Stride0)
7365 return false;
7366 unsigned NumCoeffs0 = Coeffs0.size();
7367 if (NumCoeffs0 * NumOffsets != Sz)
7368 return false;
7369 sort(C&: Coeffs0);
7370
7371 ArrayRef<unsigned> IndicesInAllPointerOps0 =
7372 OffsetToPointerOpIdxMap[LowestOffset].second;
7373 UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);
7374
7375   // Now that we know what the common stride and coefficients have to be,
7376   // check the remaining `PointerOps_j`.
7377 SmallVector<int64_t> Coeffs;
7378 SmallVector<unsigned> SortedIndicesForOffset;
7379 for (int J : seq<int>(Begin: 1, End: NumOffsets)) {
7380 Coeffs.clear();
7381 Coeffs.resize(N: VecSz);
7382 SortedIndicesForOffset.clear();
7383
7384 int64_t Offset = SortedOffsetsV[J];
7385 ArrayRef<Value *> PointerOpsForOffset =
7386 OffsetToPointerOpIdxMap[Offset].first;
7387 ArrayRef<unsigned> IndicesInAllPointerOps =
7388 OffsetToPointerOpIdxMap[Offset].second;
7389 const SCEV *StrideWithinGroup =
7390 calculateRtStride(PointerOps: PointerOpsForOffset, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
7391 SortedIndices&: SortedIndicesForOffset, Coeffs);
7392
7393 if (!StrideWithinGroup || StrideWithinGroup != Stride0)
7394 return false;
7395 if (Coeffs.size() != NumCoeffs0)
7396 return false;
7397 sort(C&: Coeffs);
7398 if (Coeffs != Coeffs0)
7399 return false;
7400
7401 UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
7402 }
7403
7404 SortedIndices.clear();
7405 SortedIndices = std::move(SortedIndicesDraft);
7406 SPtrInfo.StrideSCEV = Stride0;
7407 SPtrInfo.Ty = StridedLoadTy;
7408 return true;
7409}
7410
7411BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
7412 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
7413 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
7414 unsigned *BestVF, bool TryRecursiveCheck) const {
7415 // Check that a vectorized load would load the same memory as a scalar
7416   // load. For example, we don't want to vectorize loads that are smaller
7417   // than 8 bits: even for a packed struct {<i2, i2, i2, i2>}, LLVM
7418   // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7419 // from such a struct, we read/write packed bits disagreeing with the
7420 // unvectorized version.
7421 if (BestVF)
7422 *BestVF = 0;
7423 if (areKnownNonVectorizableLoads(VL))
7424 return LoadsState::Gather;
7425 Type *ScalarTy = VL0->getType();
7426
7427 if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
7428 return LoadsState::Gather;
7429
7430 // Make sure all loads in the bundle are simple - we can't vectorize
7431 // atomic or volatile loads.
7432 PointerOps.clear();
7433 const size_t Sz = VL.size();
7434 PointerOps.resize(N: Sz);
7435 auto *POIter = PointerOps.begin();
7436 for (Value *V : VL) {
7437 auto *L = dyn_cast<LoadInst>(Val: V);
7438 if (!L || !L->isSimple())
7439 return LoadsState::Gather;
7440 *POIter = L->getPointerOperand();
7441 ++POIter;
7442 }
7443
7444 Order.clear();
7445 // Check the order of pointer operands or that all pointers are the same.
7446 bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);
7447
7448 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
7449 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
7450 if (!IsSorted) {
7451 if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, SortedIndices&: Order,
7452 SPtrInfo))
7453 return LoadsState::StridedVectorize;
7454
7455 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
7456 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
7457 return LoadsState::Gather;
7458
7459 if (!all_of(Range&: PointerOps, P: [&](Value *P) {
7460 return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
7461 }))
7462 return LoadsState::Gather;
7463
7464 } else {
7465 Value *Ptr0;
7466 Value *PtrN;
7467 if (Order.empty()) {
7468 Ptr0 = PointerOps.front();
7469 PtrN = PointerOps.back();
7470 } else {
7471 Ptr0 = PointerOps[Order.front()];
7472 PtrN = PointerOps[Order.back()];
7473 }
7474 // sortPtrAccesses validates getPointersDiff for all pointers relative to
7475 // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
7476 // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
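    // For example, if the sorted element offsets relative to PointerOps[0] are
    // {0, 1, 2, 3}, then Diff == 3 == Sz - 1 and the bundle is a plain
    // consecutive vector load.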
7477 std::optional<int64_t> Diff0 =
7478 getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr0, DL: *DL, SE&: *SE);
7479 std::optional<int64_t> DiffN =
7480 getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
7481 assert(Diff0 && DiffN &&
7482 "sortPtrAccesses should have validated these pointers");
7483 int64_t Diff = *DiffN - *Diff0;
7484 // Check that the sorted loads are consecutive.
7485 if (static_cast<uint64_t>(Diff) == Sz - 1)
7486 return LoadsState::Vectorize;
7487 if (isMaskedLoadCompress(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
7488 TLI: *TLI, AreAllUsersVectorized: [&](Value *V) {
7489 return areAllUsersVectorized(
7490 I: cast<Instruction>(Val: V), VectorizedVals: UserIgnoreList);
7491 }))
7492 return LoadsState::CompressVectorize;
7493 Align Alignment =
7494 cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
7495 ->getAlign();
7496 if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, SortedIndices: Order,
7497 Diff, Ptr0, PtrN, SPtrInfo))
7498 return LoadsState::StridedVectorize;
7499 }
7500 if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
7501 TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
7502 return LoadsState::Gather;
7503   // Compare the cost of a masked gather against loads + shuffles (or plain
7504   // gathering). Returns true if the masked gather is not profitable, i.e. a
7505   // vectorized-loads + shuffles or gather representation is better.
7506 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
7507 unsigned *BestVF,
7508 bool ProfitableGatherPointers) {
7509 if (BestVF)
7510 *BestVF = 0;
7511 // Compare masked gather cost and loads + insert subvector costs.
7512 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7513 auto [ScalarGEPCost, VectorGEPCost] =
7514 getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(),
7515 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
7516 // Estimate the cost of masked gather GEP. If not a splat, roughly
7517 // estimate as a buildvector, otherwise estimate as splat.
7518 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
7519 Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
7520 VectorType *PtrVecTy = getWidenedType(ScalarTy: PtrScalarTy, VF: Sz);
7521 if (static_cast<unsigned>(count_if(
7522 Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
7523 any_of(Range&: PointerOps, P: [&](Value *V) {
7524 return getUnderlyingObject(V) !=
7525 getUnderlyingObject(V: PointerOps.front());
7526 }))
7527 VectorGEPCost += getScalarizationOverhead(TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy,
7528 DemandedElts, /*Insert=*/true,
7529 /*Extract=*/false, CostKind);
7530 else
7531 VectorGEPCost +=
7532 getScalarizationOverhead(
7533 TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, DemandedElts: APInt::getOneBitSet(numBits: Sz, BitNo: 0),
7534 /*Insert=*/true, /*Extract=*/false, CostKind) +
7535 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: PtrVecTy, Mask: {}, CostKind);
7536 // The cost of scalar loads.
7537 InstructionCost ScalarLoadsCost =
7538 std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
7539 binary_op: [&](InstructionCost C, Value *V) {
7540 return C + TTI.getInstructionCost(
7541 U: cast<Instruction>(Val: V), CostKind);
7542 }) +
7543 ScalarGEPCost;
7544 // The cost of masked gather.
7545 InstructionCost MaskedGatherCost =
7546 TTI.getMemIntrinsicInstrCost(
7547 MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
7548 cast<LoadInst>(Val: VL0)->getPointerOperand(),
7549 /*VariableMask=*/false, CommonAlignment),
7550 CostKind) +
7551 (ProfitableGatherPointers ? 0 : VectorGEPCost);
7552 InstructionCost GatherCost =
7553 getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
7554 /*Insert=*/true,
7555 /*Extract=*/false, CostKind) +
7556 ScalarLoadsCost;
7557     // The list of loads is small, or a partial check is already being
7558     // performed - directly compare the masked gather cost and the gather cost.
7559 constexpr unsigned ListLimit = 4;
7560 if (!TryRecursiveCheck || VL.size() < ListLimit)
7561 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7562
7563 // FIXME: The following code has not been updated for non-power-of-2
7564 // vectors (and not whole registers). The splitting logic here does not
7565 // cover the original vector if the vector factor is not a power of two.
7566 if (!hasFullVectorsOrPowerOf2(TTI, Ty: ScalarTy, Sz: VL.size()))
7567 return false;
7568
7569 unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
7570 unsigned MinVF = getMinVF(Sz: 2 * Sz);
7571 DemandedElts.clearAllBits();
7572 // Iterate through possible vectorization factors and check if vectorized +
7573 // shuffles is better than just gather.
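    // For example, for 16 loads on a target with power-of-two vectors the
    // candidate factors are typically 8, then 4, then 2; each factor splits VL
    // into contiguous slices that are re-analyzed with canVectorizeLoads.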
7574 for (unsigned VF =
7575 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VL.size() - 1);
7576 VF >= MinVF;
7577 VF = getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VF - 1)) {
7578 SmallVector<LoadsState> States;
7579 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
7580 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
7581 SmallVector<unsigned> Order;
7582 SmallVector<Value *> PointerOps;
7583 LoadsState LS = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
7584 PointerOps, SPtrInfo, BestVF,
7585 /*TryRecursiveCheck=*/false);
7586         // If the slice cannot be vectorized at all, record its lanes as
             // gathered.
7587 if (LS == LoadsState::Gather) {
7588 if (BestVF) {
7589 DemandedElts.setAllBits();
7590 break;
7591 }
7592 DemandedElts.setBits(loBit: Cnt, hiBit: Cnt + VF);
7593 continue;
7594 }
7595       // If a reorder is needed, treat it as a high-cost masked gather for now.
7596 if ((LS == LoadsState::Vectorize ||
7597 LS == LoadsState::StridedVectorize ||
7598 LS == LoadsState::CompressVectorize) &&
7599 !Order.empty() && !isReverseOrder(Order))
7600 LS = LoadsState::ScatterVectorize;
7601 States.push_back(Elt: LS);
7602 }
7603 if (DemandedElts.isAllOnes())
7604 // All loads gathered - try smaller VF.
7605 continue;
7606       // Can be vectorized later as a series of loads/insertelements.
7607 InstructionCost VecLdCost = 0;
7608 if (!DemandedElts.isZero()) {
7609 VecLdCost = getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
7610 /*Insert=*/true,
7611 /*Extract=*/false, CostKind) +
7612 ScalarGEPCost;
7613 for (unsigned Idx : seq<unsigned>(Size: VL.size()))
7614 if (DemandedElts[Idx])
7615 VecLdCost +=
7616 TTI.getInstructionCost(U: cast<Instruction>(Val: VL[Idx]), CostKind);
7617 }
7618 auto *SubVecTy = getWidenedType(ScalarTy, VF);
7619 for (auto [I, LS] : enumerate(First&: States)) {
7620 auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
7621 InstructionCost VectorGEPCost =
7622 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
7623 ? 0
7624 : getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
7625 BasePtr: LI0->getPointerOperand(),
7626 Opcode: Instruction::GetElementPtr, CostKind, ScalarTy,
7627 VecTy: SubVecTy)
7628 .second;
7629 if (LS == LoadsState::ScatterVectorize) {
7630 if (static_cast<unsigned>(
7631 count_if(Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) <
7632 PointerOps.size() - 1 ||
7633 any_of(Range&: PointerOps, P: [&](Value *V) {
7634 return getUnderlyingObject(V) !=
7635 getUnderlyingObject(V: PointerOps.front());
7636 }))
7637 VectorGEPCost += getScalarizationOverhead(
7638 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getAllOnes(numBits: VF),
7639 /*Insert=*/true, /*Extract=*/false, CostKind);
7640 else
7641 VectorGEPCost +=
7642 getScalarizationOverhead(
7643 TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getOneBitSet(numBits: VF, BitNo: 0),
7644 /*Insert=*/true, /*Extract=*/false, CostKind) +
7645 ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: SubVecTy, Mask: {},
7646 CostKind);
7647 }
7648 switch (LS) {
7649 case LoadsState::Vectorize:
7650 VecLdCost +=
7651 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
7652 AddressSpace: LI0->getPointerAddressSpace(), CostKind,
7653 OpdInfo: TTI::OperandValueInfo()) +
7654 VectorGEPCost;
7655 break;
7656 case LoadsState::StridedVectorize:
7657 VecLdCost += TTI.getMemIntrinsicInstrCost(
7658 MICA: MemIntrinsicCostAttributes(
7659 Intrinsic::experimental_vp_strided_load,
7660 SubVecTy, LI0->getPointerOperand(),
7661 /*VariableMask=*/false, CommonAlignment),
7662 CostKind) +
7663 VectorGEPCost;
7664 break;
7665 case LoadsState::CompressVectorize:
7666 VecLdCost += TTI.getMemIntrinsicInstrCost(
7667 MICA: MemIntrinsicCostAttributes(
7668 Intrinsic::masked_load, SubVecTy,
7669 CommonAlignment, LI0->getPointerAddressSpace()),
7670 CostKind) +
7671 ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SubVecTy,
7672 Mask: {}, CostKind);
7673 break;
7674 case LoadsState::ScatterVectorize:
7675 VecLdCost += TTI.getMemIntrinsicInstrCost(
7676 MICA: MemIntrinsicCostAttributes(
7677 Intrinsic::masked_gather, SubVecTy,
7678 LI0->getPointerOperand(),
7679 /*VariableMask=*/false, CommonAlignment),
7680 CostKind) +
7681 VectorGEPCost;
7682 break;
7683 case LoadsState::Gather:
7684 // Gathers are already calculated - ignore.
7685 continue;
7686 }
7687 SmallVector<int> ShuffleMask(VL.size());
7688 for (int Idx : seq<int>(Begin: 0, End: VL.size()))
7689 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
7690 if (I > 0)
7691 VecLdCost +=
7692 ::getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
7693 CostKind, Index: I * VF, SubTp: SubVecTy);
7694 }
7695       // If the masked gather cost is higher and the split loads + shuffles
7696       // beat plain gathering, report success: the bundle is treated as a
7697       // gather node for now and is estimated more precisely later.
7698 if (MaskedGatherCost >= VecLdCost &&
7699 VecLdCost - GatherCost < -SLPCostThreshold) {
7700 if (BestVF)
7701 *BestVF = VF;
7702 return true;
7703 }
7704 }
7705 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
7706 };
7707 // TODO: need to improve analysis of the pointers, if not all of them are
7708 // GEPs or have > 2 operands, we end up with a gather node, which just
7709 // increases the cost.
7710 Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
7711 bool ProfitableGatherPointers =
7712 L && Sz > 2 && static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
7713 return L->isLoopInvariant(V);
7714 })) <= Sz / 2;
7715 if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [](Value *P) {
7716 auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
7717 return (!GEP && doesNotNeedToBeScheduled(V: P)) ||
7718 (GEP && GEP->getNumOperands() == 2 &&
7719 isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
7720 })) {
7721     // Check if the potential masked gather can be represented as a series
7722     // of loads + insertsubvectors.
7723     // If the masked gather cost is higher, it is better to vectorize, so
7724     // treat the bundle as a gather node for now; it will be estimated more
7725     // precisely later.
7726 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
7727 ProfitableGatherPointers))
7728 return LoadsState::ScatterVectorize;
7729 }
7730
7731 return LoadsState::Gather;
7732}
7733
7734static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
7735 ArrayRef<BasicBlock *> BBs, Type *ElemTy,
7736 const DataLayout &DL, ScalarEvolution &SE,
7737 SmallVectorImpl<unsigned> &SortedIndices) {
7738 assert(
7739 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
7740 "Expected list of pointer operands.");
7741   // Map from (basic block, base) pairs to vectors of (Ptr, Offset, OrigIdx).
7742   // Each Ptr is inserted into the vector it has a constant offset from; the
7743   // vectors are sorted and indices returned with related pointers adjacent.
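  // For example, for pointers into two objects A and B interleaved as
  //   {A, B, A+1, B+1, A+2, B+2, A+3, B+3}
  // the pointers are grouped per base and the returned order is
  //   {0, 2, 4, 6, 1, 3, 5, 7},
  // placing the A-based and B-based pointers next to each other.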
7744 SmallMapVector<
7745 std::pair<BasicBlock *, Value *>,
7746 SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
7747 Bases;
7748 Bases
7749 .try_emplace(Key: std::make_pair(
7750 x: BBs.front(), y: getUnderlyingObject(V: VL.front(), MaxLookup: RecursionMaxDepth)))
7751 .first->second.emplace_back().emplace_back(Args: VL.front(), Args: 0U, Args: 0U);
7752
7753 SortedIndices.clear();
7754 for (auto [Cnt, Ptr] : enumerate(First: VL.drop_front())) {
7755 auto Key = std::make_pair(x: BBs[Cnt + 1],
7756 y: getUnderlyingObject(V: Ptr, MaxLookup: RecursionMaxDepth));
7757 bool Found = any_of(Range&: Bases.try_emplace(Key).first->second,
7758 P: [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
7759 std::optional<int64_t> Diff =
7760 getPointersDiff(ElemTy, std::get<0>(Base.front()),
7761 ElemTy, Ptr, DL, SE,
7762 /*StrictCheck=*/true);
7763 if (!Diff)
7764 return false;
7765
7766 Base.emplace_back(Ptr, *Diff, Cnt + 1);
7767 return true;
7768 });
7769
7770 if (!Found) {
7771 // If we haven't found enough to usefully cluster, return early.
7772 if (Bases.size() > VL.size() / 2 - 1)
7773 return false;
7774
7775 // Not found already - add a new Base
7776 Bases.find(Key)->second.emplace_back().emplace_back(Args: Ptr, Args: 0, Args: Cnt + 1);
7777 }
7778 }
7779
7780 if (Bases.size() == VL.size())
7781 return false;
7782
7783 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
7784 Bases.front().second.size() == VL.size()))
7785 return false;
7786
7787   // For each of the bases sort the pointers by offset and check whether the
7788   // pointers within any base form a consecutive run.
7789 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
7790 SmallPtrSet<Value *, 13> FirstPointers;
7791 SmallPtrSet<Value *, 13> SecondPointers;
7792 Value *P1 = Ptr1;
7793 Value *P2 = Ptr2;
7794 unsigned Depth = 0;
7795 while (!FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1)) {
7796 if (P1 == P2 || Depth > RecursionMaxDepth)
7797 return false;
7798 FirstPointers.insert(Ptr: P1);
7799 SecondPointers.insert(Ptr: P2);
7800 P1 = getUnderlyingObject(V: P1, /*MaxLookup=*/1);
7801 P2 = getUnderlyingObject(V: P2, /*MaxLookup=*/1);
7802 ++Depth;
7803 }
7804 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
7805 "Unable to find matching root.");
7806 return FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1);
7807 };
7808 for (auto &Base : Bases) {
7809 for (auto &Vec : Base.second) {
7810 if (Vec.size() > 1) {
7811 stable_sort(Range&: Vec, C: llvm::less_second());
7812 int64_t InitialOffset = std::get<1>(t&: Vec[0]);
7813 bool AnyConsecutive =
7814 all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
7815 return std::get<1>(P.value()) ==
7816 int64_t(P.index()) + InitialOffset;
7817 });
7818 // Fill SortedIndices array only if it looks worth-while to sort the
7819 // ptrs.
7820 if (!AnyConsecutive)
7821 return false;
7822 }
7823 }
7824 stable_sort(Range&: Base.second, C: [&](const auto &V1, const auto &V2) {
7825 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
7826 });
7827 }
7828
7829 for (auto &T : Bases)
7830 for (const auto &Vec : T.second)
7831 for (const auto &P : Vec)
7832 SortedIndices.push_back(Elt: std::get<2>(t: P));
7833
7834 assert(SortedIndices.size() == VL.size() &&
7835 "Expected SortedIndices to be the size of VL");
7836 return true;
7837}
7838
7839std::optional<BoUpSLP::OrdersType>
7840BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7841 assert(TE.isGather() && "Expected gather node only.");
7842 Type *ScalarTy = TE.Scalars[0]->getType();
7843
7844 SmallVector<Value *> Ptrs;
7845 Ptrs.reserve(N: TE.Scalars.size());
7846 SmallVector<BasicBlock *> BBs;
7847 BBs.reserve(N: TE.Scalars.size());
7848 for (Value *V : TE.Scalars) {
7849 auto *L = dyn_cast<LoadInst>(Val: V);
7850 if (!L || !L->isSimple())
7851 return std::nullopt;
7852 Ptrs.push_back(Elt: L->getPointerOperand());
7853 BBs.push_back(Elt: L->getParent());
7854 }
7855
7856 BoUpSLP::OrdersType Order;
7857 if (!LoadEntriesToVectorize.contains(key: TE.Idx) &&
7858 clusterSortPtrAccesses(VL: Ptrs, BBs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
7859 return std::move(Order);
7860 return std::nullopt;
7861}
7862
7863/// Check if two insertelement instructions are from the same buildvector.
7864static bool areTwoInsertFromSameBuildVector(
7865 InsertElementInst *VU, InsertElementInst *V,
7866 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
7867   // Instructions must be from the same basic block.
7868 if (VU->getParent() != V->getParent())
7869 return false;
7870 // Checks if 2 insertelements are from the same buildvector.
7871 if (VU->getType() != V->getType())
7872 return false;
7873 // Multiple used inserts are separate nodes.
7874 if (!VU->hasOneUse() && !V->hasOneUse())
7875 return false;
7876 auto *IE1 = VU;
7877 auto *IE2 = V;
7878 std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
7879 std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
7880 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
7881 return false;
7882 // Go through the vector operand of insertelement instructions trying to find
7883 // either VU as the original vector for IE2 or V as the original vector for
7884 // IE1.
7885 SmallBitVector ReusedIdx(
7886 cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
7887 bool IsReusedIdx = false;
7888 do {
7889 if (IE2 == VU && !IE1)
7890 return VU->hasOneUse();
7891 if (IE1 == V && !IE2)
7892 return V->hasOneUse();
7893 if (IE1 && IE1 != V) {
7894 unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
7895 IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
7896 ReusedIdx.set(Idx1);
7897 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
7898 IE1 = nullptr;
7899 else
7900 IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
7901 }
7902 if (IE2 && IE2 != VU) {
7903 unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
7904 IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
7905 ReusedIdx.set(Idx2);
7906 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
7907 IE2 = nullptr;
7908 else
7909 IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
7910 }
7911 } while (!IsReusedIdx && (IE1 || IE2));
7912 return false;
7913}
7914
7915/// Checks if the specified instruction \p I is an alternate operation for
7916/// the given \p MainOp and \p AltOp instructions.
7917static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7918 Instruction *AltOp,
7919 const TargetLibraryInfo &TLI);
7920
7921std::optional<BoUpSLP::OrdersType>
7922BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
7923 bool IgnoreReorder) {
7924   // No need to reorder if we have to shuffle reuses - the node still needs to
7925   // be shuffled anyway.
7926 if (!TE.ReuseShuffleIndices.empty()) {
7927 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
7928 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
7929 "Reshuffling scalars not yet supported for nodes with padding");
7930
7931 if (isSplat(VL: TE.Scalars))
7932 return std::nullopt;
7933 // Check if reuse shuffle indices can be improved by reordering.
7934     // For this, check that the reuse mask is "clustered", i.e. each scalar
7935     // value is used once in each submask of size <number_of_scalars>.
7936 // Example: 4 scalar values.
7937 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
7938 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
7939 // element 3 is used twice in the second submask.
7940 unsigned Sz = TE.Scalars.size();
7941 if (TE.isGather()) {
7942 if (std::optional<OrdersType> CurrentOrder =
7943 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
7944 SmallVector<int> Mask;
7945 fixupOrderingIndices(Order: *CurrentOrder);
7946 inversePermutation(Indices: *CurrentOrder, Mask);
7947 ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
7948 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
7949 unsigned Sz = TE.Scalars.size();
7950 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
7951 for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
7952 if (Idx != PoisonMaskElem)
7953 Res[Idx + K * Sz] = I + K * Sz;
7954 }
7955 return std::move(Res);
7956 }
7957 }
7958 if (Sz == 2 && TE.getVectorFactor() == 4 &&
7959 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
7960 VF: 2 * TE.getVectorFactor())) == 1)
7961 return std::nullopt;
7962 if (TE.ReuseShuffleIndices.size() % Sz != 0)
7963 return std::nullopt;
7964 if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
7965 VF: Sz)) {
7966 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
7967 if (TE.ReorderIndices.empty())
7968 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
7969 else
7970 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
7971 ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
7972 unsigned VF = ReorderMask.size();
7973 OrdersType ResOrder(VF, VF);
7974 unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
7975 SmallBitVector UsedVals(NumParts);
7976 for (unsigned I = 0; I < VF; I += Sz) {
7977 int Val = PoisonMaskElem;
7978 unsigned UndefCnt = 0;
7979 unsigned Limit = std::min(a: Sz, b: VF - I);
7980 if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
7981 P: [&](int Idx) {
7982 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
7983 Val = Idx;
7984 if (Idx == PoisonMaskElem)
7985 ++UndefCnt;
7986 return Idx != PoisonMaskElem && Idx != Val;
7987 }) ||
7988 Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
7989 UndefCnt > Sz / 2)
7990 return std::nullopt;
7991 UsedVals.set(Val);
7992 for (unsigned K = 0; K < NumParts; ++K) {
7993 unsigned Idx = Val + Sz * K;
7994 if (Idx < VF && I + K < VF)
7995 ResOrder[Idx] = I + K;
7996 }
7997 }
7998 return std::move(ResOrder);
7999 }
8000 unsigned VF = TE.getVectorFactor();
8001     // Try to build the correct order for extractelement instructions.
8002 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
8003 TE.ReuseShuffleIndices.end());
8004 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
8005 all_of(Range: TE.Scalars, P: [Sz](Value *V) {
8006 if (isa<PoisonValue>(Val: V))
8007 return true;
8008 std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
8009 return Idx && *Idx < Sz;
8010 })) {
8011 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
8012 "by BinaryOperator and CastInst.");
8013 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
8014 if (TE.ReorderIndices.empty())
8015 std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
8016 else
8017 inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
8018 for (unsigned I = 0; I < VF; ++I) {
8019 int &Idx = ReusedMask[I];
8020 if (Idx == PoisonMaskElem)
8021 continue;
8022 Value *V = TE.Scalars[ReorderMask[Idx]];
8023 std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
8024 Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
8025 }
8026 }
8027     // Build an order of VF size; the reuses shuffles need to be reordered and
8028     // they are always of VF size.
8029 OrdersType ResOrder(VF);
8030 std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
8031 auto *It = ResOrder.begin();
8032 for (unsigned K = 0; K < VF; K += Sz) {
8033 OrdersType CurrentOrder(TE.ReorderIndices);
8034 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
8035 if (SubMask.front() == PoisonMaskElem)
8036 std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
8037 reorderOrder(Order&: CurrentOrder, Mask: SubMask);
8038 transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
8039 std::advance(i&: It, n: Sz);
8040 }
8041 if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
8042 return Data.index() == Data.value();
8043 }))
8044 return std::nullopt; // No need to reorder.
8045 return std::move(ResOrder);
8046 }
8047 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
8048 (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
8049 !Instruction::isBinaryOp(Opcode: TE.UserTreeIndex.UserTE->getOpcode())) &&
8050 (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
8051 return std::nullopt;
8052 if (TE.State == TreeEntry::SplitVectorize ||
8053 ((TE.State == TreeEntry::Vectorize ||
8054 TE.State == TreeEntry::StridedVectorize ||
8055 TE.State == TreeEntry::CompressVectorize) &&
8056 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
8057 (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))))) {
8058 assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
8059 "Alternate instructions are only supported by "
8060 "BinaryOperator and CastInst.");
8061 return TE.ReorderIndices;
8062 }
8063 if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
8064 TE.isAltShuffle()) {
8065 assert(TE.ReuseShuffleIndices.empty() &&
8066 "ReuseShuffleIndices should be "
8067 "empty for alternate instructions.");
8068 SmallVector<int> Mask;
8069 TE.buildAltOpShuffleMask(
8070 IsAltOp: [&](Instruction *I) {
8071 assert(TE.getMatchingMainOpOrAltOp(I) &&
8072 "Unexpected main/alternate opcode");
8073 return isAlternateInstruction(I, MainOp: TE.getMainOp(), AltOp: TE.getAltOp(), TLI: *TLI);
8074 },
8075 Mask);
8076 const int VF = TE.getVectorFactor();
8077 OrdersType ResOrder(VF, VF);
8078 for (unsigned I : seq<unsigned>(Size: VF)) {
8079 if (Mask[I] == PoisonMaskElem)
8080 continue;
8081 ResOrder[Mask[I] % VF] = I;
8082 }
8083 return std::move(ResOrder);
8084 }
8085 if (!TE.ReorderIndices.empty())
8086 return TE.ReorderIndices;
8087 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
8088 if (!TE.ReorderIndices.empty())
8089 return TE.ReorderIndices;
8090
8091 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
8092 for (auto [I, V] : zip(t&: UserBVHead, u: TE.Scalars)) {
8093 if (isa<Constant>(Val: V) || !V->hasNUsesOrMore(N: 1))
8094 continue;
8095 auto *II = dyn_cast<InsertElementInst>(Val: *V->user_begin());
8096 if (!II)
8097 continue;
8098 Instruction *BVHead = nullptr;
8099 BasicBlock *BB = II->getParent();
8100 while (II && II->hasOneUse() && II->getParent() == BB) {
8101 BVHead = II;
8102 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
8103 }
8104 I = BVHead;
8105 }
8106
8107 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
8108 assert(BB1 != BB2 && "Expected different basic blocks.");
8109 if (!DT->isReachableFromEntry(A: BB1))
8110 return false;
8111 if (!DT->isReachableFromEntry(A: BB2))
8112 return true;
8113 auto *NodeA = DT->getNode(BB: BB1);
8114 auto *NodeB = DT->getNode(BB: BB2);
8115 assert(NodeA && "Should only process reachable instructions");
8116 assert(NodeB && "Should only process reachable instructions");
8117 assert((NodeA == NodeB) ==
8118 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
8119 "Different nodes should have different DFS numbers");
8120 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
8121 };
8122 auto PHICompare = [&](unsigned I1, unsigned I2) {
8123 Value *V1 = TE.Scalars[I1];
8124 Value *V2 = TE.Scalars[I2];
8125 if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
8126 return false;
8127 if (isa<PoisonValue>(Val: V1))
8128 return true;
8129 if (isa<PoisonValue>(Val: V2))
8130 return false;
8131 if (V1->getNumUses() < V2->getNumUses())
8132 return true;
8133 if (V1->getNumUses() > V2->getNumUses())
8134 return false;
8135 auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
8136 auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
8137 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
8138 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
8139 FirstUserOfPhi2->getParent());
8140 auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1);
8141 auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2);
8142 auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1);
8143 auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2);
8144 if (IE1 && !IE2)
8145 return true;
8146 if (!IE1 && IE2)
8147 return false;
8148 if (IE1 && IE2) {
8149 if (UserBVHead[I1] && !UserBVHead[I2])
8150 return true;
8151 if (!UserBVHead[I1])
8152 return false;
8153 if (UserBVHead[I1] == UserBVHead[I2])
8154 return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
8155 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
8156 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
8157 UserBVHead[I2]->getParent());
8158 return UserBVHead[I1]->comesBefore(Other: UserBVHead[I2]);
8159 }
8160 if (EE1 && !EE2)
8161 return true;
8162 if (!EE1 && EE2)
8163 return false;
8164 if (EE1 && EE2) {
8165 auto *Inst1 = dyn_cast<Instruction>(Val: EE1->getOperand(i_nocapture: 0));
8166 auto *Inst2 = dyn_cast<Instruction>(Val: EE2->getOperand(i_nocapture: 0));
8167 auto *P1 = dyn_cast<Argument>(Val: EE1->getOperand(i_nocapture: 0));
8168 auto *P2 = dyn_cast<Argument>(Val: EE2->getOperand(i_nocapture: 0));
8169 if (!Inst2 && !P2)
8170 return Inst1 || P1;
8171 if (EE1->getOperand(i_nocapture: 0) == EE2->getOperand(i_nocapture: 0))
8172 return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
8173 if (!Inst1 && Inst2)
8174 return false;
8175 if (Inst1 && Inst2) {
8176 if (Inst1->getParent() != Inst2->getParent())
8177 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
8178 return Inst1->comesBefore(Other: Inst2);
8179 }
8180 if (!P1 && P2)
8181 return false;
8182 assert(P1 && P2 &&
8183 "Expected either instructions or arguments vector operands.");
8184 return P1->getArgNo() < P2->getArgNo();
8185 }
8186 return false;
8187 };
8188 OrdersType Phis(TE.Scalars.size());
8189 std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
8190 stable_sort(Range&: Phis, C: PHICompare);
8191 if (isIdentityOrder(Order: Phis))
8192 return std::nullopt; // No need to reorder.
8193 return std::move(Phis);
8194 }
8195 if (TE.isGather() &&
8196 (!TE.hasState() || !TE.isAltShuffle() ||
8197 ScalarsInSplitNodes.contains(Val: TE.getMainOp())) &&
8198 allSameType(VL: TE.Scalars)) {
8199 // TODO: add analysis of other gather nodes with extractelement
8200 // instructions and other values/instructions, not only undefs.
8201 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
8202 (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
8203 any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
8204 all_of(Range: TE.Scalars, P: [](Value *V) {
8205 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
8206 return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
8207 })) {
8208 // Check that gather of extractelements can be represented as
8209 // just a shuffle of a single vector.
8210 OrdersType CurrentOrder;
8211 bool Reuse =
8212 canReuseExtract(VL: TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
8213 if (Reuse || !CurrentOrder.empty())
8214 return std::move(CurrentOrder);
8215 }
8216 // If the gather node is <undef, v, .., poison> and
8217 // insertelement poison, v, 0 [+ permute]
8218 // is cheaper than
8219 // insertelement poison, v, n - try to reorder.
8220     // If rotating the whole graph, exclude the permute cost, since the whole
8221     // graph might be transformed.
8222 int Sz = TE.Scalars.size();
8223 if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
8224 count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
8225 const auto *It = find_if_not(Range: TE.Scalars, P: isConstant);
8226 if (It == TE.Scalars.begin())
8227 return OrdersType();
8228 auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
8229 if (It != TE.Scalars.end()) {
8230 OrdersType Order(Sz, Sz);
8231 unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
8232 Order[Idx] = 0;
8233 fixupOrderingIndices(Order);
8234 SmallVector<int> Mask;
8235 inversePermutation(Indices: Order, Mask);
8236 InstructionCost PermuteCost =
8237 TopToBottom
8238 ? 0
8239 : ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
8240 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
8241 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
8242 Op0: PoisonValue::get(T: Ty), Op1: *It);
8243 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
8244 Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
8245 Op0: PoisonValue::get(T: Ty), Op1: *It);
8246 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
8247 OrdersType Order(Sz, Sz);
8248 Order[Idx] = 0;
8249 return std::move(Order);
8250 }
8251 }
8252 }
8253 if (isSplat(VL: TE.Scalars))
8254 return std::nullopt;
8255 if (TE.Scalars.size() >= 3)
8256 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
8257 return Order;
    // Check if we can include the order of vectorized loads. For masked
    // gathers do extra analysis later, so include such nodes into a special
    // list.
8260 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
8261 SmallVector<Value *> PointerOps;
8262 StridedPtrInfo SPtrInfo;
8263 OrdersType CurrentOrder;
8264 LoadsState Res = canVectorizeLoads(VL: TE.Scalars, VL0: TE.Scalars.front(),
8265 Order&: CurrentOrder, PointerOps, SPtrInfo);
8266 if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
8267 Res == LoadsState::CompressVectorize)
8268 return std::move(CurrentOrder);
8269 }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been audited for correctness with non-power-of-two vectors.
8272 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
8273 if (std::optional<OrdersType> CurrentOrder =
8274 findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
8275 return CurrentOrder;
8276 }
8277 return std::nullopt;
8278}
8279
8280/// Checks if the given mask is a "clustered" mask with the same clusters of
8281/// size \p Sz, which are not identity submasks.
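/// For example, with \p Sz = 2 the mask <1, 0, 1, 0> is a repeated
/// non-identity cluster (returns true), while <0, 1, 0, 1> is rejected because
/// its first cluster is the identity submask, and <1, 0, 0, 1> is rejected
/// because the clusters differ.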
8282static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
8283 unsigned Sz) {
8284 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
8285 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
8286 return false;
8287 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8288 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
8289 if (Cluster != FirstCluster)
8290 return false;
8291 }
8292 return true;
8293}
8294
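// Illustrative example: for a gather node with 2 scalars whose reuse mask
// (after applying \p Mask) is the clustered <1, 0, 1, 0>, the two scalars are
// swapped and the reuse mask is rewritten to the identity clusters
// <0, 1, 0, 1>, assuming the node has no pre-existing ReorderIndices.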
8295void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
8296 // Reorder reuses mask.
8297 reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
8298 const unsigned Sz = TE.Scalars.size();
  // For vectorized nodes and non-clustered reuses there is no need to do
  // anything else.
8300 if (!TE.isGather() ||
8301 !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
8302 VF: Sz) ||
8303 !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
8304 return;
8305 SmallVector<int> NewMask;
8306 inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
8307 addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
8308 // Clear reorder since it is going to be applied to the new mask.
8309 TE.ReorderIndices.clear();
8310 // Try to improve gathered nodes with clustered reuses, if possible.
8311 ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
8312 SmallVector<unsigned> NewOrder(Slice);
8313 inversePermutation(Indices: NewOrder, Mask&: NewMask);
8314 reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
8315 // Fill the reuses mask with the identity submasks.
8316 for (auto *It = TE.ReuseShuffleIndices.begin(),
8317 *End = TE.ReuseShuffleIndices.end();
8318 It != End; std::advance(i&: It, n: Sz))
8319 std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
8320}
8321
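/// Merges \p SecondaryOrder into the unset (== Sz) slots of \p Order, skipping
/// indices already used in \p Order. E.g., for Order = <2, 4, 4, 1> and
/// SecondaryOrder = <2, 0, 3, 1> the result is <2, 0, 3, 1>; with an empty
/// SecondaryOrder, Order = <0, 4, 4, 3> is filled up to the identity
/// <0, 1, 2, 3>.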
8322static void combineOrders(MutableArrayRef<unsigned> Order,
8323 ArrayRef<unsigned> SecondaryOrder) {
8324 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8325 "Expected same size of orders");
8326 size_t Sz = Order.size();
8327 SmallBitVector UsedIndices(Sz);
8328 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
8329 if (Order[Idx] != Sz)
8330 UsedIndices.set(Order[Idx]);
8331 }
8332 if (SecondaryOrder.empty()) {
8333 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8334 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8335 Order[Idx] = Idx;
8336 } else {
8337 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8338 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8339 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
8340 Order[Idx] = SecondaryOrder[Idx];
8341 }
8342}
8343
8344bool BoUpSLP::isProfitableToReorder() const {
8345 if (DisableTreeReorder)
8346 return false;
8347
8348 constexpr unsigned TinyVF = 2;
8349 constexpr unsigned TinyTree = 10;
8350 constexpr unsigned PhiOpsLimit = 12;
8351 constexpr unsigned GatherLoadsLimit = 2;
8352 if (VectorizableTree.size() <= TinyTree)
8353 return true;
8354 if (VectorizableTree.front()->hasState() &&
8355 !VectorizableTree.front()->isGather() &&
8356 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
8357 VectorizableTree.front()->getOpcode() == Instruction::PHI ||
8358 (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
8359 (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
8360 VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
8361 VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only a single store and a single (unordered) load
    // node, while the other nodes are phis or geps/binops combined with phis,
    // and/or a single gather load node.
8365 if (VectorizableTree.front()->hasState() &&
8366 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
8367 VectorizableTree.front()->Scalars.size() == TinyVF &&
8368 VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
8369 return false;
    // Only a single node requires reordering - skip.
8371 if (VectorizableTree.front()->hasState() &&
8372 VectorizableTree.front()->getOpcode() == Instruction::Store &&
8373 VectorizableTree.front()->ReorderIndices.empty()) {
8374 const unsigned ReorderedSplitsCnt =
8375 count_if(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
8376 return TE->State == TreeEntry::SplitVectorize &&
8377 !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
8378 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8379 ::isCommutative(I: TE->UserTreeIndex.UserTE->getMainOp());
8380 });
8381 if (ReorderedSplitsCnt <= 1 &&
8382 static_cast<unsigned>(count_if(
8383 Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
8384 return ((!TE->isGather() &&
8385 (TE->ReorderIndices.empty() ||
8386 (TE->UserTreeIndex.UserTE &&
8387 TE->UserTreeIndex.UserTE->State ==
8388 TreeEntry::Vectorize &&
8389 !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
8390 .empty()))) ||
8391 (TE->isGather() && TE->ReorderIndices.empty() &&
8392 (!TE->hasState() || TE->isAltShuffle() ||
8393 TE->getOpcode() == Instruction::Load ||
8394 TE->getOpcode() == Instruction::ZExt ||
8395 TE->getOpcode() == Instruction::SExt))) &&
8396 (VectorizableTree.front()->getVectorFactor() > TinyVF ||
8397 !TE->isGather() || none_of(Range&: TE->Scalars, P: [&](Value *V) {
8398 return !isConstant(V) && isVectorized(V);
8399 }));
8400 })) >= VectorizableTree.size() - ReorderedSplitsCnt)
8401 return false;
8402 }
8403 bool HasPhis = false;
8404 bool HasLoad = true;
8405 unsigned GatherLoads = 0;
8406 for (const std::unique_ptr<TreeEntry> &TE :
8407 ArrayRef(VectorizableTree).drop_front()) {
8408 if (TE->State == TreeEntry::SplitVectorize)
8409 continue;
8410 if (!TE->hasState()) {
8411 if (all_of(Range&: TE->Scalars, P: IsaPred<Constant, PHINode>) ||
8412 all_of(Range&: TE->Scalars, P: IsaPred<BinaryOperator, PHINode>))
8413 continue;
8414 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8415 any_of(Range&: TE->Scalars, P: IsaPred<PHINode, GEPOperator>))
8416 continue;
8417 return true;
8418 }
8419 if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
8420 if (!TE->isGather()) {
8421 HasLoad = false;
8422 continue;
8423 }
8424 if (HasLoad)
8425 return true;
8426 ++GatherLoads;
8427 if (GatherLoads >= GatherLoadsLimit)
8428 return true;
8429 }
8430 if (TE->getOpcode() == Instruction::GetElementPtr ||
8431 Instruction::isBinaryOp(Opcode: TE->getOpcode()))
8432 continue;
8433 if (TE->getOpcode() != Instruction::PHI &&
8434 (!TE->hasCopyableElements() ||
8435 static_cast<unsigned>(count_if(Range&: TE->Scalars, P: IsaPred<PHINode>)) <
8436 TE->Scalars.size() / 2))
8437 return true;
8438 if (VectorizableTree.front()->Scalars.size() == TinyVF &&
8439 TE->getNumOperands() > PhiOpsLimit)
8440 return false;
8441 HasPhis = true;
8442 }
8443 return !HasPhis;
8444 }
8445 return true;
8446}
8447
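// Example (illustrative): for a split node with vector factor 8 whose second
// combined entry starts at offset 4, reordering operand 1 with Mask =
// <1, 0, 3, 2> leaves lanes 0-3 untouched and produces the combined mask
// <0, 1, 2, 3, 5, 4, 7, 6> that is applied to the node's scalars below.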
8448void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8449 ArrayRef<int> MaskOrder) {
8450 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8451 SmallVector<int> NewMask(getVectorFactor());
8452 SmallVector<int> NewMaskOrder(getVectorFactor());
8453 std::iota(first: NewMask.begin(), last: NewMask.end(), value: 0);
8454 std::iota(first: NewMaskOrder.begin(), last: NewMaskOrder.end(), value: 0);
8455 if (Idx == 0) {
8456 copy(Range&: Mask, Out: NewMask.begin());
8457 copy(Range&: MaskOrder, Out: NewMaskOrder.begin());
8458 } else {
8459 assert(Idx == 1 && "Expected either 0 or 1 index.");
8460 unsigned Offset = CombinedEntriesWithIndices.back().second;
8461 for (unsigned I : seq<unsigned>(Size: Mask.size())) {
8462 NewMask[I + Offset] = Mask[I] + Offset;
8463 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8464 }
8465 }
8466 reorderScalars(Scalars, Mask: NewMask);
8467 reorderOrder(Order&: ReorderIndices, Mask: NewMaskOrder, /*BottomOrder=*/true);
8468 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(Order: ReorderIndices))
8469 ReorderIndices.clear();
8470}
8471
8472void BoUpSLP::reorderTopToBottom() {
8473 // Maps VF to the graph nodes.
8474 DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
8475 // ExtractElement gather nodes which can be vectorized and need to handle
8476 // their ordering.
8477 DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
8478
8479 // Phi nodes can have preferred ordering based on their result users
8480 DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
8481
8482 // AltShuffles can also have a preferred ordering that leads to fewer
8483 // instructions, e.g., the addsub instruction in x86.
8484 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
8485
8486 // Maps a TreeEntry to the reorder indices of external users.
8487 DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
8488 ExternalUserReorderMap;
8489 // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores, loads, extracts + some gathering
  // of extracts.
8492 for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
8493 const std::unique_ptr<TreeEntry> &TE) {
8494 // Look for external users that will probably be vectorized.
8495 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
8496 findExternalStoreUsersReorderIndices(TE: TE.get());
8497 if (!ExternalUserReorderIndices.empty()) {
8498 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
8499 ExternalUserReorderMap.try_emplace(Key: TE.get(),
8500 Args: std::move(ExternalUserReorderIndices));
8501 }
8502
8503 // Patterns like [fadd,fsub] can be combined into a single instruction in
8504 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
8505 // to take into account their order when looking for the most used order.
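    // E.g., an alternating [fadd, fsub, fadd, fsub] node can be lowered to a
    // single addsub only in its original lane order; reordering the lanes can
    // break the even/odd alternation the target expects.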
8506 if (TE->hasState() && TE->isAltShuffle() &&
8507 TE->State != TreeEntry::SplitVectorize) {
8508 Type *ScalarTy = TE->Scalars[0]->getType();
8509 VectorType *VecTy = getWidenedType(ScalarTy, VF: TE->Scalars.size());
8510 unsigned Opcode0 = TE->getOpcode();
8511 unsigned Opcode1 = TE->getAltOpcode();
8512 SmallBitVector OpcodeMask(
8513 getAltInstrMask(VL: TE->Scalars, ScalarTy, Opcode0, Opcode1));
8514 // If this pattern is supported by the target then we consider the order.
8515 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
8516 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
8517 AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
8518 }
8519 // TODO: Check the reverse order too.
8520 }
8521
8522 bool IgnoreReorder =
8523 !UserIgnoreList && VectorizableTree.front()->hasState() &&
8524 (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
8525 VectorizableTree.front()->getOpcode() == Instruction::Store);
8526 if (std::optional<OrdersType> CurrentOrder =
8527 getReorderingData(TE: *TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode
      // vectorization; it is better to reorder them during the bottom-to-top
      // stage. If we follow the order here, it causes reordering of the whole
      // graph, though actually it is profitable just to reorder the subgraph
      // that starts from the alternate opcode vectorization node. Such nodes
      // already end up with a shuffle instruction, and it is enough to change
      // this shuffle rather than rotate the scalars for the whole graph.
8535 unsigned Cnt = 0;
8536 const TreeEntry *UserTE = TE.get();
8537 while (UserTE && Cnt < RecursionMaxDepth) {
8538 if (!UserTE->UserTreeIndex)
8539 break;
8540 if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
8541 UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
8542 UserTE->UserTreeIndex.UserTE->Idx != 0)
8543 return;
8544 UserTE = UserTE->UserTreeIndex.UserTE;
8545 ++Cnt;
8546 }
8547 VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
8548 if (!(TE->State == TreeEntry::Vectorize ||
8549 TE->State == TreeEntry::StridedVectorize ||
8550 TE->State == TreeEntry::SplitVectorize ||
8551 TE->State == TreeEntry::CompressVectorize) ||
8552 !TE->ReuseShuffleIndices.empty())
8553 GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
8554 if (TE->State == TreeEntry::Vectorize &&
8555 TE->getOpcode() == Instruction::PHI)
8556 PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
8557 }
8558 });
8559
8560 // Reorder the graph nodes according to their vectorization factor.
8561 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
8562 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
8563 auto It = VFToOrderedEntries.find(Val: VF);
8564 if (It == VFToOrderedEntries.end())
8565 continue;
    // Try to find the most profitable order. We are just looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // most used order.
8569 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
8570 // Delete VF entry upon exit.
8571 llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(I: It); });
8572
8573 // All operands are reordered and used only in this node - propagate the
8574 // most used order to the user node.
8575 MapVector<OrdersType, unsigned,
8576 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8577 OrdersUses;
8578 for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder these nodes; we still need to extend and to use a
      // shuffle, just need to merge the reordering shuffle and the reuse
      // shuffle.
8581 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE) &&
8582 OpTE->State != TreeEntry::SplitVectorize)
8583 continue;
8584 // Count number of orders uses.
8585 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
8586 &PhisToOrders]() -> const OrdersType & {
8587 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
8588 auto It = GathersToOrders.find(Val: OpTE);
8589 if (It != GathersToOrders.end())
8590 return It->second;
8591 }
8592 if (OpTE->hasState() && OpTE->isAltShuffle()) {
8593 auto It = AltShufflesToOrders.find(Val: OpTE);
8594 if (It != AltShufflesToOrders.end())
8595 return It->second;
8596 }
8597 if (OpTE->State == TreeEntry::Vectorize &&
8598 OpTE->getOpcode() == Instruction::PHI) {
8599 auto It = PhisToOrders.find(Val: OpTE);
8600 if (It != PhisToOrders.end())
8601 return It->second;
8602 }
8603 return OpTE->ReorderIndices;
8604 }();
8605 // First consider the order of the external scalar users.
8606 auto It = ExternalUserReorderMap.find(Val: OpTE);
8607 if (It != ExternalUserReorderMap.end()) {
8608 const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars, use the natural
        // order; it is an attempt to reorder a node with reused scalars but
        // with external uses.
8612 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
8613 OrdersUses.try_emplace(Key: OrdersType(), Args: 0).first->second +=
8614 ExternalUserReorderIndices.size();
8615 } else {
8616 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
8617 ++OrdersUses.try_emplace(Key: ExtOrder, Args: 0).first->second;
8618 }
8619 // No other useful reorder data in this entry.
8620 if (Order.empty())
8621 continue;
8622 }
8623 // Stores actually store the mask, not the order, need to invert.
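      // E.g., a stored order (mask) of <2, 0, 1, 3> is inverted here to the
      // element order <1, 2, 0, 3> before it is counted below.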
8624 if (OpTE->State == TreeEntry::Vectorize &&
8625 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
8626 assert(!OpTE->isAltShuffle() &&
8627 "Alternate instructions are only supported by BinaryOperator "
8628 "and CastInst.");
8629 SmallVector<int> Mask;
8630 inversePermutation(Indices: Order, Mask);
8631 unsigned E = Order.size();
8632 OrdersType CurrentOrder(E, E);
8633 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
8634 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
8635 });
8636 fixupOrderingIndices(Order: CurrentOrder);
8637 ++OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second;
8638 } else {
8639 ++OrdersUses.try_emplace(Key: Order, Args: 0).first->second;
8640 }
8641 }
8642 if (OrdersUses.empty())
8643 continue;
8644 // Choose the most used order.
8645 unsigned IdentityCnt = 0;
8646 unsigned FilledIdentityCnt = 0;
8647 OrdersType IdentityOrder(VF, VF);
8648 for (auto &Pair : OrdersUses) {
8649 if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
8650 if (!Pair.first.empty())
8651 FilledIdentityCnt += Pair.second;
8652 IdentityCnt += Pair.second;
8653 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
8654 }
8655 }
8656 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
8657 unsigned Cnt = IdentityCnt;
8658 for (auto &Pair : OrdersUses) {
      // Prefer the identity order. But if a filled identity order (non-empty)
      // was found with the same number of uses as the new candidate order, we
      // can choose this candidate order.
8662 if (Cnt < Pair.second ||
8663 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
8664 Cnt == Pair.second && !BestOrder.empty() &&
8665 isIdentityOrder(Order: BestOrder))) {
8666 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
8667 BestOrder = Pair.first;
8668 Cnt = Pair.second;
8669 } else {
8670 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
8671 }
8672 }
8673 // Set order of the user node.
8674 if (isIdentityOrder(Order: BestOrder))
8675 continue;
8676 fixupOrderingIndices(Order: BestOrder);
8677 SmallVector<int> Mask;
8678 inversePermutation(Indices: BestOrder, Mask);
8679 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
8680 unsigned E = BestOrder.size();
8681 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
8682 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8683 });
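    // E.g., BestOrder = <2, 0, 1, 3> yields Mask = <1, 2, 0, 3> (its inverse)
    // and MaskOrder = <2, 0, 1, 3>, with unset entries turned into poison.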
8684 // Do an actual reordering, if profitable.
8685 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8686 // Just do the reordering for the nodes with the given VF.
8687 if (TE->Scalars.size() != VF) {
8688 if (TE->ReuseShuffleIndices.size() == VF) {
8689 assert(TE->State != TreeEntry::SplitVectorize &&
8690 "Split vectorized not expected.");
8691 // Need to reorder the reuses masks of the operands with smaller VF to
8692 // be able to find the match between the graph nodes and scalar
8693 // operands of the given node during vectorization/cost estimation.
8694 assert(
8695 (!TE->UserTreeIndex ||
8696 TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
8697 TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
8698 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
8699 "All users must be of VF size.");
8700 if (SLPReVec) {
8701 assert(SLPReVec && "Only supported by REVEC.");
8702 // ShuffleVectorInst does not do reorderOperands (and it should not
8703 // because ShuffleVectorInst supports only a limited set of
8704 // patterns). Only do reorderNodeWithReuses if the user is not
8705 // ShuffleVectorInst.
8706 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
8707 isa<ShuffleVectorInst>(Val: TE->UserTreeIndex.UserTE->getMainOp()))
8708 continue;
8709 }
8710 // Update ordering of the operands with the smaller VF than the given
8711 // one.
8712 reorderNodeWithReuses(TE&: *TE, Mask);
8713 // Update orders in user split vectorize nodes.
8714 if (TE->UserTreeIndex &&
8715 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8716 TE->UserTreeIndex.UserTE->reorderSplitNode(
8717 Idx: TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
8718 }
8719 continue;
8720 }
8721 if ((TE->State == TreeEntry::SplitVectorize &&
8722 TE->ReuseShuffleIndices.empty()) ||
8723 ((TE->State == TreeEntry::Vectorize ||
8724 TE->State == TreeEntry::StridedVectorize ||
8725 TE->State == TreeEntry::CompressVectorize) &&
8726 (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
8727 InsertElementInst>(Val: TE->getMainOp()) ||
8728 (SLPReVec && isa<ShuffleVectorInst>(Val: TE->getMainOp()))))) {
8729 assert(
8730 (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
8731 TE->ReuseShuffleIndices.empty())) &&
8732 "Alternate instructions are only supported by BinaryOperator "
8733 "and CastInst.");
8734 // Build correct orders for extract{element,value}, loads,
8735 // stores and alternate (split) nodes.
8736 reorderOrder(Order&: TE->ReorderIndices, Mask);
8737 if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
8738 TE->reorderOperands(Mask);
8739 } else {
8740 // Reorder the node and its operands.
8741 TE->reorderOperands(Mask);
8742 assert(TE->ReorderIndices.empty() &&
8743 "Expected empty reorder sequence.");
8744 reorderScalars(Scalars&: TE->Scalars, Mask);
8745 }
8746 if (!TE->ReuseShuffleIndices.empty()) {
8747 // Apply reversed order to keep the original ordering of the reused
8748 // elements to avoid extra reorder indices shuffling.
8749 OrdersType CurrentOrder;
8750 reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
8751 SmallVector<int> NewReuses;
8752 inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
8753 addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
8754 TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
8755 } else if (TE->UserTreeIndex &&
8756 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
8757 // Update orders in user split vectorize nodes.
8758 TE->UserTreeIndex.UserTE->reorderSplitNode(Idx: TE->UserTreeIndex.EdgeIdx,
8759 Mask, MaskOrder);
8760 }
8761 }
8762}
8763
8764void BoUpSLP::buildReorderableOperands(
8765 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8766 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8767 SmallVectorImpl<TreeEntry *> &GatherOps) {
8768 for (unsigned I : seq<unsigned>(Size: UserTE->getNumOperands())) {
8769 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8770 return OpData.first == I &&
8771 (OpData.second->State == TreeEntry::Vectorize ||
8772 OpData.second->State == TreeEntry::StridedVectorize ||
8773 OpData.second->State == TreeEntry::CompressVectorize ||
8774 OpData.second->State == TreeEntry::SplitVectorize);
8775 }))
8776 continue;
    // Do not request operands if they do not exist.
8778 if (UserTE->hasState()) {
8779 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8780 UserTE->getOpcode() == Instruction::ExtractValue)
8781 continue;
8782 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8783 continue;
8784 if (UserTE->getOpcode() == Instruction::Store &&
8785 UserTE->State == TreeEntry::Vectorize && I == 1)
8786 continue;
8787 if (UserTE->getOpcode() == Instruction::Load &&
8788 (UserTE->State == TreeEntry::Vectorize ||
8789 UserTE->State == TreeEntry::StridedVectorize ||
8790 UserTE->State == TreeEntry::CompressVectorize))
8791 continue;
8792 }
8793 TreeEntry *TE = getOperandEntry(E: UserTE, Idx: I);
8794 assert(TE && "Expected operand entry.");
8795 if (!TE->isGather()) {
8796 // Add the node to the list of the ordered nodes with the identity
8797 // order.
8798 Edges.emplace_back(Args&: I, Args&: TE);
8799 // Add ScatterVectorize nodes to the list of operands, where just
8800 // reordering of the scalars is required. Similar to the gathers, so
8801 // simply add to the list of gathered ops.
8802 // If there are reused scalars, process this node as a regular vectorize
8803 // node, just reorder reuses mask.
8804 if (TE->State == TreeEntry::ScatterVectorize &&
8805 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8806 GatherOps.push_back(Elt: TE);
8807 continue;
8808 }
8809 if (ReorderableGathers.contains(Ptr: TE))
8810 GatherOps.push_back(Elt: TE);
8811 }
8812}
8813
8814void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
8815 struct TreeEntryCompare {
8816 bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
8817 if (LHS->UserTreeIndex && RHS->UserTreeIndex)
8818 return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
8819 return LHS->Idx < RHS->Idx;
8820 }
8821 };
8822 PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
8823 DenseSet<const TreeEntry *> GathersToOrders;
8824 // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads and extracts without alternate
  // operands + some gathering of extracts.
8827 SmallPtrSet<const TreeEntry *, 4> NonVectorized;
8828 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8829 if (TE->State != TreeEntry::Vectorize &&
8830 TE->State != TreeEntry::StridedVectorize &&
8831 TE->State != TreeEntry::CompressVectorize &&
8832 TE->State != TreeEntry::SplitVectorize)
8833 NonVectorized.insert(Ptr: TE.get());
8834 if (std::optional<OrdersType> CurrentOrder =
8835 getReorderingData(TE: *TE, /*TopToBottom=*/false, IgnoreReorder)) {
8836 Queue.push(x: TE.get());
8837 if (!(TE->State == TreeEntry::Vectorize ||
8838 TE->State == TreeEntry::StridedVectorize ||
8839 TE->State == TreeEntry::CompressVectorize ||
8840 TE->State == TreeEntry::SplitVectorize) ||
8841 !TE->ReuseShuffleIndices.empty())
8842 GathersToOrders.insert(V: TE.get());
8843 }
8844 }
8845
  // 1. Propagate order to the graph nodes that use only reordered nodes.
  // I.e., if the node has operands that are reordered, try to keep at least
  // one operand in the natural order and reorder the others + reorder the
  // user node itself.
8850 SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
8851 while (!Queue.empty()) {
8852 // 1. Filter out only reordered nodes.
8853 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
8854 TreeEntry *TE = Queue.top();
8855 const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
8856 Queue.pop();
8857 SmallVector<TreeEntry *> OrderedOps(1, TE);
8858 while (!Queue.empty()) {
8859 TE = Queue.top();
8860 if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
8861 break;
8862 Queue.pop();
8863 OrderedOps.push_back(Elt: TE);
8864 }
8865 for (TreeEntry *TE : OrderedOps) {
8866 if (!(TE->State == TreeEntry::Vectorize ||
8867 TE->State == TreeEntry::StridedVectorize ||
8868 TE->State == TreeEntry::CompressVectorize ||
8869 TE->State == TreeEntry::SplitVectorize ||
8870 (TE->isGather() && GathersToOrders.contains(V: TE))) ||
8871 !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
8872 !Visited.insert(Ptr: TE).second)
8873 continue;
      // Build a map between user nodes and their operand order to speed up
      // the search. The graph currently does not provide this dependency
      // directly.
8876 Users.first = TE->UserTreeIndex.UserTE;
8877 Users.second.emplace_back(Args&: TE->UserTreeIndex.EdgeIdx, Args&: TE);
8878 }
8879 if (Users.first) {
8880 auto &Data = Users;
8881 if (Data.first->State == TreeEntry::SplitVectorize) {
8882 assert(
8883 Data.second.size() <= 2 &&
8884 "Expected not greater than 2 operands for split vectorize node.");
8885 if (any_of(Range&: Data.second,
8886 P: [](const auto &Op) { return !Op.second->UserTreeIndex; }))
8887 continue;
8888 // Update orders in user split vectorize nodes.
8889 assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
8890 "Expected exactly 2 entries.");
8891 for (const auto &P : Data.first->CombinedEntriesWithIndices) {
8892 TreeEntry &OpTE = *VectorizableTree[P.first];
8893 OrdersType Order = OpTE.ReorderIndices;
8894 if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
8895 if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
8896 continue;
8897 const auto BestOrder =
8898 getReorderingData(TE: OpTE, /*TopToBottom=*/false, IgnoreReorder);
8899 if (!BestOrder || BestOrder->empty() || isIdentityOrder(Order: *BestOrder))
8900 continue;
8901 Order = *BestOrder;
8902 }
8903 fixupOrderingIndices(Order);
8904 SmallVector<int> Mask;
8905 inversePermutation(Indices: Order, Mask);
8906 const unsigned E = Order.size();
8907 SmallVector<int> MaskOrder(E, PoisonMaskElem);
8908 transform(Range&: Order, d_first: MaskOrder.begin(), F: [E](unsigned I) {
8909 return I < E ? static_cast<int>(I) : PoisonMaskElem;
8910 });
8911 Data.first->reorderSplitNode(Idx: P.second ? 1 : 0, Mask, MaskOrder);
8912 // Clear ordering of the operand.
8913 if (!OpTE.ReorderIndices.empty()) {
8914 OpTE.ReorderIndices.clear();
8915 } else if (!OpTE.ReuseShuffleIndices.empty()) {
8916 reorderReuses(Reuses&: OpTE.ReuseShuffleIndices, Mask);
8917 } else {
8918 assert(OpTE.isGather() && "Expected only gather/buildvector node.");
8919 reorderScalars(Scalars&: OpTE.Scalars, Mask);
8920 }
8921 }
8922 if (Data.first->ReuseShuffleIndices.empty() &&
8923 !Data.first->ReorderIndices.empty()) {
8924 // Insert user node to the list to try to sink reordering deeper in
8925 // the graph.
8926 Queue.push(x: Data.first);
8927 }
8928 continue;
8929 }
8930 // Check that operands are used only in the User node.
8931 SmallVector<TreeEntry *> GatherOps;
8932 buildReorderableOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
8933 GatherOps);
8934 // All operands are reordered and used only in this node - propagate the
8935 // most used order to the user node.
8936 MapVector<OrdersType, unsigned,
8937 DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
8938 OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though it might not be
      // profitable.
8942 SmallPtrSet<const TreeEntry *, 4> VisitedOps;
8943 SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
8944 for (const auto &Op : Data.second) {
8945 TreeEntry *OpTE = Op.second;
8946 if (!VisitedOps.insert(Ptr: OpTE).second)
8947 continue;
8948 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
8949 continue;
8950 const auto Order = [&]() -> const OrdersType {
8951 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
8952 return getReorderingData(TE: *OpTE, /*TopToBottom=*/false,
8953 IgnoreReorder)
8954 .value_or(u: OrdersType(1));
8955 return OpTE->ReorderIndices;
8956 }();
8957 // The order is partially ordered, skip it in favor of fully non-ordered
8958 // orders.
8959 if (Order.size() == 1)
8960 continue;
8961
        // Check that the reordering does not increase the number of shuffles,
        // i.e. same-values nodes have the same parents or their parents have
        // the same parents.
8964 if (!Order.empty() && !isIdentityOrder(Order)) {
8965 Value *Root = OpTE->hasState()
8966 ? OpTE->getMainOp()
8967 : *find_if_not(Range&: OpTE->Scalars, P: isConstant);
8968 auto GetSameNodesUsers = [&](Value *Root) {
8969 SmallSetVector<TreeEntry *, 4> Res;
8970 for (const TreeEntry *TE : ValueToGatherNodes.lookup(Val: Root)) {
8971 if (TE != OpTE && TE->UserTreeIndex &&
8972 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8973 TE->Scalars.size() == OpTE->Scalars.size() &&
8974 ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
8975 (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
8976 Res.insert(X: TE->UserTreeIndex.UserTE);
8977 }
8978 for (const TreeEntry *TE : getTreeEntries(V: Root)) {
8979 if (TE != OpTE && TE->UserTreeIndex &&
8980 TE->getVectorFactor() == OpTE->getVectorFactor() &&
8981 TE->Scalars.size() == OpTE->Scalars.size() &&
8982 ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
8983 (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
8984 Res.insert(X: TE->UserTreeIndex.UserTE);
8985 }
8986 return Res.takeVector();
8987 };
8988 auto GetNumOperands = [](const TreeEntry *TE) {
8989 if (TE->State == TreeEntry::SplitVectorize)
8990 return TE->getNumOperands();
8991 if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
8992 return CI->arg_size();
8993 return TE->getNumOperands();
8994 };
8995 auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
8996 const TreeEntry *TE) {
8997 Intrinsic::ID ID = Intrinsic::not_intrinsic;
8998 if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
8999 ID = getVectorIntrinsicIDForCall(CI, TLI);
9000 for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(TE))) {
9001 if (ID != Intrinsic::not_intrinsic &&
9002 isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
9003 continue;
9004 const TreeEntry *Op = getOperandEntry(E: TE, Idx);
9005 if (Op->isGather() && Op->hasState()) {
9006 const TreeEntry *VecOp =
9007 getSameValuesTreeEntry(V: Op->getMainOp(), VL: Op->Scalars);
9008 if (VecOp)
9009 Op = VecOp;
9010 }
9011 if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
9012 return false;
9013 }
9014 return true;
9015 };
9016 SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
9017 if (!Users.empty() && !all_of(Range&: Users, P: [&](TreeEntry *UTE) {
9018 if (!RevisitedOps.insert(Ptr: UTE).second)
9019 return false;
9020 return UTE == Data.first || !UTE->ReorderIndices.empty() ||
9021 !UTE->ReuseShuffleIndices.empty() ||
9022 (UTE->UserTreeIndex &&
9023 UTE->UserTreeIndex.UserTE == Data.first) ||
9024 (Data.first->UserTreeIndex &&
9025 Data.first->UserTreeIndex.UserTE == UTE) ||
9026 (IgnoreReorder && UTE->UserTreeIndex &&
9027 UTE->UserTreeIndex.UserTE->Idx == 0) ||
9028 NodeShouldBeReorderedWithOperands(UTE);
9029 }))
9030 continue;
9031 for (TreeEntry *UTE : Users) {
9032 Intrinsic::ID ID = Intrinsic::not_intrinsic;
9033 if (auto *CI = dyn_cast<CallInst>(Val: UTE->getMainOp()); CI)
9034 ID = getVectorIntrinsicIDForCall(CI, TLI);
9035 for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(UTE))) {
9036 if (ID != Intrinsic::not_intrinsic &&
9037 isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
9038 continue;
9039 const TreeEntry *Op = getOperandEntry(E: UTE, Idx);
9040 Visited.erase(Ptr: Op);
9041 Queue.push(x: const_cast<TreeEntry *>(Op));
9042 }
9043 }
9044 }
9045 unsigned NumOps = count_if(
9046 Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
9047 return P.second == OpTE;
9048 });
9049 // Stores actually store the mask, not the order, need to invert.
9050 if (OpTE->State == TreeEntry::Vectorize &&
9051 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
9052 assert(!OpTE->isAltShuffle() &&
9053 "Alternate instructions are only supported by BinaryOperator "
9054 "and CastInst.");
9055 SmallVector<int> Mask;
9056 inversePermutation(Indices: Order, Mask);
9057 unsigned E = Order.size();
9058 OrdersType CurrentOrder(E, E);
9059 transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
9060 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
9061 });
9062 fixupOrderingIndices(Order: CurrentOrder);
9063 OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second += NumOps;
9064 } else {
9065 OrdersUses.try_emplace(Key: Order, Args: 0).first->second += NumOps;
9066 }
9067 auto Res = OrdersUses.try_emplace(Key: OrdersType(), Args: 0);
9068 const auto AllowsReordering = [&](const TreeEntry *TE) {
9069 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
9070 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
9071 (IgnoreReorder && TE->Idx == 0))
9072 return true;
9073 if (TE->isGather()) {
9074 if (GathersToOrders.contains(V: TE))
9075 return !getReorderingData(TE: *TE, /*TopToBottom=*/false,
9076 IgnoreReorder)
9077 .value_or(u: OrdersType(1))
9078 .empty();
9079 return true;
9080 }
9081 return false;
9082 };
9083 if (OpTE->UserTreeIndex) {
9084 TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
9085 if (!VisitedUsers.insert(Ptr: UserTE).second)
9086 continue;
9087 // May reorder user node if it requires reordering, has reused
9088 // scalars, is an alternate op vectorize node or its op nodes require
9089 // reordering.
9090 if (AllowsReordering(UserTE))
9091 continue;
9092 // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid an increase
          // in compile time.
          // It is profitable to reorder if definitely more operands allow
          // reordering than require the natural order.
9097 ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
9098 if (static_cast<unsigned>(count_if(
9099 Range&: Ops, P: [UserTE, &AllowsReordering](
9100 const std::pair<unsigned, TreeEntry *> &Op) {
9101 return AllowsReordering(Op.second) &&
9102 Op.second->UserTreeIndex.UserTE == UserTE;
9103 })) <= Ops.size() / 2)
9104 ++Res.first->second;
9105 }
9106 }
9107 if (OrdersUses.empty()) {
9108 Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
9109 continue;
9110 }
9111 // Choose the most used order.
9112 unsigned IdentityCnt = 0;
9113 unsigned VF = Data.second.front().second->getVectorFactor();
9114 OrdersType IdentityOrder(VF, VF);
9115 for (auto &Pair : OrdersUses) {
9116 if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
9117 IdentityCnt += Pair.second;
9118 combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
9119 }
9120 }
9121 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
9122 unsigned Cnt = IdentityCnt;
9123 for (auto &Pair : OrdersUses) {
        // Prefer the identity order. But if a filled identity order
        // (non-empty) was found with the same number of uses as the new
        // candidate order, we can choose this candidate order.
9127 if (Cnt < Pair.second) {
9128 combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
9129 BestOrder = Pair.first;
9130 Cnt = Pair.second;
9131 } else {
9132 combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
9133 }
9134 }
9135 // Set order of the user node.
9136 if (isIdentityOrder(Order: BestOrder)) {
9137 Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
9138 continue;
9139 }
9140 fixupOrderingIndices(Order: BestOrder);
9141 // Erase operands from OrderedEntries list and adjust their orders.
9142 VisitedOps.clear();
9143 SmallVector<int> Mask;
9144 inversePermutation(Indices: BestOrder, Mask);
9145 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
9146 unsigned E = BestOrder.size();
9147 transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
9148 return I < E ? static_cast<int>(I) : PoisonMaskElem;
9149 });
9150 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
9151 TreeEntry *TE = Op.second;
9152 if (!VisitedOps.insert(Ptr: TE).second)
9153 continue;
9154 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
9155 reorderNodeWithReuses(TE&: *TE, Mask);
9156 continue;
9157 }
9158 // Gathers are processed separately.
9159 if (TE->State != TreeEntry::Vectorize &&
9160 TE->State != TreeEntry::StridedVectorize &&
9161 TE->State != TreeEntry::CompressVectorize &&
9162 TE->State != TreeEntry::SplitVectorize &&
9163 (TE->State != TreeEntry::ScatterVectorize ||
9164 TE->ReorderIndices.empty()))
9165 continue;
9166 assert((BestOrder.size() == TE->ReorderIndices.size() ||
9167 TE->ReorderIndices.empty()) &&
9168 "Non-matching sizes of user/operand entries.");
9169 reorderOrder(Order&: TE->ReorderIndices, Mask);
9170 if (IgnoreReorder && TE == VectorizableTree.front().get())
9171 IgnoreReorder = false;
9172 }
9173 // For gathers just need to reorder its scalars.
9174 for (TreeEntry *Gather : GatherOps) {
9175 assert(Gather->ReorderIndices.empty() &&
9176 "Unexpected reordering of gathers.");
9177 if (!Gather->ReuseShuffleIndices.empty()) {
9178 // Just reorder reuses indices.
9179 reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
9180 continue;
9181 }
9182 reorderScalars(Scalars&: Gather->Scalars, Mask);
9183 Visited.insert(Ptr: Gather);
9184 }
9185 // Reorder operands of the user node and set the ordering for the user
9186 // node itself.
9187 auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
9188 return TE.isAltShuffle() &&
9189 (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
9190 TE.ReorderIndices.empty());
9191 };
9192 if (Data.first->State != TreeEntry::Vectorize ||
9193 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
9194 Val: Data.first->getMainOp()) ||
9195 IsNotProfitableAltCodeNode(*Data.first))
9196 Data.first->reorderOperands(Mask);
9197 if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
9198 IsNotProfitableAltCodeNode(*Data.first) ||
9199 Data.first->State == TreeEntry::StridedVectorize ||
9200 Data.first->State == TreeEntry::CompressVectorize) {
9201 reorderScalars(Scalars&: Data.first->Scalars, Mask);
9202 reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
9203 /*BottomOrder=*/true);
9204 if (Data.first->ReuseShuffleIndices.empty() &&
9205 !Data.first->ReorderIndices.empty() &&
9206 !IsNotProfitableAltCodeNode(*Data.first)) {
9207 // Insert user node to the list to try to sink reordering deeper in
9208 // the graph.
9209 Queue.push(x: Data.first);
9210 }
9211 } else {
9212 reorderOrder(Order&: Data.first->ReorderIndices, Mask);
9213 }
9214 }
9215 }
9216 // If the reordering is unnecessary, just remove the reorder.
9217 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
9218 VectorizableTree.front()->ReuseShuffleIndices.empty())
9219 VectorizableTree.front()->ReorderIndices.clear();
9220}
9221
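// For a reversed strided load/store (e.g., ReorderIndices = <3, 2, 1, 0>) the
// root instruction is the scalar at ReorderIndices.front(), i.e. Scalars[3];
// otherwise it is the first scalar of the entry.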
9222Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9223 if (Entry.hasState() &&
9224 (Entry.getOpcode() == Instruction::Store ||
9225 Entry.getOpcode() == Instruction::Load) &&
9226 Entry.State == TreeEntry::StridedVectorize &&
9227 !Entry.ReorderIndices.empty() && isReverseOrder(Order: Entry.ReorderIndices))
9228 return dyn_cast<Instruction>(Val: Entry.Scalars[Entry.ReorderIndices.front()]);
9229 return dyn_cast<Instruction>(Val: Entry.Scalars.front());
9230}
9231
9232void BoUpSLP::buildExternalUses(
9233 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
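  // Heuristic: a scalar with more uses than there are vectorized scalars is
  // conservatively treated as having external users and is extracted without
  // walking its user list (see the hasNUsesOrMore check below).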
9234 const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
9235 DenseMap<Value *, unsigned> ScalarToExtUses;
9236 // Collect the values that we need to extract from the tree.
9237 for (auto &TEPtr : VectorizableTree) {
9238 TreeEntry *Entry = TEPtr.get();
9239
9240 // No need to handle users of gathered values.
9241 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
9242 DeletedNodes.contains(Ptr: Entry) ||
9243 TransformedToGatherNodes.contains(Val: Entry))
9244 continue;
9245
9246 // For each lane:
9247 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
9248 Value *Scalar = Entry->Scalars[Lane];
9249 if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
9250 continue;
9251
9252 // All uses must be replaced already? No need to do it again.
9253 auto It = ScalarToExtUses.find(Val: Scalar);
9254 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
9255 continue;
9256
9257 if (Scalar->hasNUsesOrMore(N: NumVectScalars)) {
9258 unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
9259 LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << " for many users.\n");
9261 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
9262 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
9263 ExternalUsesWithNonUsers.insert(Ptr: Scalar);
9264 continue;
9265 }
9266
9267 // Check if the scalar is externally used as an extra arg.
9268 const auto ExtI = ExternallyUsedValues.find(V: Scalar);
9269 if (ExtI != ExternallyUsedValues.end()) {
9270 unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
9271 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
9272 << FoundLane << " from " << *Scalar << ".\n");
9273 ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
9274 ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
9275 continue;
9276 }
9277 for (User *U : Scalar->users()) {
9278 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
9279
9280 Instruction *UserInst = dyn_cast<Instruction>(Val: U);
9281 if (!UserInst || isDeleted(I: UserInst))
9282 continue;
9283
9284 // Ignore users in the user ignore list.
9285 if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
9286 continue;
9287
9288 // Skip in-tree scalars that become vectors
9289 if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(V: U);
9290 any_of(Range&: UseEntries, P: [this](const TreeEntry *UseEntry) {
9291 return !DeletedNodes.contains(Ptr: UseEntry) &&
9292 !TransformedToGatherNodes.contains(Val: UseEntry);
9293 })) {
9294 // Some in-tree scalars will remain as scalar in vectorized
9295 // instructions. If that is the case, the one in FoundLane will
9296 // be used.
9297 if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
9298 isa<LoadInst, StoreInst>(Val: UserInst)) ||
9299 isa<CallInst>(Val: UserInst)) ||
9300 all_of(Range&: UseEntries, P: [&](TreeEntry *UseEntry) {
9301 if (DeletedNodes.contains(Ptr: UseEntry) ||
9302 TransformedToGatherNodes.contains(Val: UseEntry))
9303 return true;
9304 return UseEntry->State == TreeEntry::ScatterVectorize ||
9305 !doesInTreeUserNeedToExtract(
9306 Scalar, UserInst: getRootEntryInstruction(Entry: *UseEntry), TLI,
9307 TTI);
9308 })) {
9309 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
9310 << ".\n");
9311 assert(none_of(UseEntries,
9312 [](TreeEntry *UseEntry) {
9313 return UseEntry->isGather();
9314 }) &&
9315 "Bad state");
9316 continue;
9317 }
9318 U = nullptr;
9319 if (It != ScalarToExtUses.end()) {
9320 ExternalUses[It->second].User = nullptr;
9321 break;
9322 }
9323 }
9324
9325 if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
9326 U = nullptr;
9327 unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
9328 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
9329 << " from lane " << FoundLane << " from " << *Scalar
9330 << ".\n");
9331 It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
9332 ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: *Entry, Args&: FoundLane);
9333 ExternalUsesWithNonUsers.insert(Ptr: Scalar);
9334 if (!U)
9335 break;
9336 }
9337 }
9338 }
9339}
9340
9341SmallVector<SmallVector<StoreInst *>>
9342BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
9343 SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
9344 SmallVector<StoreInst *>, 8>
9345 PtrToStoresMap;
9346 for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
9347 Value *V = TE->Scalars[Lane];
9348 // Don't iterate over the users of constant data.
9349 if (!isa<Instruction>(Val: V))
9350 continue;
9351 // To save compilation time we don't visit if we have too many users.
9352 if (V->hasNUsesOrMore(N: UsesLimit))
9353 break;
9354
9355 // Collect stores per pointer object.
9356 for (User *U : V->users()) {
9357 auto *SI = dyn_cast<StoreInst>(Val: U);
9358 // Test whether we can handle the store. V might be a global, which could
9359 // be used in a different function.
9360 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
9361 !isValidElementType(Ty: SI->getValueOperand()->getType()))
9362 continue;
      // Skip the user if it is already vectorized.
9364 if (isVectorized(V: U))
9365 continue;
9366
9367 Value *Ptr =
9368 getUnderlyingObject(V: SI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
9369 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
9370 SI->getValueOperand()->getType(), Ptr}];
9371 // For now just keep one store per pointer object per lane.
9372 // TODO: Extend this to support multiple stores per pointer per lane
9373 if (StoresVec.size() > Lane)
9374 continue;
9375 if (!StoresVec.empty()) {
9376 std::optional<int64_t> Diff = getPointersDiff(
9377 ElemTyA: SI->getValueOperand()->getType(), PtrA: SI->getPointerOperand(),
9378 ElemTyB: SI->getValueOperand()->getType(),
9379 PtrB: StoresVec.front()->getPointerOperand(), DL: *DL, SE&: *SE,
9380 /*StrictCheck=*/true);
9381 // We failed to compare the pointers so just abandon this store.
9382 if (!Diff)
9383 continue;
9384 }
9385 StoresVec.push_back(Elt: SI);
9386 }
9387 }
9388 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
9389 unsigned I = 0;
9390 for (auto &P : PtrToStoresMap) {
9391 Res[I].swap(RHS&: P.second);
9392 ++I;
9393 }
9394 return Res;
9395}
9396
9397bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9398 OrdersType &ReorderIndices) const {
  // We check whether the stores in StoresVec can form a vector by sorting
  // them and checking whether they are consecutive.
9401
9402 // To avoid calling getPointersDiff() while sorting we create a vector of
9403 // pairs {store, offset from first} and sort this instead.
9404 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9405 StoreInst *S0 = StoresVec[0];
9406 StoreOffsetVec.emplace_back(Args: 0, Args: 0);
9407 Type *S0Ty = S0->getValueOperand()->getType();
9408 Value *S0Ptr = S0->getPointerOperand();
9409 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
9410 StoreInst *SI = StoresVec[Idx];
9411 std::optional<int64_t> Diff =
9412 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
9413 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
9414 /*StrictCheck=*/true);
9415 StoreOffsetVec.emplace_back(Args&: *Diff, Args&: Idx);
9416 }
9417
9418 // Check if the stores are consecutive by checking if their difference is 1.
9419 if (StoreOffsetVec.size() != StoresVec.size())
9420 return false;
9421 sort(C&: StoreOffsetVec, Comp: llvm::less_first());
9422 unsigned Idx = 0;
9423 int64_t PrevDist = 0;
9424 for (const auto &P : StoreOffsetVec) {
9425 if (Idx > 0 && P.first != PrevDist + 1)
9426 return false;
9427 PrevDist = P.first;
9428 ++Idx;
9429 }
9430
9431 // Calculate the shuffle indices according to their offset against the sorted
9432 // StoreOffsetVec.
9433 ReorderIndices.assign(NumElts: StoresVec.size(), Elt: 0);
9434 bool IsIdentity = true;
9435 for (auto [I, P] : enumerate(First&: StoreOffsetVec)) {
9436 ReorderIndices[P.second] = I;
9437 IsIdentity &= P.second == I;
9438 }
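  // E.g., stores with offsets {S0: 0, S1: +2, S2: +1} are consecutive once
  // sorted, and the resulting ReorderIndices are <0, 2, 1> (the sorted
  // position of each original store).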
9439 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9440 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9441 // same convention here.
9442 if (IsIdentity)
9443 ReorderIndices.clear();
9444
9445 return true;
9446}
9447
9448#ifndef NDEBUG
9449LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9450 for (unsigned Idx : Order)
9451 dbgs() << Idx << ", ";
9452 dbgs() << "\n";
9453}
9454#endif
9455
9456SmallVector<BoUpSLP::OrdersType, 1>
9457BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9458 unsigned NumLanes = TE->Scalars.size();
9459
9460 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9461
9462 // Holds the reorder indices for each candidate store vector that is a user of
9463 // the current TreeEntry.
9464 SmallVector<OrdersType, 1> ExternalReorderIndices;
9465
9466 // Now inspect the stores collected per pointer and look for vectorization
9467 // candidates. For each candidate calculate the reorder index vector and push
9468 // it into `ExternalReorderIndices`
9469 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9470 // If we have fewer than NumLanes stores, then we can't form a vector.
9471 if (StoresVec.size() != NumLanes)
9472 continue;
9473
9474 // If the stores are not consecutive then abandon this StoresVec.
9475 OrdersType ReorderIndices;
9476 if (!canFormVector(StoresVec, ReorderIndices))
9477 continue;
9478
9479 // We now know that the scalars in StoresVec can form a vector instruction,
9480 // so set the reorder indices.
9481 ExternalReorderIndices.push_back(Elt: ReorderIndices);
9482 }
9483 return ExternalReorderIndices;
9484}
9485
9486void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
9487 const SmallDenseSet<Value *> &UserIgnoreLst) {
9488 deleteTree();
9489 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9490 "TreeEntryToStridedPtrInfoMap is not cleared");
9491 UserIgnoreList = &UserIgnoreLst;
9492 if (!allSameType(VL: Roots))
9493 return;
9494 buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
9495}
9496
9497void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
9498 deleteTree();
9499 assert(TreeEntryToStridedPtrInfoMap.empty() &&
9500 "TreeEntryToStridedPtrInfoMap is not cleared");
9501 if (!allSameType(VL: Roots))
9502 return;
9503 buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
9504}
9505
/// Tries to find a subvector of loads and builds a new vector of only loads
/// if it can be profitable.
9508static void gatherPossiblyVectorizableLoads(
9509 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9510 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9511 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9512 bool AddNew = true) {
9513 if (VL.empty())
9514 return;
9515 Type *ScalarTy = getValueType(V: VL.front());
9516 if (!isValidElementType(Ty: ScalarTy))
9517 return;
9518 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9519 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9520 for (Value *V : VL) {
9521 auto *LI = dyn_cast<LoadInst>(Val: V);
9522 if (!LI)
9523 continue;
9524 if (R.isDeleted(I: LI) || R.isVectorized(V: LI) || !LI->isSimple())
9525 continue;
9526 bool IsFound = false;
9527 for (auto [Map, Data] : zip(t&: ClusteredDistToLoad, u&: ClusteredLoads)) {
9528 assert(LI->getParent() == Data.front().first->getParent() &&
9529 LI->getType() == Data.front().first->getType() &&
9530 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9531 getUnderlyingObject(Data.front().first->getPointerOperand(),
9532 RecursionMaxDepth) &&
9533 "Expected loads with the same type, same parent and same "
9534 "underlying pointer.");
9535 std::optional<int64_t> Dist = getPointersDiff(
9536 ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), ElemTyB: Data.front().first->getType(),
9537 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9538 /*StrictCheck=*/true);
9539 if (!Dist)
9540 continue;
9541 auto It = Map.find(Val: *Dist);
9542 if (It != Map.end() && It->second != LI)
9543 continue;
9544 if (It == Map.end()) {
9545 Data.emplace_back(Args&: LI, Args&: *Dist);
9546 Map.try_emplace(Key: *Dist, Args&: LI);
9547 }
9548 IsFound = true;
9549 break;
9550 }
9551 if (!IsFound) {
9552 ClusteredLoads.emplace_back().emplace_back(Args&: LI, Args: 0);
9553 ClusteredDistToLoad.emplace_back().try_emplace(Key: 0, Args&: LI);
9554 }
9555 }
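  // At this point loads are clustered by (parent block, type, underlying
  // object) and keyed by their constant pointer distance to the first load of
  // the cluster; e.g., i32 loads at p, p+4 and p+64 end up in one cluster
  // with distances {0, 1, 16}.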
9556 auto FindMatchingLoads =
9557 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9558 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9559 &GatheredLoads,
9560 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9561 int64_t &Offset, unsigned &Start) {
9562 if (Loads.empty())
9563 return GatheredLoads.end();
9564 LoadInst *LI = Loads.front().first;
9565 for (auto [Idx, Data] : enumerate(First&: GatheredLoads)) {
9566 if (Idx < Start)
9567 continue;
9568 ToAdd.clear();
9569 if (LI->getParent() != Data.front().first->getParent() ||
9570 LI->getType() != Data.front().first->getType())
9571 continue;
9572 std::optional<int64_t> Dist =
9573 getPointersDiff(ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(),
9574 ElemTyB: Data.front().first->getType(),
9575 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9576 /*StrictCheck=*/true);
9577 if (!Dist)
9578 continue;
9579 SmallSet<int64_t, 4> DataDists;
9580 SmallPtrSet<LoadInst *, 4> DataLoads;
9581 for (std::pair<LoadInst *, int64_t> P : Data) {
9582 DataDists.insert(V: P.second);
9583 DataLoads.insert(Ptr: P.first);
9584 }
9585 // Found matching gathered loads - check if all loads are unique or
9586 // can be effectively vectorized.
9587 unsigned NumUniques = 0;
9588 for (auto [Cnt, Pair] : enumerate(First&: Loads)) {
9589 bool Used = DataLoads.contains(Ptr: Pair.first);
9590 if (!Used && !DataDists.contains(V: *Dist + Pair.second)) {
9591 ++NumUniques;
9592 ToAdd.insert(X: Cnt);
9593 } else if (Used) {
9594 Repeated.insert(X: Cnt);
9595 }
9596 }
9597 if (NumUniques > 0 &&
9598 (Loads.size() == NumUniques ||
9599 (Loads.size() - NumUniques >= 2 &&
9600 Loads.size() - NumUniques >= Loads.size() / 2 &&
9601 (has_single_bit(Value: Data.size() + NumUniques) ||
9602 bit_ceil(Value: Data.size()) <
9603 bit_ceil(Value: Data.size() + NumUniques))))) {
9604 Offset = *Dist;
9605 Start = Idx + 1;
9606 return std::next(x: GatheredLoads.begin(), n: Idx);
9607 }
9608 }
9609 ToAdd.clear();
9610 return GatheredLoads.end();
9611 };
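  // Merge each cluster into the matching gathered-loads groups. Loads that do
  // not fit into any existing group either form a new group or, if AddNew is
  // false, are appended to the groups with the same parent block and type.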
9612 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9613 unsigned Start = 0;
9614 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9615 int64_t Offset = 0;
9616 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9617 Offset, Start);
9618 while (It != GatheredLoads.end()) {
9619 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9620 for (unsigned Idx : LocalToAdd)
9621 It->emplace_back(Args: Data[Idx].first, Args: Data[Idx].second + Offset);
9622 ToAdd.insert_range(R&: LocalToAdd);
9623 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9624 Start);
9625 }
9626 if (any_of(Range: seq<unsigned>(Size: Data.size()), P: [&](unsigned Idx) {
9627 return !ToAdd.contains(key: Idx) && !Repeated.contains(key: Idx);
9628 })) {
9629 auto AddNewLoads =
9630 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9631 for (unsigned Idx : seq<unsigned>(Size: Data.size())) {
9632 if (ToAdd.contains(key: Idx) || Repeated.contains(key: Idx))
9633 continue;
9634 Loads.push_back(Elt: Data[Idx]);
9635 }
9636 };
9637 if (!AddNew) {
9638 LoadInst *LI = Data.front().first;
9639 It = find_if(
9640 Range&: GatheredLoads, P: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9641 return PD.front().first->getParent() == LI->getParent() &&
9642 PD.front().first->getType() == LI->getType();
9643 });
9644 while (It != GatheredLoads.end()) {
9645 AddNewLoads(*It);
9646 It = std::find_if(
9647 first: std::next(x: It), last: GatheredLoads.end(),
9648 pred: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9649 return PD.front().first->getParent() == LI->getParent() &&
9650 PD.front().first->getType() == LI->getType();
9651 });
        }
        continue;
      }
      AddNewLoads(GatheredLoads.emplace_back());
9656 }
9657 }
9658}
9659
9660void BoUpSLP::tryToVectorizeGatheredLoads(
9661 const SmallMapVector<
9662 std::tuple<BasicBlock *, Value *, Type *>,
9663 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9664 &GatheredLoads) {
9665 GatheredLoadsEntriesFirst = VectorizableTree.size();
9666
9667 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9668 LoadEntriesToVectorize.size());
9669 for (auto [Idx, Set] : zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize))
9670 Set.insert_range(R&: VectorizableTree[Idx]->Scalars);
9671
  // Sort loads by their distance from the base load, in decreasing order.
9673 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9674 const std::pair<LoadInst *, int64_t> &L2) {
9675 return L1.second > L2.second;
9676 };
9677
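  // Returns true if the target legally supports (and does not force
  // scalarization of) a masked gather for a vector of the given loads, using
  // their common alignment.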
9678 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9679 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9680 Loads.size());
9681 Align Alignment = computeCommonAlignment<LoadInst>(VL: Values);
9682 auto *Ty = getWidenedType(ScalarTy: Loads.front()->getType(), VF: Loads.size());
9683 return TTI->isLegalMaskedGather(DataType: Ty, Alignment) &&
9684 !TTI->forceScalarizeMaskedGather(Type: Ty, Alignment);
9685 };
9686
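  // Tries to vectorize consecutive slices of \p Loads with decreasing
  // candidate vector factors (starting from \p MaxVF), recording successfully
  // vectorized slices in Results/VectorizedLoads and collecting the remaining
  // loads in \p NonVectorized.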
9687 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9688 BoUpSLP::ValueSet &VectorizedLoads,
9689 SmallVectorImpl<LoadInst *> &NonVectorized,
9690 bool Final, unsigned MaxVF) {
9691 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9692 unsigned StartIdx = 0;
9693 SmallVector<int> CandidateVFs;
9694 if (VectorizeNonPowerOf2 && has_single_bit(Value: MaxVF + 1))
9695 CandidateVFs.push_back(Elt: MaxVF);
9696 for (int NumElts = getFloorFullVectorNumberOfElements(
9697 TTI: *TTI, Ty: Loads.front()->getType(), Sz: MaxVF);
9698 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9699 TTI: *TTI, Ty: Loads.front()->getType(), Sz: NumElts - 1)) {
9700 CandidateVFs.push_back(Elt: NumElts);
9701 if (VectorizeNonPowerOf2 && NumElts > 2)
9702 CandidateVFs.push_back(Elt: NumElts - 1);
9703 }
9704
9705 if (Final && CandidateVFs.empty())
9706 return Results;
9707
9708 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9709 for (unsigned NumElts : CandidateVFs) {
9710 if (Final && NumElts > BestVF)
9711 continue;
9712 SmallVector<unsigned> MaskedGatherVectorized;
9713 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9714 ++Cnt) {
9715 ArrayRef<LoadInst *> Slice =
9716 ArrayRef(Loads).slice(N: Cnt, M: std::min(a: NumElts, b: E - Cnt));
9717 if (VectorizedLoads.count(Ptr: Slice.front()) ||
9718 VectorizedLoads.count(Ptr: Slice.back()) ||
9719 areKnownNonVectorizableLoads(VL: Slice))
9720 continue;
        // Check if it is profitable to try vectorizing gathered loads. It is
        // profitable if we have at least 3 consecutive loads or if we have
        // fewer, but all their users are vectorized or deleted.
9724 bool AllowToVectorize = false;
9725 // Check if it is profitable to vectorize 2-elements loads.
9726 if (NumElts == 2) {
9727 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9728 ElementTy: Slice.front()->getType(), NumElements: ElementCount::getFixed(MinVal: NumElts));
9729 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9730 for (LoadInst *LI : Slice) {
9731 // If single use/user - allow to vectorize.
9732 if (LI->hasOneUse())
9733 continue;
9734 // 1. Check if number of uses equals number of users.
9735 // 2. All users are deleted.
9736 // 3. The load broadcasts are not allowed or the load is not
9737 // broadcasted.
9738 if (static_cast<unsigned int>(std::distance(
9739 first: LI->user_begin(), last: LI->user_end())) != LI->getNumUses())
9740 return false;
9741 if (!IsLegalBroadcastLoad)
9742 continue;
9743 if (LI->hasNUsesOrMore(N: UsesLimit))
9744 return false;
9745 for (User *U : LI->users()) {
9746 if (auto *UI = dyn_cast<Instruction>(Val: U); UI && isDeleted(I: UI))
9747 continue;
9748 for (const TreeEntry *UTE : getTreeEntries(V: U)) {
9749 for (int I : seq<int>(Size: UTE->getNumOperands())) {
9750 if (all_of(Range: UTE->getOperand(OpIdx: I), P: [LI](Value *V) {
9751 return V == LI || isa<PoisonValue>(Val: V);
9752 }))
9753 // Found legal broadcast - do not vectorize.
9754 return false;
9755 }
9756 }
9757 }
9758 }
9759 return true;
9760 };
9761 AllowToVectorize = CheckIfAllowed(Slice);
9762 } else {
9763 AllowToVectorize =
9764 (NumElts >= 3 ||
9765 any_of(Range&: ValueToGatherNodes.at(Val: Slice.front()),
9766 P: [=](const TreeEntry *TE) {
9767 return TE->Scalars.size() == 2 &&
9768 ((TE->Scalars.front() == Slice.front() &&
9769 TE->Scalars.back() == Slice.back()) ||
9770 (TE->Scalars.front() == Slice.back() &&
9771 TE->Scalars.back() == Slice.front()));
9772 })) &&
9773 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(),
9774 Sz: Slice.size());
9775 }
9776 if (AllowToVectorize) {
9777 SmallVector<Value *> PointerOps;
9778 OrdersType CurrentOrder;
9779 // Try to build vector load.
9780 ArrayRef<Value *> Values(
9781 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9782 StridedPtrInfo SPtrInfo;
9783 LoadsState LS = canVectorizeLoads(VL: Values, VL0: Slice.front(), Order&: CurrentOrder,
9784 PointerOps, SPtrInfo, BestVF: &BestVF);
9785 if (LS != LoadsState::Gather ||
9786 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9787 if (LS == LoadsState::ScatterVectorize) {
9788 if (MaskedGatherVectorized.empty() ||
9789 Cnt >= MaskedGatherVectorized.back() + NumElts)
9790 MaskedGatherVectorized.push_back(Elt: Cnt);
9791 continue;
9792 }
9793 if (LS != LoadsState::Gather) {
9794 Results.emplace_back(Args&: Values, Args&: LS);
9795 VectorizedLoads.insert_range(R&: Slice);
9796 // If we vectorized initial block, no need to try to vectorize it
9797 // again.
9798 if (Cnt == StartIdx)
9799 StartIdx += NumElts;
9800 }
9801 // Check if the whole array was vectorized already - exit.
9802 if (StartIdx >= Loads.size())
9803 break;
9804 // Erase last masked gather candidate, if another candidate within
9805 // the range is found to be better.
9806 if (!MaskedGatherVectorized.empty() &&
9807 Cnt < MaskedGatherVectorized.back() + NumElts)
9808 MaskedGatherVectorized.pop_back();
9809 Cnt += NumElts - 1;
9810 continue;
9811 }
9812 }
9813 if (!AllowToVectorize || BestVF == 0)
9814 registerNonVectorizableLoads(VL: Slice);
9815 }
9816 // Mark masked gathers candidates as vectorized, if any.
9817 for (unsigned Cnt : MaskedGatherVectorized) {
9818 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9819 N: Cnt, M: std::min<unsigned>(a: NumElts, b: Loads.size() - Cnt));
9820 ArrayRef<Value *> Values(
9821 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9822 Results.emplace_back(Args&: Values, Args: LoadsState::ScatterVectorize);
9823 VectorizedLoads.insert_range(R&: Slice);
9824 // If we vectorized initial block, no need to try to vectorize it again.
9825 if (Cnt == StartIdx)
9826 StartIdx += NumElts;
9827 }
9828 }
9829 for (LoadInst *LI : Loads) {
9830 if (!VectorizedLoads.contains(Ptr: LI))
9831 NonVectorized.push_back(Elt: LI);
9832 }
9833 return Results;
9834 };
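  // Processes each cluster of gathered loads: sorts the loads by decreasing
  // pointer distance, drops already vectorized ones and then tries to build
  // vectorized (or masked-gather) ranges out of the rest. Returns the loads
  // that could not be vectorized.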
9835 auto ProcessGatheredLoads =
9836 [&, &TTI = *TTI](
9837 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9838 bool Final = false) {
9839 SmallVector<LoadInst *> NonVectorized;
9840 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9841 GatheredLoads) {
9842 if (LoadsDists.size() <= 1) {
9843 NonVectorized.push_back(Elt: LoadsDists.back().first);
9844 continue;
9845 }
9846 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9847 LoadsDists);
9848 SmallVector<LoadInst *> OriginalLoads(make_first_range(c&: LoadsDists));
9849 stable_sort(Range&: LocalLoadsDists, C: LoadSorter);
9850 SmallVector<LoadInst *> Loads;
9851 unsigned MaxConsecutiveDistance = 0;
9852 unsigned CurrentConsecutiveDist = 1;
9853 int64_t LastDist = LocalLoadsDists.front().second;
9854 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9855 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9856 if (isVectorized(V: L.first))
9857 continue;
9858 assert(LastDist >= L.second &&
9859 "Expected first distance always not less than second");
9860 if (static_cast<uint64_t>(LastDist - L.second) ==
9861 CurrentConsecutiveDist) {
9862 ++CurrentConsecutiveDist;
9863 MaxConsecutiveDistance =
9864 std::max(a: MaxConsecutiveDistance, b: CurrentConsecutiveDist);
9865 Loads.push_back(Elt: L.first);
9866 continue;
9867 }
9868 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9869 !Loads.empty())
9870 Loads.pop_back();
9871 CurrentConsecutiveDist = 1;
9872 LastDist = L.second;
9873 Loads.push_back(Elt: L.first);
9874 }
9875 if (Loads.size() <= 1)
9876 continue;
9877 if (AllowMaskedGather)
9878 MaxConsecutiveDistance = Loads.size();
9879 else if (MaxConsecutiveDistance < 2)
9880 continue;
9881 BoUpSLP::ValueSet VectorizedLoads;
9882 SmallVector<LoadInst *> SortedNonVectorized;
9883 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9884 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9885 Final, MaxConsecutiveDistance);
9886 if (!Results.empty() && !SortedNonVectorized.empty() &&
9887 OriginalLoads.size() == Loads.size() &&
9888 MaxConsecutiveDistance == Loads.size() &&
9889 all_of(Range&: Results,
9890 P: [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9891 return P.second == LoadsState::ScatterVectorize;
9892 })) {
9893 VectorizedLoads.clear();
9894 SmallVector<LoadInst *> UnsortedNonVectorized;
9895 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9896 UnsortedResults =
9897 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9898 UnsortedNonVectorized, Final,
9899 OriginalLoads.size());
9900 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9901 SortedNonVectorized.swap(RHS&: UnsortedNonVectorized);
9902 Results.swap(RHS&: UnsortedResults);
9903 }
9904 }
9905 for (auto [Slice, _] : Results) {
9906 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9907 << Slice.size() << ")\n");
9908 if (any_of(Range&: Slice, P: [&](Value *V) { return isVectorized(V); })) {
9909 for (Value *L : Slice)
9910 if (!isVectorized(V: L))
9911 SortedNonVectorized.push_back(Elt: cast<LoadInst>(Val: L));
9912 continue;
9913 }
9914
            // Select the maximum VF as the maximum of the user gathered nodes'
            // sizes and the distance between scalar loads in these nodes.
9917 unsigned MaxVF = Slice.size();
9918 unsigned UserMaxVF = 0;
9919 unsigned InterleaveFactor = 0;
9920 if (MaxVF == 2) {
9921 UserMaxVF = MaxVF;
9922 } else {
              // Find the distance between segments of the interleaved loads.
9924 std::optional<unsigned> InterleavedLoadsDistance = 0;
9925 unsigned Order = 0;
9926 std::optional<unsigned> CommonVF = 0;
9927 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9928 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9929 for (auto [Idx, V] : enumerate(First&: Slice)) {
9930 for (const TreeEntry *E : ValueToGatherNodes.at(Val: V)) {
9931 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: E->Scalars.size());
9932 unsigned Pos =
9933 EntryToPosition.try_emplace(Key: E, Args&: Idx).first->second;
9934 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: Idx - Pos + 1);
9935 if (CommonVF) {
9936 if (*CommonVF == 0) {
9937 CommonVF = E->Scalars.size();
9938 continue;
9939 }
9940 if (*CommonVF != E->Scalars.size())
9941 CommonVF.reset();
9942 }
                  // Check if the load is part of an interleaved load.
9944 if (Pos != Idx && InterleavedLoadsDistance) {
9945 if (!DeinterleavedNodes.contains(Ptr: E) &&
9946 any_of(Range: E->Scalars, P: [&, Slice = Slice](Value *V) {
9947 if (isa<Constant>(Val: V))
9948 return false;
9949 if (isVectorized(V))
9950 return true;
9951 const auto &Nodes = ValueToGatherNodes.at(Val: V);
9952 return (Nodes.size() != 1 || !Nodes.contains(key: E)) &&
9953 !is_contained(Range: Slice, Element: V);
9954 })) {
9955 InterleavedLoadsDistance.reset();
9956 continue;
9957 }
9958 DeinterleavedNodes.insert(Ptr: E);
9959 if (*InterleavedLoadsDistance == 0) {
9960 InterleavedLoadsDistance = Idx - Pos;
9961 continue;
9962 }
9963 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9964 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9965 InterleavedLoadsDistance.reset();
9966 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(u: 1);
9967 }
9968 }
9969 }
9970 DeinterleavedNodes.clear();
              // Check if the large load represents an interleaved load
              // operation.
9972 if (InterleavedLoadsDistance.value_or(u: 0) > 1 &&
9973 CommonVF.value_or(u: 0) != 0) {
9974 InterleaveFactor = bit_ceil(Value: *InterleavedLoadsDistance);
9975 unsigned VF = *CommonVF;
9976 OrdersType Order;
9977 SmallVector<Value *> PointerOps;
9978 StridedPtrInfo SPtrInfo;
9979 // Segmented load detected - vectorize at maximum vector factor.
9980 if (InterleaveFactor <= Slice.size() &&
9981 TTI.isLegalInterleavedAccessType(
9982 VTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF),
9983 Factor: InterleaveFactor,
9984 Alignment: cast<LoadInst>(Val: Slice.front())->getAlign(),
9985 AddrSpace: cast<LoadInst>(Val: Slice.front())
9986 ->getPointerAddressSpace()) &&
9987 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
9988 SPtrInfo) == LoadsState::Vectorize) {
9989 UserMaxVF = InterleaveFactor * VF;
9990 } else {
9991 InterleaveFactor = 0;
9992 }
9993 }
9994 // Cannot represent the loads as consecutive vectorizable nodes -
9995 // just exit.
9996 unsigned ConsecutiveNodesSize = 0;
9997 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
9998 any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
9999 P: [&, Slice = Slice](const auto &P) {
10000 const auto *It = find_if(Slice, [&](Value *V) {
10001 return std::get<1>(P).contains(V);
10002 });
10003 if (It == Slice.end())
10004 return false;
10005 const TreeEntry &TE =
10006 *VectorizableTree[std::get<0>(P)];
10007 ArrayRef<Value *> VL = TE.Scalars;
10008 OrdersType Order;
10009 SmallVector<Value *> PointerOps;
10010 StridedPtrInfo SPtrInfo;
10011 LoadsState State = canVectorizeLoads(
10012 VL, VL0: VL.front(), Order, PointerOps, SPtrInfo);
10013 if (State == LoadsState::ScatterVectorize ||
10014 State == LoadsState::CompressVectorize)
10015 return false;
10016 ConsecutiveNodesSize += VL.size();
10017 size_t Start = std::distance(Slice.begin(), It);
10018 size_t Sz = Slice.size() - Start;
10019 return Sz < VL.size() ||
10020 Slice.slice(N: Start, M: VL.size()) != VL;
10021 }))
10022 continue;
10023 // Try to build long masked gather loads.
10024 UserMaxVF = bit_ceil(Value: UserMaxVF);
10025 if (InterleaveFactor == 0 &&
10026 any_of(Range: seq<unsigned>(Size: Slice.size() / UserMaxVF),
10027 P: [&, Slice = Slice](unsigned Idx) {
10028 OrdersType Order;
10029 SmallVector<Value *> PointerOps;
10030 StridedPtrInfo SPtrInfo;
10031 return canVectorizeLoads(
10032 VL: Slice.slice(N: Idx * UserMaxVF, M: UserMaxVF),
10033 VL0: Slice[Idx * UserMaxVF], Order, PointerOps,
10034 SPtrInfo) == LoadsState::ScatterVectorize;
10035 }))
10036 UserMaxVF = MaxVF;
10037 if (Slice.size() != ConsecutiveNodesSize)
10038 MaxVF = std::min<unsigned>(a: MaxVF, b: UserMaxVF);
10039 }
10040 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10041 bool IsVectorized = true;
10042 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10043 ArrayRef<Value *> SubSlice =
10044 Slice.slice(N: I, M: std::min(a: VF, b: E - I));
10045 if (isVectorized(V: SubSlice.front()))
10046 continue;
                // Check if the subslice belongs to a to-be-vectorized entry
                // that is not equal to this entry.
10049 if (any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
10050 P: [&](const auto &P) {
10051 return !SubSlice.equals(
10052 RHS: VectorizableTree[std::get<0>(P)]
10053 ->Scalars) &&
10054 set_is_subset(SubSlice, std::get<1>(P));
10055 }))
10056 continue;
10057 unsigned Sz = VectorizableTree.size();
10058 buildTreeRec(Roots: SubSlice, Depth: 0, EI: EdgeInfo(), InterleaveFactor);
10059 if (Sz == VectorizableTree.size()) {
10060 IsVectorized = false;
10061 // Try non-interleaved vectorization with smaller vector
10062 // factor.
10063 if (InterleaveFactor > 0) {
10064 VF = 2 * (MaxVF / InterleaveFactor);
10065 InterleaveFactor = 0;
10066 }
10067 continue;
10068 }
10069 }
10070 if (IsVectorized)
10071 break;
10072 }
10073 }
10074 NonVectorized.append(RHS: SortedNonVectorized);
10075 }
10076 return NonVectorized;
10077 };
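  // Process the gathered-load clusters, grouped by basic block, underlying
  // pointer and type. If some loads remain non-vectorized and masked gathers
  // are supported, re-cluster them and make a final vectorization attempt.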
10078 for (const auto &GLs : GatheredLoads) {
10079 const auto &Ref = GLs.second;
10080 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10081 if (!Ref.empty() && !NonVectorized.empty() &&
10082 std::accumulate(
10083 first: Ref.begin(), last: Ref.end(), init: 0u,
10084 binary_op: [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10085 -> unsigned { return S + LoadsDists.size(); }) !=
10086 NonVectorized.size() &&
10087 IsMaskedGatherSupported(NonVectorized)) {
10088 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
10089 FinalGatheredLoads;
10090 for (LoadInst *LI : NonVectorized) {
        // Reinsert non-vectorized loads into another list of loads with the
        // same base pointers.
10093 gatherPossiblyVectorizableLoads(R: *this, VL: LI, DL: *DL, SE&: *SE, TTI: *TTI,
10094 GatheredLoads&: FinalGatheredLoads,
10095 /*AddNew=*/false);
10096 }
10097 // Final attempt to vectorize non-vectorized loads.
10098 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10099 }
10100 }
10101 // Try to vectorize postponed load entries, previously marked as gathered.
10102 for (unsigned Idx : LoadEntriesToVectorize) {
10103 const TreeEntry &E = *VectorizableTree[Idx];
10104 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10105 // Avoid reordering, if possible.
10106 if (!E.ReorderIndices.empty()) {
10107 // Build a mask out of the reorder indices and reorder scalars per this
10108 // mask.
10109 SmallVector<int> ReorderMask;
10110 inversePermutation(Indices: E.ReorderIndices, Mask&: ReorderMask);
10111 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
10112 }
10113 buildTreeRec(Roots: GatheredScalars, Depth: 0, EI: EdgeInfo());
10114 }
  // If no new entries were created, there are no gathered-loads entries to
  // handle.
10117 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10118 VectorizableTree.size())
10119 GatheredLoadsEntriesFirst.reset();
10120}
10121
/// Generates a key/subkey pair for the given value to provide effective
/// sorting of the values and better detection of vectorizable value sequences.
/// The keys can be used for better sorting of the values themselves, the
/// subkeys for sorting within value subgroups.
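/// For example (illustrative), two 'add' instructions of the same type from
/// the same basic block get equal keys and subkeys and therefore land in the
/// same subgroup, while an 'add' and a 'sub' share a key only when
/// \p AllowAlternate is true (both are binary operators valid for
/// alternation) and are then distinguished by their subkeys.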
10126static std::pair<size_t, size_t> generateKeySubkey(
10127 Value *V, const TargetLibraryInfo *TLI,
10128 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
10129 bool AllowAlternate) {
10130 hash_code Key = hash_value(value: V->getValueID() + 2);
10131 hash_code SubKey = hash_value(value: 0);
10132 // Sort the loads by the distance between the pointers.
10133 if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
10134 Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
10135 if (LI->isSimple())
10136 SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
10137 else
10138 Key = SubKey = hash_value(ptr: LI);
10139 } else if (isVectorLikeInstWithConstOps(V)) {
10140 // Sort extracts by the vector operands.
10141 if (isa<ExtractElementInst, UndefValue>(Val: V))
10142 Key = hash_value(value: Value::UndefValueVal + 1);
10143 if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
10144 if (!isUndefVector(V: EI->getVectorOperand()).all() &&
10145 !isa<UndefValue>(Val: EI->getIndexOperand()))
10146 SubKey = hash_value(ptr: EI->getVectorOperand());
10147 }
10148 } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
10149 // Sort other instructions just by the opcodes except for CMPInst.
10150 // For CMP also sort by the predicate kind.
10151 if ((isa<BinaryOperator, CastInst>(Val: I)) &&
10152 isValidForAlternation(Opcode: I->getOpcode())) {
10153 if (AllowAlternate)
10154 Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
10155 else
10156 Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
10157 SubKey = hash_combine(
10158 args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
10159 args: hash_value(ptr: isa<BinaryOperator>(Val: I)
10160 ? I->getType()
10161 : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
10162 // For casts, look through the only operand to improve compile time.
10163 if (isa<CastInst>(Val: I)) {
10164 std::pair<size_t, size_t> OpVals =
10165 generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
10166 /*AllowAlternate=*/true);
10167 Key = hash_combine(args: OpVals.first, args: Key);
10168 SubKey = hash_combine(args: OpVals.first, args: SubKey);
10169 }
10170 } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
10171 CmpInst::Predicate Pred = CI->getPredicate();
10172 if (CI->isCommutative())
10173 Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
10174 CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
10175 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
10176 args: hash_value(value: SwapPred),
10177 args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
10178 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
10179 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
10180 if (isTriviallyVectorizable(ID)) {
10181 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
10182 } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
10183 SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
10184 args: hash_value(ptr: Call->getCalledFunction()));
10185 } else {
10186 Key = hash_combine(args: hash_value(ptr: Call), args: Key);
10187 SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
10188 }
10189 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
10190 SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
10191 args: hash_value(ptr: Op.Tag), args: SubKey);
10192 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
10193 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
10194 SubKey = hash_value(ptr: Gep->getPointerOperand());
10195 else
10196 SubKey = hash_value(ptr: Gep);
10197 } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
10198 !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
10199 // Do not try to vectorize instructions with potentially high cost.
10200 SubKey = hash_value(ptr: I);
10201 } else {
10202 SubKey = hash_value(value: I->getOpcode());
10203 }
10204 Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
10205 }
10206 return std::make_pair(x&: Key, y&: SubKey);
10207}
10208
/// Checks if the specified instruction \p I is a main operation for the given
/// \p MainOp and \p AltOp instructions.
10211static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10212 Instruction *AltOp, const TargetLibraryInfo &TLI);
10213
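// Estimates whether vectorizing the alternate-opcode sequence \p VL is
// profitable: either the target directly supports the alt-instruction
// pattern, or the estimated number of vector instructions (including extra
// shuffles for repeated operands) does not outweigh the cost of building the
// vectors from scalars.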
10214bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
10215 ArrayRef<Value *> VL) const {
10216 Type *ScalarTy = S.getMainOp()->getType();
10217 unsigned Opcode0 = S.getOpcode();
10218 unsigned Opcode1 = S.getAltOpcode();
10219 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
10220 // If this pattern is supported by the target then consider it profitable.
10221 if (TTI->isLegalAltInstr(VecTy: getWidenedType(ScalarTy, VF: VL.size()), Opcode0,
10222 Opcode1, OpcodeMask))
10223 return true;
10224 SmallVector<ValueList> Operands;
10225 for (unsigned I : seq<unsigned>(Size: S.getMainOp()->getNumOperands())) {
10226 Operands.emplace_back();
10227 // Prepare the operand vector.
10228 for (Value *V : VL) {
10229 if (isa<PoisonValue>(Val: V)) {
10230 Operands.back().push_back(
10231 Elt: PoisonValue::get(T: S.getMainOp()->getOperand(i: I)->getType()));
10232 continue;
10233 }
10234 Operands.back().push_back(Elt: cast<Instruction>(Val: V)->getOperand(i: I));
10235 }
10236 }
10237 if (Operands.size() == 2) {
    // Try to find the best operand candidates.
10239 for (unsigned I : seq<unsigned>(Begin: 0, End: VL.size() - 1)) {
10240 SmallVector<std::pair<Value *, Value *>> Candidates(3);
10241 Candidates[0] = std::make_pair(x&: Operands[0][I], y&: Operands[0][I + 1]);
10242 Candidates[1] = std::make_pair(x&: Operands[0][I], y&: Operands[1][I + 1]);
10243 Candidates[2] = std::make_pair(x&: Operands[1][I], y&: Operands[0][I + 1]);
10244 std::optional<int> Res = findBestRootPair(Candidates);
10245 switch (Res.value_or(u: 0)) {
10246 case 0:
10247 break;
10248 case 1:
10249 std::swap(a&: Operands[0][I + 1], b&: Operands[1][I + 1]);
10250 break;
10251 case 2:
10252 std::swap(a&: Operands[0][I], b&: Operands[1][I]);
10253 break;
10254 default:
10255 llvm_unreachable("Unexpected index.");
10256 }
10257 }
10258 }
10259 DenseSet<unsigned> UniqueOpcodes;
10260 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
10261 unsigned NonInstCnt = 0;
  // Estimate the number of instructions required for the vectorized node and
  // for the buildvector node.
10264 unsigned UndefCnt = 0;
10265 // Count the number of extra shuffles, required for vector nodes.
10266 unsigned ExtraShuffleInsts = 0;
  // Check whether the operands contain the same values and thus create either
  // a perfect diamond match or a shuffled match.
10269 if (Operands.size() == 2) {
10270 // Do not count same operands twice.
10271 if (Operands.front() == Operands.back()) {
10272 Operands.erase(CI: Operands.begin());
10273 } else if (!allConstant(VL: Operands.front()) &&
10274 all_of(Range&: Operands.front(), P: [&](Value *V) {
10275 return is_contained(Range&: Operands.back(), Element: V);
10276 })) {
10277 Operands.erase(CI: Operands.begin());
10278 ++ExtraShuffleInsts;
10279 }
10280 }
10281 const Loop *L = LI->getLoopFor(BB: S.getMainOp()->getParent());
  // Vectorize the node if:
  // 1. At least a single operand is constant or a splat.
  // 2. Operands have many loop invariants (while the instructions themselves
  //    are not loop invariant).
  // 3. At least a single unique operand is supposed to be vectorized.
10287 return none_of(Range&: Operands,
10288 P: [&](ArrayRef<Value *> Op) {
10289 if (allConstant(VL: Op) ||
10290 (!isSplat(VL: Op) && allSameBlock(VL: Op) && allSameType(VL: Op) &&
10291 getSameOpcode(VL: Op, TLI: *TLI)))
10292 return false;
10293 DenseMap<Value *, unsigned> Uniques;
10294 for (Value *V : Op) {
10295 if (isa<Constant, ExtractElementInst>(Val: V) ||
10296 isVectorized(V) || (L && L->isLoopInvariant(V))) {
10297 if (isa<UndefValue>(Val: V))
10298 ++UndefCnt;
10299 continue;
10300 }
10301 auto Res = Uniques.try_emplace(Key: V, Args: 0);
10302 // Found first duplicate - need to add shuffle.
10303 if (!Res.second && Res.first->second == 1)
10304 ++ExtraShuffleInsts;
10305 ++Res.first->getSecond();
10306 if (auto *I = dyn_cast<Instruction>(Val: V))
10307 UniqueOpcodes.insert(V: I->getOpcode());
10308 else if (Res.second)
10309 ++NonInstCnt;
10310 }
10311 return none_of(Range&: Uniques, P: [&](const auto &P) {
10312 return P.first->hasNUsesOrMore(P.second + 1) &&
10313 none_of(P.first->users(), [&](User *U) {
10314 return isVectorized(V: U) || Uniques.contains(Val: U);
10315 });
10316 });
10317 }) ||
10318 // Do not vectorize node, if estimated number of vector instructions is
10319 // more than estimated number of buildvector instructions. Number of
10320 // vector operands is number of vector instructions + number of vector
10321 // instructions for operands (buildvectors). Number of buildvector
10322 // instructions is just number_of_operands * number_of_scalars.
10323 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
10324 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
10325 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
10326}
10327
/// Builds the argument types vector for the given call instruction with the
/// given \p ID for the specified vector factor.
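/// For example (assuming the target treats the llvm.powi exponent as a scalar
/// operand), a call to llvm.powi.f32.i32 with VF = 4 and MinBW = 0 yields
/// {<4 x float>, i32}: the exponent keeps its scalar type while the base is
/// widened.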
10330static SmallVector<Type *>
10331buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10332 const unsigned VF, unsigned MinBW,
10333 const TargetTransformInfo *TTI) {
10334 SmallVector<Type *> ArgTys;
10335 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
10336 if (ID != Intrinsic::not_intrinsic) {
10337 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) {
10338 ArgTys.push_back(Elt: Arg->getType());
10339 continue;
10340 }
10341 if (MinBW > 0) {
10342 ArgTys.push_back(
10343 Elt: getWidenedType(ScalarTy: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), VF));
10344 continue;
10345 }
10346 }
10347 ArgTys.push_back(Elt: getWidenedType(ScalarTy: Arg->getType(), VF));
10348 }
10349 return ArgTys;
10350}
10351
/// Calculates the costs of a vectorized intrinsic call (if possible) and a
/// vectorized library function call (if possible). Returns an invalid cost for
/// the corresponding calls if they cannot be vectorized or will be scalarized.
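/// The intrinsic cost is additionally capped: if it exceeds the library call
/// cost (when a vector library mapping exists) or an internal scalar-cost
/// limit, it is returned as invalid so that callers can fall back to the other
/// option or to scalarization.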
10355static std::pair<InstructionCost, InstructionCost>
10356getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10357 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10358 ArrayRef<Type *> ArgTys) {
10359 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
10360 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
10361 HasGlobalPred: false /*HasGlobalPred*/);
10362 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10363 auto LibCost = InstructionCost::getInvalid();
10364 if (!CI->isNoBuiltin() && VecFunc) {
10365 // Calculate the cost of the vector library call.
10366 // If the corresponding vector call is cheaper, return its cost.
10367 LibCost =
10368 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
10369 }
10370 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10371
10372 // Calculate the cost of the vector intrinsic call.
10373 FastMathFlags FMF;
10374 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
10375 FMF = FPCI->getFastMathFlags();
10376 const InstructionCost ScalarLimit = 10000;
10377 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10378 LibCost.isValid() ? LibCost : ScalarLimit);
10379 auto IntrinsicCost =
10380 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
10381 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10382 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10383 IntrinsicCost = InstructionCost::getInvalid();
10384
10385 return {IntrinsicCost, LibCost};
10386}
10387
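// Determines how the bundle \p VL with instruction state \p S should be
// handled: vectorized as a regular, compressed, scattered or strided node, or
// gathered, based on the per-opcode legality and profitability checks below.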
10388BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
10389 const InstructionsState &S, ArrayRef<Value *> VL,
10390 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
10391 SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
10392 assert(S.getMainOp() &&
10393 "Expected instructions with same/alternate opcodes only.");
10394
10395 unsigned ShuffleOrOp =
10396 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
10397 Instruction *VL0 = S.getMainOp();
10398 switch (ShuffleOrOp) {
10399 case Instruction::PHI: {
10400 // Too many operands - gather, most probably won't be vectorized.
10401 if (VL0->getNumOperands() > MaxPHINumOperands)
10402 return TreeEntry::NeedToGather;
10403 // Check for terminator values (e.g. invoke).
10404 for (Value *V : VL) {
10405 auto *PHI = dyn_cast<PHINode>(Val: V);
10406 if (!PHI)
10407 continue;
10408 for (Value *Incoming : PHI->incoming_values()) {
10409 Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
10410 if (Term && Term->isTerminator()) {
10411 LLVM_DEBUG(dbgs()
10412 << "SLP: Need to swizzle PHINodes (terminator use).\n");
10413 return TreeEntry::NeedToGather;
10414 }
10415 }
10416 }
10417
10418 return TreeEntry::Vectorize;
10419 }
10420 case Instruction::ExtractElement:
10421 if (any_of(Range&: VL, P: [&](Value *V) {
10422 auto *EI = dyn_cast<ExtractElementInst>(Val: V);
10423 if (!EI)
10424 return true;
10425 return isVectorized(V: EI->getOperand(i_nocapture: 0));
10426 }))
10427 return TreeEntry::NeedToGather;
10428 [[fallthrough]];
10429 case Instruction::ExtractValue: {
10430 bool Reuse = canReuseExtract(VL, CurrentOrder);
10431 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
10432 // non-full registers).
10433 if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: VL0->getType(), Sz: VL.size()))
10434 return TreeEntry::NeedToGather;
10435 if (Reuse || !CurrentOrder.empty())
10436 return TreeEntry::Vectorize;
10437 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
10438 return TreeEntry::NeedToGather;
10439 }
10440 case Instruction::InsertElement: {
10441 // Check that we have a buildvector and not a shuffle of 2 or more
10442 // different vectors.
10443 ValueSet SourceVectors;
10444 for (Value *V : VL) {
10445 if (isa<PoisonValue>(Val: V)) {
10446 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
10447 return TreeEntry::NeedToGather;
10448 }
10449 SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
10450 assert(getElementIndex(V) != std::nullopt &&
10451 "Non-constant or undef index?");
10452 }
10453
10454 if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
10455 return !SourceVectors.contains(Ptr: V);
10456 }) >= 2) {
10457 // Found 2nd source vector - cancel.
10458 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10459 "different source vectors.\n");
10460 return TreeEntry::NeedToGather;
10461 }
10462
10463 if (any_of(Range&: VL, P: [&SourceVectors](Value *V) {
10464 // The last InsertElement can have multiple uses.
10465 return SourceVectors.contains(Ptr: V) && !V->hasOneUse();
10466 })) {
10467 assert(SLPReVec && "Only supported by REVEC.");
10468 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
10469 "multiple uses.\n");
10470 return TreeEntry::NeedToGather;
10471 }
10472
10473 return TreeEntry::Vectorize;
10474 }
10475 case Instruction::Load: {
10476 // Check that a vectorized load would load the same memory as a scalar
10477 // load. For example, we don't want to vectorize loads that are smaller
10478 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
10479 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
10480 // from such a struct, we read/write packed bits disagreeing with the
10481 // unvectorized version.
10482 auto IsGatheredNode = [&]() {
10483 if (!GatheredLoadsEntriesFirst)
10484 return false;
10485 return all_of(Range&: VL, P: [&](Value *V) {
10486 if (isa<PoisonValue>(Val: V))
10487 return true;
10488 return any_of(Range: getTreeEntries(V), P: [&](const TreeEntry *TE) {
10489 return TE->Idx >= *GatheredLoadsEntriesFirst;
10490 });
10491 });
10492 };
10493 switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps, SPtrInfo)) {
10494 case LoadsState::Vectorize:
10495 return TreeEntry::Vectorize;
10496 case LoadsState::CompressVectorize:
10497 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10498 // Delay slow vectorized nodes for better vectorization attempts.
10499 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
10500 return TreeEntry::NeedToGather;
10501 }
10502 return IsGatheredNode() ? TreeEntry::NeedToGather
10503 : TreeEntry::CompressVectorize;
10504 case LoadsState::ScatterVectorize:
10505 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
10506 // Delay slow vectorized nodes for better vectorization attempts.
10507 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
10508 return TreeEntry::NeedToGather;
10509 }
10510 return IsGatheredNode() ? TreeEntry::NeedToGather
10511 : TreeEntry::ScatterVectorize;
10512 case LoadsState::StridedVectorize:
10513 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
10514 // Delay slow vectorized nodes for better vectorization attempts.
10515 LoadEntriesToVectorize.insert(X: VectorizableTree.size());
10516 return TreeEntry::NeedToGather;
10517 }
10518 return IsGatheredNode() ? TreeEntry::NeedToGather
10519 : TreeEntry::StridedVectorize;
10520 case LoadsState::Gather:
10521#ifndef NDEBUG
10522 Type *ScalarTy = VL0->getType();
10523 if (DL->getTypeSizeInBits(ScalarTy) !=
10524 DL->getTypeAllocSizeInBits(ScalarTy))
10525 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
10526 else if (any_of(VL, [](Value *V) {
10527 auto *LI = dyn_cast<LoadInst>(V);
10528 return !LI || !LI->isSimple();
10529 }))
10530 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
10531 else
10532 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
10533#endif // NDEBUG
10534 registerNonVectorizableLoads(VL);
10535 return TreeEntry::NeedToGather;
10536 }
10537 llvm_unreachable("Unexpected state of loads");
10538 }
10539 case Instruction::ZExt:
10540 case Instruction::SExt:
10541 case Instruction::FPToUI:
10542 case Instruction::FPToSI:
10543 case Instruction::FPExt:
10544 case Instruction::PtrToInt:
10545 case Instruction::IntToPtr:
10546 case Instruction::SIToFP:
10547 case Instruction::UIToFP:
10548 case Instruction::Trunc:
10549 case Instruction::FPTrunc:
10550 case Instruction::BitCast: {
10551 Type *SrcTy = VL0->getOperand(i: 0)->getType();
10552 for (Value *V : VL) {
10553 if (isa<PoisonValue>(Val: V))
10554 continue;
10555 Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
10556 if (Ty != SrcTy || !isValidElementType(Ty)) {
10557 LLVM_DEBUG(
10558 dbgs() << "SLP: Gathering casts with different src types.\n");
10559 return TreeEntry::NeedToGather;
10560 }
10561 }
10562 return TreeEntry::Vectorize;
10563 }
10564 case Instruction::ICmp:
10565 case Instruction::FCmp: {
10566 // Check that all of the compares have the same predicate.
10567 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
10568 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
10569 Type *ComparedTy = VL0->getOperand(i: 0)->getType();
10570 for (Value *V : VL) {
10571 if (isa<PoisonValue>(Val: V))
10572 continue;
10573 auto *Cmp = cast<CmpInst>(Val: V);
10574 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
10575 Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
10576 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
10577 return TreeEntry::NeedToGather;
10578 }
10579 }
10580 return TreeEntry::Vectorize;
10581 }
10582 case Instruction::Select:
10583 case Instruction::FNeg:
10584 case Instruction::Add:
10585 case Instruction::FAdd:
10586 case Instruction::Sub:
10587 case Instruction::FSub:
10588 case Instruction::Mul:
10589 case Instruction::FMul:
10590 case Instruction::UDiv:
10591 case Instruction::SDiv:
10592 case Instruction::FDiv:
10593 case Instruction::URem:
10594 case Instruction::SRem:
10595 case Instruction::FRem:
10596 case Instruction::Shl:
10597 case Instruction::LShr:
10598 case Instruction::AShr:
10599 case Instruction::And:
10600 case Instruction::Or:
10601 case Instruction::Xor:
10602 case Instruction::Freeze:
10603 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10604 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
10605 auto *I = dyn_cast<Instruction>(Val: V);
10606 return I && I->isBinaryOp() && !I->isFast();
10607 }))
10608 return TreeEntry::NeedToGather;
10609 return TreeEntry::Vectorize;
10610 case Instruction::GetElementPtr: {
10611 // We don't combine GEPs with complicated (nested) indexing.
10612 for (Value *V : VL) {
10613 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
10614 if (!I)
10615 continue;
10616 if (I->getNumOperands() != 2) {
10617 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
10618 return TreeEntry::NeedToGather;
10619 }
10620 }
10621
10622 // We can't combine several GEPs into one vector if they operate on
10623 // different types.
10624 Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
10625 for (Value *V : VL) {
10626 auto *GEP = dyn_cast<GEPOperator>(Val: V);
10627 if (!GEP)
10628 continue;
10629 Type *CurTy = GEP->getSourceElementType();
10630 if (Ty0 != CurTy) {
10631 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
10632 return TreeEntry::NeedToGather;
10633 }
10634 }
10635
10636 // We don't combine GEPs with non-constant indexes.
10637 Type *Ty1 = VL0->getOperand(i: 1)->getType();
10638 for (Value *V : VL) {
10639 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
10640 if (!I)
10641 continue;
10642 auto *Op = I->getOperand(i_nocapture: 1);
10643 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
10644 (Op->getType() != Ty1 &&
10645 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
10646 Op->getType()->getScalarSizeInBits() >
10647 DL->getIndexSizeInBits(
10648 AS: V->getType()->getPointerAddressSpace())))) {
10649 LLVM_DEBUG(
10650 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
10651 return TreeEntry::NeedToGather;
10652 }
10653 }
10654
10655 return TreeEntry::Vectorize;
10656 }
10657 case Instruction::Store: {
10658 // Check if the stores are consecutive or if we need to swizzle them.
10659 llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
10660 // Avoid types that are padded when being allocated as scalars, while
10661 // being packed together in a vector (such as i1).
10662 if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
10663 DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
10664 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
10665 return TreeEntry::NeedToGather;
10666 }
10667 // Make sure all stores in the bundle are simple - we can't vectorize
10668 // atomic or volatile stores.
10669 for (Value *V : VL) {
10670 auto *SI = cast<StoreInst>(Val: V);
10671 if (!SI->isSimple()) {
10672 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
10673 return TreeEntry::NeedToGather;
10674 }
10675 PointerOps.push_back(Elt: SI->getPointerOperand());
10676 }
10677
10678 // Check the order of pointer operands.
10679 if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
10680 Value *Ptr0;
10681 Value *PtrN;
10682 if (CurrentOrder.empty()) {
10683 Ptr0 = PointerOps.front();
10684 PtrN = PointerOps.back();
10685 } else {
10686 Ptr0 = PointerOps[CurrentOrder.front()];
10687 PtrN = PointerOps[CurrentOrder.back()];
10688 }
10689 std::optional<int64_t> Dist =
10690 getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
10691 // Check that the sorted pointer operands are consecutive.
10692 if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
10693 return TreeEntry::Vectorize;
10694 }
10695
10696 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
10697 return TreeEntry::NeedToGather;
10698 }
10699 case Instruction::Call: {
10700 if (S.getMainOp()->getType()->isFloatingPointTy() &&
10701 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
10702 auto *I = dyn_cast<Instruction>(Val: V);
10703 return I && !I->isFast();
10704 }))
10705 return TreeEntry::NeedToGather;
10706 // Check if the calls are all to the same vectorizable intrinsic or
10707 // library function.
10708 CallInst *CI = cast<CallInst>(Val: VL0);
10709 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10710
10711 VFShape Shape = VFShape::get(
10712 FTy: CI->getFunctionType(),
10713 EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
10714 HasGlobalPred: false /*HasGlobalPred*/);
10715 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10716
10717 if (!VecFunc && !isTriviallyVectorizable(ID)) {
10718 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
10719 return TreeEntry::NeedToGather;
10720 }
10721 Function *F = CI->getCalledFunction();
10722 unsigned NumArgs = CI->arg_size();
10723 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
10724 for (unsigned J = 0; J != NumArgs; ++J)
10725 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI))
10726 ScalarArgs[J] = CI->getArgOperand(i: J);
10727 for (Value *V : VL) {
10728 CallInst *CI2 = dyn_cast<CallInst>(Val: V);
10729 if (!CI2 || CI2->getCalledFunction() != F ||
10730 getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
10731 (VecFunc &&
10732 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
10733 !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
10734 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
10735 << "\n");
10736 return TreeEntry::NeedToGather;
10737 }
10738 // Some intrinsics have scalar arguments and should be same in order for
10739 // them to be vectorized.
10740 for (unsigned J = 0; J != NumArgs; ++J) {
10741 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) {
10742 Value *A1J = CI2->getArgOperand(i: J);
10743 if (ScalarArgs[J] != A1J) {
10744 LLVM_DEBUG(dbgs()
10745 << "SLP: mismatched arguments in call:" << *CI
10746 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
10747 return TreeEntry::NeedToGather;
10748 }
10749 }
10750 }
10751 // Verify that the bundle operands are identical between the two calls.
10752 if (CI->hasOperandBundles() &&
10753 !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
10754 last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
10755 first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
10756 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
10757 << "!=" << *V << '\n');
10758 return TreeEntry::NeedToGather;
10759 }
10760 }
10761 SmallVector<Type *> ArgTys =
10762 buildIntrinsicArgTypes(CI, ID, VF: VL.size(), MinBW: 0, TTI);
10763 auto *VecTy = getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
10764 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
10765 if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
10766 return TreeEntry::NeedToGather;
10767
10768 return TreeEntry::Vectorize;
10769 }
10770 case Instruction::ShuffleVector: {
10771 if (!S.isAltShuffle()) {
      // REVEC can support non-alternate shuffles.
10773 if (SLPReVec && getShufflevectorNumGroups(VL))
10774 return TreeEntry::Vectorize;
10775 // If this is not an alternate sequence of opcode like add-sub
10776 // then do not vectorize this instruction.
10777 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
10778 return TreeEntry::NeedToGather;
10779 }
10780 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
10781 LLVM_DEBUG(
10782 dbgs()
10783 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
10784 "the whole alt sequence is not profitable.\n");
10785 return TreeEntry::NeedToGather;
10786 }
10787
10788 return TreeEntry::Vectorize;
10789 }
10790 default:
10791 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
10792 return TreeEntry::NeedToGather;
10793 }
10794}
10795
10796namespace {
/// Correctly handles the operands of the phi nodes based on the order of
/// incoming basic blocks/values in the \p Main PHINode.
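/// For example (illustrative), for %p0 = phi [%a, %bb0], [%b, %bb1] as the
/// main phi and %p1 = phi [%d, %bb1], [%c, %bb0], buildOperands() produces the
/// operand lists {%a, %c} for %bb0 and {%b, %d} for %bb1, i.e. incoming values
/// are matched by block rather than by operand position.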
10799class PHIHandler {
10800 DominatorTree &DT;
10801 PHINode *Main = nullptr;
10802 SmallVector<Value *> Phis;
10803 SmallVector<SmallVector<Value *>> Operands;
10804
10805public:
10806 PHIHandler() = delete;
10807 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
10808 : DT(DT), Main(Main), Phis(Phis),
10809 Operands(Main->getNumIncomingValues(),
10810 SmallVector<Value *>(Phis.size(), nullptr)) {}
10811 void buildOperands() {
10812 constexpr unsigned FastLimit = 4;
10813 if (Main->getNumIncomingValues() <= FastLimit) {
10814 for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
10815 BasicBlock *InBB = Main->getIncomingBlock(i: I);
10816 if (!DT.isReachableFromEntry(A: InBB)) {
10817 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
10818 continue;
10819 }
10820 // Prepare the operand vector.
10821 for (auto [Idx, V] : enumerate(First&: Phis)) {
10822 auto *P = dyn_cast<PHINode>(Val: V);
10823 if (!P) {
10824 assert(isa<PoisonValue>(V) &&
10825 "Expected isa instruction or poison value.");
10826 Operands[I][Idx] = V;
10827 continue;
10828 }
10829 if (P->getIncomingBlock(i: I) == InBB)
10830 Operands[I][Idx] = P->getIncomingValue(i: I);
10831 else
10832 Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
10833 }
10834 }
10835 return;
10836 }
10837 SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
10838 Blocks;
10839 for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) {
10840 BasicBlock *InBB = Main->getIncomingBlock(i: I);
10841 if (!DT.isReachableFromEntry(A: InBB)) {
10842 Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
10843 continue;
10844 }
10845 Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
10846 }
10847 for (auto [Idx, V] : enumerate(First&: Phis)) {
10848 if (isa<PoisonValue>(Val: V)) {
10849 for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues()))
10850 Operands[I][Idx] = V;
10851 continue;
10852 }
10853 auto *P = cast<PHINode>(Val: V);
10854 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
10855 BasicBlock *InBB = P->getIncomingBlock(i: I);
10856 if (InBB == Main->getIncomingBlock(i: I)) {
10857 if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
10858 continue;
10859 Operands[I][Idx] = P->getIncomingValue(i: I);
10860 continue;
10861 }
10862 auto *It = Blocks.find(Key: InBB);
10863 if (It == Blocks.end())
10864 continue;
10865 Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
10866 }
10867 }
10868 for (const auto &P : Blocks) {
10869 ArrayRef<unsigned> IncomingValues = P.second;
10870 if (IncomingValues.size() <= 1)
10871 continue;
10872 unsigned BasicI = IncomingValues.consume_front();
10873 for (unsigned I : IncomingValues) {
10874 assert(all_of(enumerate(Operands[I]),
10875 [&](const auto &Data) {
10876 return !Data.value() ||
10877 Data.value() == Operands[BasicI][Data.index()];
10878 }) &&
10879 "Expected empty operands list.");
10880 Operands[I] = Operands[BasicI];
10881 }
10882 }
10883 }
10884 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
10885};
10886} // namespace
10887
/// Returns the main/alternate instructions for the given \p VL. Unlike
/// getSameOpcode, it supports incompatible instructions for better
/// SplitVectorize node support.
/// \returns the first main/alt instructions if the list contains only poison
/// values and instructions with exactly 2 opcodes, and a pair of nullptrs
/// otherwise.
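/// For example (illustrative), for VL = {add, poison, sub, add} from one basic
/// block this returns {add, sub}, while a third opcode, or instructions with
/// the same opcode from different blocks, results in a pair of nullptrs.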
10893static std::pair<Instruction *, Instruction *>
10894getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10895 Instruction *MainOp = nullptr;
10896 Instruction *AltOp = nullptr;
10897 for (Value *V : VL) {
10898 if (isa<PoisonValue>(Val: V))
10899 continue;
10900 auto *I = dyn_cast<Instruction>(Val: V);
10901 if (!I)
10902 return {};
10903 if (!MainOp) {
10904 MainOp = I;
10905 continue;
10906 }
10907 if (MainOp->getOpcode() == I->getOpcode()) {
10908 if (I->getParent() != MainOp->getParent())
10909 return {};
10910 continue;
10911 }
10912 if (!AltOp) {
10913 AltOp = I;
10914 continue;
10915 }
10916 if (AltOp->getOpcode() == I->getOpcode()) {
10917 if (I->getParent() != AltOp->getParent())
10918 return {};
10919 continue;
10920 }
10921 return {};
10922 }
10923 if (!AltOp)
10924 return {};
10925 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10926 "Expected different main and alt instructions.");
10927 return std::make_pair(x&: MainOp, y&: AltOp);
10928}
10929
10930/// Checks that every instruction appears once in the list and if not, packs
10931/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10932/// unique scalars is extended by poison values to the whole register size.
10933///
10934/// \returns false if \p VL could not be uniquified, in which case \p VL is
10935/// unchanged and \p ReuseShuffleIndices is empty.
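/// For example (illustrative), for VL = {%a, %b, %a, %c} the unique scalars
/// are {%a, %b, %c}, \p ReuseShuffleIndices becomes {0, 1, 0, 2} and, when
/// padding is allowed, VL may be extended with poison values up to the full
/// register size, e.g. to {%a, %b, %c, poison}.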
10936static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
10937 SmallVectorImpl<int> &ReuseShuffleIndices,
10938 const TargetTransformInfo &TTI,
10939 const TargetLibraryInfo &TLI,
10940 const InstructionsState &S,
10941 const BoUpSLP::EdgeInfo &UserTreeIdx,
10942 bool TryPad = false) {
10943 // Check that every instruction appears once in this bundle.
10944 SmallVector<Value *> UniqueValues;
10945 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
10946 for (Value *V : VL) {
10947 if (isConstant(V)) {
10948 // Constants are always considered distinct, even if the same constant
10949 // appears multiple times in VL.
10950 ReuseShuffleIndices.emplace_back(
10951 Args: isa<PoisonValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
10952 UniqueValues.emplace_back(Args&: V);
10953 continue;
10954 }
10955 auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
10956 ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
10957 if (Res.second)
10958 UniqueValues.emplace_back(Args&: V);
10959 }
10960
10961 // Easy case: VL has unique values and a "natural" size
10962 size_t NumUniqueScalarValues = UniqueValues.size();
10963 bool IsFullVectors = hasFullVectorsOrPowerOf2(
10964 TTI, Ty: getValueType(V: UniqueValues.front()), Sz: NumUniqueScalarValues);
10965 if (NumUniqueScalarValues == VL.size() &&
10966 (VectorizeNonPowerOf2 || IsFullVectors)) {
10967 ReuseShuffleIndices.clear();
10968 return true;
10969 }
10970
  // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
10972 if ((UserTreeIdx.UserTE &&
10973 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
10974 !hasFullVectorsOrPowerOf2(TTI, Ty: getValueType(V: VL.front()), Sz: VL.size())) {
10975 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
10976 "for nodes with padding.\n");
10977 ReuseShuffleIndices.clear();
10978 return false;
10979 }
10980
10981 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
10982 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
10983 (UniquePositions.size() == 1 && all_of(Range&: UniqueValues, P: [](Value *V) {
10984 return isa<UndefValue>(Val: V) || !isConstant(V);
10985 }))) {
10986 if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
10987 S.getMainOp()->isSafeToRemove() &&
10988 (S.areInstructionsWithCopyableElements() ||
10989 all_of(Range&: UniqueValues, P: IsaPred<Instruction, PoisonValue>))) {
10990 // Find the number of elements, which forms full vectors.
10991 unsigned PWSz = getFullVectorNumberOfElements(
10992 TTI, Ty: UniqueValues.front()->getType(), Sz: UniqueValues.size());
10993 PWSz = std::min<unsigned>(a: PWSz, b: VL.size());
10994 if (PWSz == VL.size()) {
10995 // We ended up with the same size after removing duplicates and
10996 // upgrading the resulting vector size to a "nice size". Just keep
10997 // the initial VL then.
10998 ReuseShuffleIndices.clear();
10999 } else {
11000 // Pad unique values with poison to grow the vector to a "nice" size
11001 SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
11002 UniqueValues.end());
11003 PaddedUniqueValues.append(
11004 NumInputs: PWSz - UniqueValues.size(),
11005 Elt: PoisonValue::get(T: UniqueValues.front()->getType()));
11006 // Check that the list, extended with poisons, and the copyable operations
11007 // are still valid for vectorization (div/rem are not allowed).
11008 if ((!S.areInstructionsWithCopyableElements() &&
11009 !getSameOpcode(VL: PaddedUniqueValues, TLI).valid()) ||
11010 (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
11011 (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
11012 isa<CallInst>(Val: S.getMainOp())))) {
11013 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11014 ReuseShuffleIndices.clear();
11015 return false;
11016 }
11017 VL = std::move(PaddedUniqueValues);
11018 }
11019 return true;
11020 }
11021 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
11022 ReuseShuffleIndices.clear();
11023 return false;
11024 }
11025 VL = std::move(UniqueValues);
11026 return true;
11027}
11028
11029bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
11030 const InstructionsState &LocalState,
11031 SmallVectorImpl<Value *> &Op1,
11032 SmallVectorImpl<Value *> &Op2,
11033 OrdersType &ReorderIndices) const {
11034 constexpr unsigned SmallNodeSize = 4;
11035 if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
11036 !SplitAlternateInstructions)
11037 return false;
11038
11039 // Check if this is a duplicate of another split entry.
11040 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
11041 << ".\n");
11042 for (TreeEntry *E : getSplitTreeEntries(V: LocalState.getMainOp())) {
11043 if (E->isSame(VL)) {
11044 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
11045 << *LocalState.getMainOp() << ".\n");
11046 return false;
11047 }
11048 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11049 if (all_of(Range&: VL, P: [&](Value *V) {
11050 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
11051 })) {
11052 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11053 return false;
11054 }
11055 }
11056
11057 ReorderIndices.assign(NumElts: VL.size(), Elt: VL.size());
11058 SmallBitVector Op1Indices(VL.size());
11059 for (auto [Idx, V] : enumerate(First&: VL)) {
11060 auto *I = dyn_cast<Instruction>(Val: V);
11061 if (!I) {
11062 Op1.push_back(Elt: V);
11063 Op1Indices.set(Idx);
11064 continue;
11065 }
11066 if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
11067 isMainInstruction(I, MainOp: LocalState.getMainOp(), AltOp: LocalState.getAltOp(),
11068 TLI: *TLI)) ||
11069 (LocalState.getAltOpcode() == LocalState.getOpcode() &&
11070 !isAlternateInstruction(I, MainOp: LocalState.getMainOp(),
11071 AltOp: LocalState.getAltOp(), TLI: *TLI))) {
11072 Op1.push_back(Elt: V);
11073 Op1Indices.set(Idx);
11074 continue;
11075 }
11076 Op2.push_back(Elt: V);
11077 }
11078 Type *ScalarTy = getValueType(V: VL.front());
11079 VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11080 unsigned Opcode0 = LocalState.getOpcode();
11081 unsigned Opcode1 = LocalState.getAltOpcode();
11082 SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
11083 // Enable the split node only if the instructions do not form a legal
11084 // alternate instruction (like X86 addsub).
11085 SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
11086 SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
11087 if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
11088 TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
11089 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op1.front()->getType(), Sz: Op1.size()) ||
11090 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op2.front()->getType(), Sz: Op2.size()))
11091 return false;
11092 // Build the reorder indices, placing Op1 elements first and Op2 elements after.
11093 unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
11094 for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
11095 if (Op1Indices.test(Idx)) {
11096 ReorderIndices[Op1Cnt] = Idx;
11097 ++Op1Cnt;
11098 } else {
11099 ReorderIndices[Op2Cnt] = Idx;
11100 ++Op2Cnt;
11101 }
11102 }
11103 if (isIdentityOrder(Order: ReorderIndices))
11104 ReorderIndices.clear();
11105 SmallVector<int> Mask;
11106 if (!ReorderIndices.empty())
11107 inversePermutation(Indices: ReorderIndices, Mask);
11108 unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
11109 VectorType *Op1VecTy = getWidenedType(ScalarTy, VF: Op1.size());
11110 VectorType *Op2VecTy = getWidenedType(ScalarTy, VF: Op2.size());
11111 // Check for non-profitable single-register ops, which are better
11112 // represented as alternate ops.
11113 if (NumParts >= VL.size())
11114 return false;
11115 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11116 InstructionCost InsertCost = ::getShuffleCost(
11117 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind: Kind, Index: Op1.size(), SubTp: Op2VecTy);
11118 FixedVectorType *SubVecTy =
11119 getWidenedType(ScalarTy, VF: std::max(a: Op1.size(), b: Op2.size()));
11120 InstructionCost NewShuffleCost =
11121 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: SubVecTy, Mask, CostKind: Kind);
11122 if (!LocalState.isCmpOp() && NumParts <= 1 &&
11123 (Mask.empty() || InsertCost >= NewShuffleCost))
11124 return false;
11125 if ((LocalState.getMainOp()->isBinaryOp() &&
11126 LocalState.getAltOp()->isBinaryOp() &&
11127 (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
11128 LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
11129 (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
11130 (LocalState.getMainOp()->isUnaryOp() &&
11131 LocalState.getAltOp()->isUnaryOp())) {
11132 InstructionCost OriginalVecOpsCost =
11133 TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: VecTy, CostKind: Kind) +
11134 TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: VecTy, CostKind: Kind);
11135 SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
11136 for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
11137 if (isa<PoisonValue>(Val: VL[Idx]))
11138 continue;
11139 OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
11140 }
11141 InstructionCost OriginalCost =
11142 OriginalVecOpsCost + ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
11143 Tp: VecTy, Mask: OriginalMask, CostKind: Kind);
11144 InstructionCost NewVecOpsCost =
11145 TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: Op1VecTy, CostKind: Kind) +
11146 TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: Op2VecTy, CostKind: Kind);
11147 InstructionCost NewCost =
11148 NewVecOpsCost + InsertCost +
11149 (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
11150 VectorizableTree.front()->getOpcode() == Instruction::Store
11151 ? NewShuffleCost
11152 : 0);
11153 // If not profitable to split - exit.
11154 if (NewCost >= OriginalCost)
11155 return false;
11156 }
11157 return true;
11158}
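// Illustrative summary of the cost comparison above for a binary main/alt
// bundle (a sketch, not compiled; C() stands for the TTI arithmetic costs and
// Shuffle() for the TTI shuffle costs used above):
//
//   OriginalCost = C(Opcode0, VecTy) + C(Opcode1, VecTy)
//                  + Shuffle(SK_PermuteTwoSrc, VecTy, OriginalMask)
//   NewCost      = C(Opcode0, Op1VecTy) + C(Opcode1, Op2VecTy)
//                  + Shuffle(SK_InsertSubvector, VecTy)
//                  + (root is a store ? Shuffle(SK_PermuteTwoSrc, SubVecTy) : 0)
//
// For the opcode classes checked above, the split node is built only if
// NewCost < OriginalCost; otherwise the bundle stays an alternate node.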
11159
11160namespace {
11161/// Accepts an incoming list of values, checks if it is able to model
11162/// "copyable" values as compatible operations, and generates the list of values
11163/// for scheduling and the list of operands for the new nodes.
11164class InstructionsCompatibilityAnalysis {
11165 DominatorTree &DT;
11166 const DataLayout &DL;
11167 const TargetTransformInfo &TTI;
11168 const TargetLibraryInfo &TLI;
11169 unsigned MainOpcode = 0;
11170 Instruction *MainOp = nullptr;
11171
11172 /// Checks if the opcode is supported as the main opcode for copyable
11173 /// elements.
11174 static bool isSupportedOpcode(const unsigned Opcode) {
11175 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11176 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11177 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11178 Opcode == Instruction::And || Opcode == Instruction::Or ||
11179 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11180 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11181 Opcode == Instruction::FDiv;
11182 }
11183
11184 /// Identifies the best candidate value, which represents the main opcode
11185 /// operation.
11186 /// Currently the best candidate is an instruction with a supported opcode in
11187 /// the parent block with the highest dominator-tree DFS incoming number.
11188 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11189 BasicBlock *Parent = nullptr;
11190 // Checks if the instruction has a supported opcode.
11191 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11192 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
11193 return false;
11194 return I && isSupportedOpcode(Opcode: I->getOpcode()) &&
11195 (!doesNotNeedToBeScheduled(V: I) || !R.isVectorized(V: I));
11196 };
11197 // Exclude operand instructions immediately to improve compile time; they
11198 // cannot be scheduled anyway.
11199 SmallDenseSet<Value *, 8> Operands;
11200 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11201 bool AnyUndef = false;
11202 for (Value *V : VL) {
11203 auto *I = dyn_cast<Instruction>(Val: V);
11204 if (!I) {
11205 AnyUndef |= isa<UndefValue>(Val: V);
11206 continue;
11207 }
11208 if (!DT.isReachableFromEntry(A: I->getParent()))
11209 continue;
11210 if (Candidates.empty()) {
11211 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11212 Parent = I->getParent();
11213 Operands.insert(I: I->op_begin(), E: I->op_end());
11214 continue;
11215 }
11216 if (Parent == I->getParent()) {
11217 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11218 Operands.insert(I: I->op_begin(), E: I->op_end());
11219 continue;
11220 }
11221 auto *NodeA = DT.getNode(BB: Parent);
11222 auto *NodeB = DT.getNode(BB: I->getParent());
11223 assert(NodeA && "Should only process reachable instructions");
11224 assert(NodeB && "Should only process reachable instructions");
11225 assert((NodeA == NodeB) ==
11226 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11227 "Different nodes should have different DFS numbers");
11228 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11229 Candidates.clear();
11230 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11231 Parent = I->getParent();
11232 Operands.clear();
11233 Operands.insert(I: I->op_begin(), E: I->op_end());
11234 }
11235 }
11236 unsigned BestOpcodeNum = 0;
11237 MainOp = nullptr;
11238 bool UsedOutside = false;
11239 for (const auto &P : Candidates) {
11240 bool PUsedOutside = all_of(Range: P.second, P: isUsedOutsideBlock);
11241 if (UsedOutside && !PUsedOutside)
11242 continue;
11243 if (!UsedOutside && PUsedOutside)
11244 BestOpcodeNum = 0;
11245 if (P.second.size() < BestOpcodeNum)
11246 continue;
11247 // If there are inner dependencies - skip.
11248 if (!PUsedOutside && any_of(Range: P.second, P: [&](Instruction *I) {
11249 return Operands.contains(V: I);
11250 }))
11251 continue;
11252 UsedOutside = PUsedOutside;
11253 for (Instruction *I : P.second) {
11254 if (IsSupportedInstruction(I, AnyUndef)) {
11255 MainOp = I;
11256 BestOpcodeNum = P.second.size();
11257 break;
11258 }
11259 }
11260 }
11261 if (MainOp) {
11262 // Do not match if any copyable is a terminator from the same block as
11263 // the main operation.
11264 if (any_of(Range&: VL, P: [&](Value *V) {
11265 auto *I = dyn_cast<Instruction>(Val: V);
11266 return I && I->getParent() == MainOp->getParent() &&
11267 I->isTerminator();
11268 })) {
11269 MainOp = nullptr;
11270 return;
11271 }
11272 MainOpcode = MainOp->getOpcode();
11273 }
11274 }
11275
11276 /// Returns the idempotent value for the \p MainOp with the detected \p
11277 /// MainOpcode. For Add, returns 0. For Or, it could be either false or the
11278 /// operand itself, since V | V == V.
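 /// For example (illustrative): a copyable element X under an Add main opcode is
 /// later modeled as X + 0, under Mul as X * 1, and under Shl/LShr as a shift of
 /// X by 0; for non-commutative opcodes only a right-hand-side identity is
 /// accepted.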
11279 Value *selectBestIdempotentValue() const {
11280 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11281 return ConstantExpr::getBinOpIdentity(Opcode: MainOpcode, Ty: MainOp->getType(),
11282 AllowRHSConstant: !MainOp->isCommutative());
11283 }
11284
11285 /// Returns the value and operands for \p V, considering whether it is an
11286 /// original instruction, whose actual operands should be returned, or a
11287 /// copyable element, which should be represented as an idempotent instruction.
11288 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11289 if (isa<PoisonValue>(Val: V))
11290 return {V, V};
11291 if (!S.isCopyableElement(V))
11292 return convertTo(I: cast<Instruction>(Val: V), S).second;
11293 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11294 return {V, selectBestIdempotentValue()};
11295 }
11296
11297 /// Builds operands for the original instructions.
11298 void
11299 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11300 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11301
11302 unsigned ShuffleOrOp =
11303 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11304 Instruction *VL0 = S.getMainOp();
11305
11306 switch (ShuffleOrOp) {
11307 case Instruction::PHI: {
11308 auto *PH = cast<PHINode>(Val: VL0);
11309
11310 // Keeps the reordered operands to avoid code duplication.
11311 PHIHandler Handler(DT, PH, VL);
11312 Handler.buildOperands();
11313 Operands.assign(NumElts: PH->getNumOperands(), Elt: {});
11314 for (unsigned I : seq<unsigned>(Size: PH->getNumOperands()))
11315 Operands[I].assign(in_start: Handler.getOperands(I).begin(),
11316 in_end: Handler.getOperands(I).end());
11317 return;
11318 }
11319 case Instruction::ExtractValue:
11320 case Instruction::ExtractElement:
11321 // This is a special case, as it does not gather, but at the same time
11322 // we are not extending buildTreeRec() towards the operands.
11323 Operands.assign(NumElts: 1, Elt: {VL.size(), VL0->getOperand(i: 0)});
11324 return;
11325 case Instruction::InsertElement:
11326 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
11327 for (auto [Idx, V] : enumerate(First&: VL)) {
11328 auto *IE = cast<InsertElementInst>(Val: V);
11329 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11330 Ops[Idx] = IE->getOperand(i_nocapture: OpIdx);
11331 }
11332 return;
11333 case Instruction::Load:
11334 Operands.assign(
11335 NumElts: 1, Elt: {VL.size(),
11336 PoisonValue::get(T: cast<LoadInst>(Val: VL0)->getPointerOperandType())});
11337 for (auto [V, Op] : zip(t&: VL, u&: Operands.back())) {
11338 auto *LI = dyn_cast<LoadInst>(Val: V);
11339 if (!LI)
11340 continue;
11341 Op = LI->getPointerOperand();
11342 }
11343 return;
11344 case Instruction::ZExt:
11345 case Instruction::SExt:
11346 case Instruction::FPToUI:
11347 case Instruction::FPToSI:
11348 case Instruction::FPExt:
11349 case Instruction::PtrToInt:
11350 case Instruction::IntToPtr:
11351 case Instruction::SIToFP:
11352 case Instruction::UIToFP:
11353 case Instruction::Trunc:
11354 case Instruction::FPTrunc:
11355 case Instruction::BitCast:
11356 case Instruction::ICmp:
11357 case Instruction::FCmp:
11358 case Instruction::Select:
11359 case Instruction::FNeg:
11360 case Instruction::Add:
11361 case Instruction::FAdd:
11362 case Instruction::Sub:
11363 case Instruction::FSub:
11364 case Instruction::Mul:
11365 case Instruction::FMul:
11366 case Instruction::UDiv:
11367 case Instruction::SDiv:
11368 case Instruction::FDiv:
11369 case Instruction::URem:
11370 case Instruction::SRem:
11371 case Instruction::FRem:
11372 case Instruction::Shl:
11373 case Instruction::LShr:
11374 case Instruction::AShr:
11375 case Instruction::And:
11376 case Instruction::Or:
11377 case Instruction::Xor:
11378 case Instruction::Freeze:
11379 case Instruction::Store:
11380 case Instruction::ShuffleVector:
11381 Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
11382 for (auto [Idx, V] : enumerate(First&: VL)) {
11383 auto *I = dyn_cast<Instruction>(Val: V);
11384 if (!I) {
11385 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11386 Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
11387 continue;
11388 }
11389 auto [Op, ConvertedOps] = convertTo(I, S);
11390 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11391 Ops[Idx] = ConvertedOps[OpIdx];
11392 }
11393 return;
11394 case Instruction::GetElementPtr: {
11395 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
11396 // Need to cast all indices to the same type before vectorization to
11397 // avoid crash.
11398 // Required to be able to find correct matches between different gather
11399 // nodes and reuse the vectorized values rather than trying to gather them
11400 // again.
11401 const unsigned IndexIdx = 1;
11402 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
11403 Type *Ty =
11404 all_of(Range&: VL,
11405 P: [&](Value *V) {
11406 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
11407 return !GEP || VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
11408 })
11409 ? VL0Ty
11410 : DL.getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
11411 ->getPointerOperandType()
11412 ->getScalarType());
11413 for (auto [Idx, V] : enumerate(First&: VL)) {
11414 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
11415 if (!GEP) {
11416 Operands[0][Idx] = V;
11417 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11418 continue;
11419 }
11420 Operands[0][Idx] = GEP->getPointerOperand();
11421 auto *Op = GEP->getOperand(i_nocapture: IndexIdx);
11422 auto *CI = dyn_cast<ConstantInt>(Val: Op);
11423 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11424 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL)
11425 : Op;
11426 }
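 // Illustrative example (hypothetical values): with a canonical index type of
 // i64, a member GEP with a constant i32 index of 5 gets that index folded to
 // an i64 5, while a non-GEP member contributes the value itself as the pointer
 // and a zero index of the canonical type.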
11427 return;
11428 }
11429 case Instruction::Call: {
11430 auto *CI = cast<CallInst>(Val: VL0);
11431 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
11432 for (unsigned Idx : seq<unsigned>(Size: CI->arg_size())) {
11433 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI: &TTI))
11434 continue;
11435 auto &Ops = Operands.emplace_back();
11436 for (Value *V : VL) {
11437 auto *I = dyn_cast<Instruction>(Val: V);
11438 Ops.push_back(Elt: I ? I->getOperand(i: Idx)
11439 : PoisonValue::get(T: VL0->getOperand(i: Idx)->getType()));
11440 }
11441 }
11442 return;
11443 }
11444 default:
11445 break;
11446 }
11447 llvm_unreachable("Unexpected vectorization of the instructions.");
11448 }
11449
11450public:
11451 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11452 const TargetTransformInfo &TTI,
11453 const TargetLibraryInfo &TLI)
11454 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11455
11456 InstructionsState
11457 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11458 bool TryCopyableElementsVectorization,
11459 bool WithProfitabilityCheck = false,
11460 bool SkipSameCodeCheck = false) {
11461 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11462 ? InstructionsState::invalid()
11463 : getSameOpcode(VL, TLI);
11464 if (S)
11465 return S;
11466 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11467 return S;
11468 findAndSetMainInstruction(VL, R);
11469 if (!MainOp)
11470 return InstructionsState::invalid();
11471 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11472 if (!WithProfitabilityCheck)
11473 return S;
11474 // Check if it is profitable to vectorize the instruction.
11475 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11476 auto BuildCandidates =
11477 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11478 Value *V2) {
11479 if (V1 != V2 && isa<PHINode>(Val: V1))
11480 return;
11481 auto *I1 = dyn_cast<Instruction>(Val: V1);
11482 auto *I2 = dyn_cast<Instruction>(Val: V2);
11483 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11484 I1->getParent() != I2->getParent())
11485 return;
11486 Candidates.emplace_back(Args&: V1, Args&: (I1 || I2) ? V2 : V1);
11487 };
11488 if (VL.size() == 2) {
11489 // Check if the operands allow better vectorization.
11490 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11491 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11492 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11493 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11494 R.findBestRootPair(Candidates: Candidates1) &&
11495 R.findBestRootPair(Candidates: Candidates2);
11496 if (!Res && isCommutative(I: MainOp)) {
11497 Candidates1.clear();
11498 Candidates2.clear();
11499 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11500 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11501 Res = !Candidates1.empty() && !Candidates2.empty() &&
11502 R.findBestRootPair(Candidates: Candidates1) &&
11503 R.findBestRootPair(Candidates: Candidates2);
11504 }
11505 if (!Res)
11506 return InstructionsState::invalid();
11507 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11508 InstructionCost ScalarCost = TTI.getInstructionCost(U: S.getMainOp(), CostKind: Kind);
11509 InstructionCost VectorCost;
11510 FixedVectorType *VecTy =
11511 getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
11512 switch (MainOpcode) {
11513 case Instruction::Add:
11514 case Instruction::Sub:
11515 case Instruction::LShr:
11516 case Instruction::Shl:
11517 case Instruction::SDiv:
11518 case Instruction::UDiv:
11519 case Instruction::And:
11520 case Instruction::Or:
11521 case Instruction::Xor:
11522 case Instruction::FAdd:
11523 case Instruction::FMul:
11524 case Instruction::FSub:
11525 case Instruction::FDiv:
11526 VectorCost = TTI.getArithmeticInstrCost(Opcode: MainOpcode, Ty: VecTy, CostKind: Kind);
11527 break;
11528 default:
11529 llvm_unreachable("Unexpected instruction.");
11530 }
11531 if (VectorCost > ScalarCost)
11532 return InstructionsState::invalid();
11533 return S;
11534 }
11535 assert(Operands.size() == 2 && "Unexpected number of operands!");
11536 unsigned CopyableNum =
11537 count_if(Range&: VL, P: [&](Value *V) { return S.isCopyableElement(V); });
11538 if (CopyableNum < VL.size() / 2)
11539 return S;
11540 // Too many phi copyables - exit.
11541 const unsigned Limit = VL.size() / 24;
11542 if ((CopyableNum >= VL.size() - Limit ||
11543 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11544 CopyableNum >= MaxPHINumOperands) &&
11545 all_of(Range&: VL, P: [&](Value *V) {
11546 return isa<PHINode>(Val: V) || !S.isCopyableElement(V);
11547 }))
11548 return InstructionsState::invalid();
11549 // Check profitability if number of copyables > VL.size() / 2.
11550 // 1. Reorder operands for better matching.
11551 if (isCommutative(I: MainOp)) {
11552 for (auto &Ops : Operands) {
11553 // Make instructions the first operands.
11554 if (!isa<Instruction>(Val: Ops.front()) && isa<Instruction>(Val: Ops.back())) {
11555 std::swap(a&: Ops.front(), b&: Ops.back());
11556 continue;
11557 }
11558 // Make constants the second operands.
11559 if (isa<Constant>(Val: Ops.front())) {
11560 std::swap(a&: Ops.front(), b&: Ops.back());
11561 continue;
11562 }
11563 }
11564 }
11565 // 2. Check if the operands can be vectorized.
11566 if (count_if(Range&: Operands.back(), P: IsaPred<Instruction>) > 1)
11567 return InstructionsState::invalid();
11568 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11569 if (allConstant(VL: Ops) || isSplat(VL: Ops))
11570 return true;
11571 // Check if it is an "almost" splat, i.e. it has >= 4 elements and only a
11572 // single one is different.
11573 constexpr unsigned Limit = 4;
11574 if (Operands.front().size() >= Limit) {
11575 SmallDenseMap<const Value *, unsigned> Counters;
11576 for (Value *V : Ops) {
11577 if (isa<UndefValue>(Val: V))
11578 continue;
11579 ++Counters[V];
11580 }
11581 if (Counters.size() == 2 &&
11582 any_of(Range&: Counters, P: [&](const std::pair<const Value *, unsigned> &C) {
11583 return C.second == 1;
11584 }))
11585 return true;
11586 }
11587 // First operand not a constant or splat? Last attempt - check for
11588 // potential vectorization.
11589 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11590 InstructionsState OpS = Analysis.buildInstructionsState(
11591 VL: Ops, R, /*TryCopyableElementsVectorization=*/true);
11592 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(VL: Ops)))
11593 return false;
11594 unsigned CopyableNum =
11595 count_if(Range&: Ops, P: [&](Value *V) { return OpS.isCopyableElement(V); });
11596 return CopyableNum <= VL.size() / 2;
11597 };
11598 if (!CheckOperand(Operands.front()))
11599 return InstructionsState::invalid();
11600
11601 return S;
11602 }
11603
11604 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11605 ArrayRef<Value *> VL) {
11606 assert(S && "Invalid state!");
11607 SmallVector<BoUpSLP::ValueList> Operands;
11608 if (S.areInstructionsWithCopyableElements()) {
11609 MainOp = S.getMainOp();
11610 MainOpcode = S.getOpcode();
11611 Operands.assign(NumElts: MainOp->getNumOperands(),
11612 Elt: BoUpSLP::ValueList(VL.size(), nullptr));
11613 for (auto [Idx, V] : enumerate(First&: VL)) {
11614 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11615 for (auto [OperandIdx, Operand] : enumerate(First&: OperandsForValue))
11616 Operands[OperandIdx][Idx] = Operand;
11617 }
11618 } else {
11619 buildOriginalOperands(S, VL, Operands);
11620 }
11621 return Operands;
11622 }
11623};
11624} // namespace
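// A minimal usage sketch of the analysis above (it mirrors how the class is
// driven later in this file; illustrative, not a standalone example):
//
//   InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
//   InstructionsState S = Analysis.buildInstructionsState(
//       VL, R, /*TryCopyableElementsVectorization=*/true,
//       /*WithProfitabilityCheck=*/true);
//   if (S)
//     SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);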
11625
11626BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11627 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11628 bool TryCopyableElementsVectorization) const {
11629 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11630
11631 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11632 InstructionsState S = Analysis.buildInstructionsState(
11633 VL, R: *this, TryCopyableElementsVectorization,
11634 /*WithProfitabilityCheck=*/true, SkipSameCodeCheck: TryCopyableElementsVectorization);
11635
11636 bool AreScatterAllGEPSameBlock = false;
11637 if (!S) {
11638 SmallVector<unsigned> SortedIndices;
11639 BasicBlock *BB = nullptr;
11640 bool IsScatterVectorizeUserTE =
11641 UserTreeIdx.UserTE &&
11642 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11643 AreScatterAllGEPSameBlock =
11644 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11645 VL.size() > 2 &&
11646 all_of(Range&: VL,
11647 P: [&BB](Value *V) {
11648 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
11649 if (!I)
11650 return doesNotNeedToBeScheduled(V);
11651 if (!BB)
11652 BB = I->getParent();
11653 return BB == I->getParent() && I->getNumOperands() == 2;
11654 }) &&
11655 BB &&
11656 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL,
11657 SE&: *SE, SortedIndices));
11658 if (!AreScatterAllGEPSameBlock) {
11659 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11660 "C,S,B,O, small shuffle. \n";
11661 dbgs() << "[";
11662 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11663 dbgs() << "]\n");
11664 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11665 /*TryToFindDuplicates=*/true,
11666 /*TrySplitVectorize=*/true);
11667 }
11668 // Reset S to make it GetElementPtr kind of node.
11669 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
11670 assert(It != VL.end() && "Expected at least one GEP.");
11671 S = getSameOpcode(VL: *It, TLI: *TLI);
11672 }
11673 assert(S && "Must be valid.");
11674
11675 // Don't handle vectors.
11676 if (!SLPReVec && getValueType(V: VL.front())->isVectorTy()) {
11677 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11678 // Do not try to pack to avoid extra instructions here.
11679 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11680 /*TryToFindDuplicates=*/false);
11681 }
11682
11683 // Check that all of the users of the scalars that we want to vectorize are
11684 // schedulable.
11685 BasicBlock *BB = S.getMainOp()->getParent();
11686
11687 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()) ||
11688 !DT->isReachableFromEntry(A: BB)) {
11689 // Don't go into unreachable blocks. They may contain instructions with
11690 // dependency cycles which confuse the final scheduling.
11691 // Do not vectorize EH and non-returning blocks, not profitable in most
11692 // cases.
11693 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11694 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11695 }
11696
11697 // Don't go into catchswitch blocks, which can happen with PHIs.
11698 // Such blocks can only have PHIs and the catchswitch. There is no
11699 // place to insert a shuffle if we need to, so just avoid that issue.
11700 if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
11701 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11702 // Do not try to pack to avoid extra instructions here.
11703 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11704 /*TryToFindDuplicates=*/false);
11705 }
11706
11707 // Don't handle scalable vectors
11708 if (S.getOpcode() == Instruction::ExtractElement &&
11709 isa<ScalableVectorType>(
11710 Val: cast<ExtractElementInst>(Val: S.getMainOp())->getVectorOperandType())) {
11711 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11712 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11713 }
11714
11715 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11716 // a load), in which case peek through to include it in the tree, without
11717 // ballooning over-budget.
11718 if (Depth >= RecursionMaxDepth &&
11719 (S.isAltShuffle() || VL.size() < 4 ||
11720 !(match(V: S.getMainOp(), P: m_Load(Op: m_Value())) ||
11721 all_of(Range&: VL, P: [&S](const Value *I) {
11722 return match(V: I,
11723 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
11724 cast<Instruction>(Val: I)->getOpcode() == S.getOpcode();
11725 })))) {
11726 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11727 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11728 }
11729
11730 // Check if this is a duplicate of another entry.
11731 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11732 for (TreeEntry *E : getTreeEntries(V: S.getMainOp())) {
11733 if (E->isSame(VL)) {
11734 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11735 << ".\n");
11736 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11737 }
11738 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11739 if (all_of(Range&: VL, P: [&](Value *V) {
11740 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V) ||
11741 (S.getOpcode() == Instruction::PHI && isa<PHINode>(Val: V) &&
11742 LI->getLoopFor(BB: S.getMainOp()->getParent()) &&
11743 isVectorized(V));
11744 })) {
11745 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11746 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11747 }
11748 }
11749
11750 // If all of the operands are identical or constant we have a simple solution.
11751 // If we deal with insert/extract instructions, they all must have constant
11752 // indices, otherwise we should gather them, not try to vectorize.
11753 // If this is an alternate op node with 2 elements whose operands would be
11754 // gathered - do not vectorize.
11755 auto NotProfitableForVectorization = [&S, this, Depth](ArrayRef<Value *> VL) {
11756 if (!S || !S.isAltShuffle() || VL.size() > 2)
11757 return false;
11758 if (VectorizableTree.size() < MinTreeSize)
11759 return false;
11760 if (Depth >= RecursionMaxDepth - 1)
11761 return true;
11762 // Check if all operands are extracts, part of vector node or can build a
11763 // regular vectorize node.
11764 SmallVector<unsigned, 8> InstsCount;
11765 for (Value *V : VL) {
11766 auto *I = cast<Instruction>(Val: V);
11767 InstsCount.push_back(Elt: count_if(Range: I->operand_values(), P: [](Value *Op) {
11768 return isa<Instruction>(Val: Op) || isVectorLikeInstWithConstOps(V: Op);
11769 }));
11770 }
11771 bool IsCommutative =
11772 isCommutative(I: S.getMainOp()) || isCommutative(I: S.getAltOp());
11773 if ((IsCommutative &&
11774 std::accumulate(first: InstsCount.begin(), last: InstsCount.end(), init: 0) < 2) ||
11775 (!IsCommutative &&
11776 all_of(Range&: InstsCount, P: [](unsigned ICnt) { return ICnt < 2; })))
11777 return true;
11778 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
11779 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
11780 auto *I1 = cast<Instruction>(Val: VL.front());
11781 auto *I2 = cast<Instruction>(Val: VL.back());
11782 for (int Op : seq<int>(Size: S.getMainOp()->getNumOperands()))
11783 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
11784 Args: I2->getOperand(i: Op));
11785 if (static_cast<unsigned>(count_if(
11786 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11787 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
11788 })) >= S.getMainOp()->getNumOperands() / 2)
11789 return false;
11790 if (S.getMainOp()->getNumOperands() > 2)
11791 return true;
11792 if (IsCommutative) {
11793 // Check permuted operands.
11794 Candidates.clear();
11795 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
11796 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
11797 Args: I2->getOperand(i: (Op + 1) % E));
11798 if (any_of(
11799 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
11800 return findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplat);
11801 }))
11802 return false;
11803 }
11804 return true;
11805 };
11806 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11807 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11808 if (!AreAllSameInsts || isSplat(VL) ||
11809 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11810 Val: S.getMainOp()) &&
11811 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps)) ||
11812 NotProfitableForVectorization(VL)) {
11813 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n";
11814 dbgs() << "[";
11815 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11816 dbgs() << "]\n");
11817 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11818 }
11819
11820 // Don't vectorize ephemeral values.
11821 if (!EphValues.empty()) {
11822 for (Value *V : VL) {
11823 if (EphValues.count(Ptr: V)) {
11824 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11825 << ") is ephemeral.\n");
11826 // Do not try to pack to avoid extra instructions here.
11827 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11828 /*TryToFindDuplicates=*/false);
11829 }
11830 }
11831 }
11832
11833 // We now know that this is a vector of instructions of the same type from
11834 // the same block.
11835
11836 // Check whether any of the instructions in the bundle are already in the
11837 // tree; if so, the node may not be profitable for vectorization as a small
11838 // alternate node.
11839 if (S.isAltShuffle()) {
11840 auto GetNumVectorizedExtracted = [&]() {
11841 APInt Extracted = APInt::getZero(numBits: VL.size());
11842 APInt Vectorized = APInt::getAllOnes(numBits: VL.size());
11843 for (auto [Idx, V] : enumerate(First&: VL)) {
11844 auto *I = dyn_cast<Instruction>(Val: V);
11845 if (!I || doesNotNeedToBeScheduled(V: I) ||
11846 all_of(Range: I->operands(), P: [&](const Use &U) {
11847 return isa<ExtractElementInst>(Val: U.get());
11848 }))
11849 continue;
11850 if (isVectorized(V: I))
11851 Vectorized.clearBit(BitPosition: Idx);
11852 else if (!I->hasOneUser() && !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList))
11853 Extracted.setBit(Idx);
11854 }
11855 return std::make_pair(x&: Vectorized, y&: Extracted);
11856 };
11857 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11858 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11859 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11860 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11861 // Rough cost estimation of whether the vector code (+ potential extracts)
11862 // is more profitable than the scalar code + buildvector.
11863 Type *ScalarTy = VL.front()->getType();
11864 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11865 InstructionCost VectorizeCostEstimate =
11866 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: {}, CostKind: Kind) +
11867 ::getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Extracted,
11868 /*Insert=*/false, /*Extract=*/true, CostKind: Kind);
11869 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11870 TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Vectorized,
11871 /*Insert=*/true, /*Extract=*/false, CostKind: Kind, /*ForPoisonSrc=*/false);
11872 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11873 }
11874 if (PreferScalarize) {
11875 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11876 "node is not profitable.\n");
11877 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11878 }
11879 }
11880
11881 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11882 if (UserIgnoreList && !UserIgnoreList->empty()) {
11883 for (Value *V : VL) {
11884 if (UserIgnoreList->contains(V)) {
11885 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11886 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11887 }
11888 }
11889 }
11890
11891 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11892}
11893
11894void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
11895 const EdgeInfo &UserTreeIdx,
11896 unsigned InterleaveFactor) {
11897 assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");
11898
11899 SmallVector<int> ReuseShuffleIndices;
11900 SmallVector<Value *> VL(VLRef);
11901
11902 // Tries to build a split node.
11903 auto TrySplitNode = [&](const InstructionsState &LocalState) {
11904 SmallVector<Value *> Op1, Op2;
11905 OrdersType ReorderIndices;
11906 if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
11907 return false;
11908
11909 auto Invalid = ScheduleBundle::invalid();
11910 auto *TE = newTreeEntry(VL, EntryState: TreeEntry::SplitVectorize, Bundle&: Invalid, S: LocalState,
11911 UserTreeIdx, ReuseShuffleIndices: {}, ReorderIndices);
11912 LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
11913 auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
11914 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
11915 if (S && (isa<LoadInst>(Val: S.getMainOp()) ||
11916 getSameValuesTreeEntry(V: S.getMainOp(), VL: Op, /*SameVF=*/true))) {
11917 // Build a gather node for loads; they will be gathered later.
11918 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
11919 Args: Idx == 0 ? 0 : Op1.size());
11920 (void)newTreeEntry(VL: Op, EntryState: TreeEntry::NeedToGather, Bundle&: Invalid, S, UserTreeIdx: {TE, Idx});
11921 } else {
11922 TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
11923 Args: Idx == 0 ? 0 : Op1.size());
11924 buildTreeRec(VLRef: Op, Depth, UserTreeIdx: {TE, Idx});
11925 }
11926 };
11927 AddNode(Op1, 0);
11928 AddNode(Op2, 1);
11929 return true;
11930 };
11931
11932 auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
11933 bool AreConsts = false;
11934 for (Value *V : VL) {
11935 if (isa<PoisonValue>(Val: V))
11936 continue;
11937 if (isa<Constant>(Val: V)) {
11938 AreConsts = true;
11939 continue;
11940 }
11941 if (!isa<PHINode>(Val: V))
11942 return false;
11943 }
11944 return AreConsts;
11945 };
11946 if (AreOnlyConstsWithPHIs(VL)) {
11947 LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
11948 newGatherTreeEntry(VL, S: InstructionsState::invalid(), UserTreeIdx);
11949 return;
11950 }
11951
11952 ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
11953 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
11954 InstructionsState S = Legality.getInstructionsState();
11955 if (!Legality.isLegal()) {
11956 if (Legality.trySplitVectorize()) {
11957 auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
11958 // Last chance to try to vectorize alternate node.
11959 if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
11960 return;
11961 }
11962 if (!S)
11963 Legality = getScalarsVectorizationLegality(
11964 VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
11965 if (!Legality.isLegal()) {
11966 if (Legality.tryToFindDuplicates())
11967 tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S,
11968 UserTreeIdx);
11969
11970 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11971 return;
11972 }
11973 S = Legality.getInstructionsState();
11974 }
11975
11976 // FIXME: investigate if there are profitable cases for VL.size() <= 4.
11977 if (S.isAltShuffle() && TrySplitNode(S))
11978 return;
11979
11980 // Check that every instruction appears once in this bundle.
11981 if (!tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx,
11982 /*TryPad=*/true)) {
11983 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11984 return;
11985 }
11986
11987 // Perform specific checks for each particular instruction kind.
11988 bool IsScatterVectorizeUserTE =
11989 UserTreeIdx.UserTE &&
11990 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11991 OrdersType CurrentOrder;
11992 SmallVector<Value *> PointerOps;
11993 StridedPtrInfo SPtrInfo;
11994 TreeEntry::EntryState State = getScalarsVectorizationState(
11995 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
11996 if (State == TreeEntry::NeedToGather) {
11997 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
11998 return;
11999 }
12000
12001 Instruction *VL0 = S.getMainOp();
12002 BasicBlock *BB = VL0->getParent();
12003 auto &BSRef = BlocksSchedules[BB];
12004 if (!BSRef)
12005 BSRef = std::make_unique<BlockScheduling>(args&: BB);
12006
12007 BlockScheduling &BS = *BSRef;
12008
12009 SetVector<Value *> UniqueValues(llvm::from_range, VL);
12010 std::optional<ScheduleBundle *> BundlePtr =
12011 BS.tryScheduleBundle(VL: UniqueValues.getArrayRef(), SLP: this, S, EI: UserTreeIdx);
12012#ifdef EXPENSIVE_CHECKS
12013 // Make sure we didn't break any internal invariants
12014 BS.verify();
12015#endif
12016 if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
12017 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
12018 // Last chance to try to vectorize alternate node.
12019 if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
12020 return;
12021 newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
12022 NonScheduledFirst.insert(Ptr: VL.front());
12023 if (S.getOpcode() == Instruction::Load &&
12024 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
12025 registerNonVectorizableLoads(VL: ArrayRef(VL));
12026 return;
12027 }
12028 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
12029 SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
12030 ScheduleBundle Empty;
12031 ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
12032 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
12033
12034 unsigned ShuffleOrOp =
12035 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
12036 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
12037 // Postpone building tree nodes for PHI operands.
12038 SmallVector<unsigned> PHIOps;
12039 for (unsigned I : seq<unsigned>(Operands.size())) {
12040 ArrayRef<Value *> Op = Operands[I];
12041 if (Op.empty())
12042 continue;
12043 InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
12044 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
12045 buildTreeRec(VLRef: Op, Depth: Depth + 1, UserTreeIdx: {TE, I});
12046 else
12047 PHIOps.push_back(Elt: I);
12048 }
12049 for (unsigned I : PHIOps)
12050 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
12051 };
12052 switch (ShuffleOrOp) {
12053 case Instruction::PHI: {
12054 TreeEntry *TE =
12055 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
12056 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
12057 TE->dump());
12058
12059 TE->setOperands(Operands);
12060 CreateOperandNodes(TE, Operands);
12061 return;
12062 }
12063 case Instruction::ExtractValue:
12064 case Instruction::ExtractElement: {
12065 if (CurrentOrder.empty()) {
12066 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
12067 } else {
12068 LLVM_DEBUG({
12069 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
12070 "with order";
12071 for (unsigned Idx : CurrentOrder)
12072 dbgs() << " " << Idx;
12073 dbgs() << "\n";
12074 });
12075 fixupOrderingIndices(Order: CurrentOrder);
12076 }
12077 // Insert new order with initial value 0, if it does not exist,
12078 // otherwise return the iterator to the existing one.
12079 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12080 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12081 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
12082 "(ExtractValueInst/ExtractElementInst).\n";
12083 TE->dump());
12084 // This is a special case, as it does not gather, but at the same time
12085 // we are not extending buildTreeRec() towards the operands.
12086 TE->setOperands(Operands);
12087 return;
12088 }
12089 case Instruction::InsertElement: {
12090 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
12091
12092 auto OrdCompare = [](const std::pair<int, int> &P1,
12093 const std::pair<int, int> &P2) {
12094 return P1.first > P2.first;
12095 };
12096 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
12097 decltype(OrdCompare)>
12098 Indices(OrdCompare);
12099 for (int I = 0, E = VL.size(); I < E; ++I) {
12100 unsigned Idx = *getElementIndex(Inst: VL[I]);
12101 Indices.emplace(args&: Idx, args&: I);
12102 }
12103 OrdersType CurrentOrder(VL.size(), VL.size());
12104 bool IsIdentity = true;
12105 for (int I = 0, E = VL.size(); I < E; ++I) {
12106 CurrentOrder[Indices.top().second] = I;
12107 IsIdentity &= Indices.top().second == I;
12108 Indices.pop();
12109 }
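 // Worked example (illustrative): for insert indices {2, 0, 1} at VL positions
 // {0, 1, 2}, the min-heap pops (0, pos 1), (1, pos 2), (2, pos 0), producing
 // CurrentOrder = {2, 0, 1}, which is not an identity order.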
12110 if (IsIdentity)
12111 CurrentOrder.clear();
12112 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12113 ReuseShuffleIndices: {}, ReorderIndices: CurrentOrder);
12114 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
12115 TE->dump());
12116
12117 TE->setOperands(Operands);
12118 buildTreeRec(VLRef: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
12119 return;
12120 }
12121 case Instruction::Load: {
12122 // Check that a vectorized load would load the same memory as a scalar
12123 // load. For example, we don't want to vectorize loads that are smaller
12124 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
12125 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
12126 // from such a struct, we read/write packed bits disagreeing with the
12127 // unvectorized version.
12128 TreeEntry *TE = nullptr;
12129 fixupOrderingIndices(Order: CurrentOrder);
12130 switch (State) {
12131 case TreeEntry::Vectorize:
12132 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12133 ReuseShuffleIndices, ReorderIndices: CurrentOrder, InterleaveFactor);
12134 if (CurrentOrder.empty())
12135 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
12136 TE->dump());
12137 else
12138 LLVM_DEBUG(dbgs()
12139 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
12140 TE->dump());
12141 break;
12142 case TreeEntry::CompressVectorize:
12143 // Vectorizing non-consecutive loads with (masked)load + compress.
12144 TE = newTreeEntry(VL, EntryState: TreeEntry::CompressVectorize, Bundle, S,
12145 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12146 LLVM_DEBUG(
12147 dbgs()
12148 << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
12149 TE->dump());
12150 break;
12151 case TreeEntry::StridedVectorize:
12152 // Vectorizing non-consecutive loads with strided loads.
12153 TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
12154 UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12155 TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
12156 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
12157 TE->dump());
12158 break;
12159 case TreeEntry::ScatterVectorize:
12160 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
12161 TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
12162 UserTreeIdx, ReuseShuffleIndices);
12163 LLVM_DEBUG(
12164 dbgs()
12165 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
12166 TE->dump());
12167 break;
12168 case TreeEntry::CombinedVectorize:
12169 case TreeEntry::SplitVectorize:
12170 case TreeEntry::NeedToGather:
12171 llvm_unreachable("Unexpected loads state.");
12172 }
12173 if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
12174 assert(Operands.size() == 1 && "Expected a single operand only");
12175 SmallVector<int> Mask;
12176 inversePermutation(Indices: CurrentOrder, Mask);
12177 reorderScalars(Scalars&: Operands.front(), Mask);
12178 }
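 // Worked example (illustrative): for CurrentOrder = {2, 0, 1} the inverse
 // permutation is Mask = {1, 2, 0} (Mask[Order[I]] = I), and the single pointer
 // operand list is reordered with that mask.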
12179 TE->setOperands(Operands);
12180 if (State == TreeEntry::ScatterVectorize)
12181 buildTreeRec(VLRef: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
12182 return;
12183 }
12184 case Instruction::ZExt:
12185 case Instruction::SExt:
12186 case Instruction::FPToUI:
12187 case Instruction::FPToSI:
12188 case Instruction::FPExt:
12189 case Instruction::PtrToInt:
12190 case Instruction::IntToPtr:
12191 case Instruction::SIToFP:
12192 case Instruction::UIToFP:
12193 case Instruction::Trunc:
12194 case Instruction::FPTrunc:
12195 case Instruction::BitCast: {
12196 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
12197 u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
12198 y: std::numeric_limits<unsigned>::max()));
12199 if (ShuffleOrOp == Instruction::ZExt ||
12200 ShuffleOrOp == Instruction::SExt) {
12201 CastMaxMinBWSizes = std::make_pair(
12202 x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
12203 b: PrevMaxBW),
12204 y: std::min<unsigned>(
12205 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
12206 b: PrevMinBW));
12207 } else if (ShuffleOrOp == Instruction::Trunc) {
12208 CastMaxMinBWSizes = std::make_pair(
12209 x: std::max<unsigned>(
12210 a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
12211 b: PrevMaxBW),
12212 y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
12213 b: PrevMinBW));
12214 }
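 // Illustrative example: for a bundle of 'zext i8 -> i32' the pair becomes
 // {max(32, PrevMaxBW), min(8, PrevMinBW)}; for 'trunc i32 -> i8' it likewise
 // becomes {max(32, PrevMaxBW), min(8, PrevMinBW)}, tracking the widest and
 // narrowest type sizes seen so far.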
12215 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12216 ReuseShuffleIndices);
12217 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
12218 TE->dump());
12219
12220 TE->setOperands(Operands);
12221 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
12222 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
12223 if (ShuffleOrOp == Instruction::Trunc) {
12224 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
12225 } else if (ShuffleOrOp == Instruction::SIToFP ||
12226 ShuffleOrOp == Instruction::UIToFP) {
12227 unsigned NumSignBits =
12228 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
12229 if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
12230 APInt Mask = DB->getDemandedBits(I: OpI);
12231 NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
12232 }
12233 if (NumSignBits * 2 >=
12234 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
12235 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
12236 }
12237 return;
12238 }
12239 case Instruction::ICmp:
12240 case Instruction::FCmp: {
12241 // Check that all of the compares have the same predicate.
12242 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
12243 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12244 ReuseShuffleIndices);
12245 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
12246 TE->dump());
12247
12248 VLOperands Ops(VL, Operands, S, *this);
12249 if (cast<CmpInst>(Val: VL0)->isCommutative()) {
12250 // Commutative predicate - collect + sort operands of the instructions
12251 // so that each side is more likely to have the same opcode.
12252 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
12253 "Commutative Predicate mismatch");
12254 Ops.reorder();
12255 Operands.front() = Ops.getVL(OpIdx: 0);
12256 Operands.back() = Ops.getVL(OpIdx: 1);
12257 } else {
12258 // Collect operands - commute if it uses the swapped predicate.
12259 for (auto [Idx, V] : enumerate(First&: VL)) {
12260 if (isa<PoisonValue>(Val: V))
12261 continue;
12262 auto *Cmp = cast<CmpInst>(Val: V);
12263 if (Cmp->getPredicate() != P0)
12264 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
12265 }
12266 }
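 // Illustrative example: if the common predicate P0 is 'slt', a member
 // 'icmp sgt %a, %b' has its operands swapped so that it participates as
 // 'icmp slt %b, %a' in the vectorized comparison.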
12267 TE->setOperands(Operands);
12268 buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
12269 buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
12270 if (ShuffleOrOp == Instruction::ICmp) {
12271 unsigned NumSignBits0 =
12272 ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
12273 if (NumSignBits0 * 2 >=
12274 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
12275 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
12276 unsigned NumSignBits1 =
12277 ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
12278 if (NumSignBits1 * 2 >=
12279 DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
12280 ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
12281 }
12282 return;
12283 }
12284 case Instruction::Select:
12285 case Instruction::FNeg:
12286 case Instruction::Add:
12287 case Instruction::FAdd:
12288 case Instruction::Sub:
12289 case Instruction::FSub:
12290 case Instruction::Mul:
12291 case Instruction::FMul:
12292 case Instruction::UDiv:
12293 case Instruction::SDiv:
12294 case Instruction::FDiv:
12295 case Instruction::URem:
12296 case Instruction::SRem:
12297 case Instruction::FRem:
12298 case Instruction::Shl:
12299 case Instruction::LShr:
12300 case Instruction::AShr:
12301 case Instruction::And:
12302 case Instruction::Or:
12303 case Instruction::Xor:
12304 case Instruction::Freeze: {
12305 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12306 ReuseShuffleIndices);
12307 LLVM_DEBUG(
12308 dbgs() << "SLP: added a new TreeEntry "
12309 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
12310 TE->dump());
12311
12312 if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
12313 VLOperands Ops(VL, Operands, S, *this);
12314 Ops.reorder();
12315 Operands[0] = Ops.getVL(OpIdx: 0);
12316 Operands[1] = Ops.getVL(OpIdx: 1);
12317 }
12318 TE->setOperands(Operands);
12319 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
12320 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
12321 return;
12322 }
12323 case Instruction::GetElementPtr: {
12324 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12325 ReuseShuffleIndices);
12326 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
12327 TE->dump());
12328 TE->setOperands(Operands);
12329
12330 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
12331 buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
12332 return;
12333 }
12334 case Instruction::Store: {
12335 bool Consecutive = CurrentOrder.empty();
12336 if (!Consecutive)
12337 fixupOrderingIndices(Order: CurrentOrder);
12338 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12339 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
12340 if (Consecutive)
12341 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
12342 TE->dump());
12343 else
12344 LLVM_DEBUG(
12345 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
12346 TE->dump());
12347 TE->setOperands(Operands);
12348 buildTreeRec(VLRef: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
12349 return;
12350 }
12351 case Instruction::Call: {
12352 // Check if the calls are all to the same vectorizable intrinsic or
12353 // library function.
12354 CallInst *CI = cast<CallInst>(Val: VL0);
12355 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12356
12357 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12358 ReuseShuffleIndices);
12359 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
12360 TE->dump());
12361 if (isCommutative(I: VL0)) {
12362 VLOperands Ops(VL, Operands, S, *this);
12363 Ops.reorder();
12364 Operands[0] = Ops.getVL(OpIdx: 0);
12365 Operands[1] = Ops.getVL(OpIdx: 1);
12366 }
12367 TE->setOperands(Operands);
12368 for (unsigned I : seq<unsigned>(Size: CI->arg_size())) {
12369 // For scalar operands there is no need to create an entry, since they
12370 // are not vectorized.
12371 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI))
12372 continue;
12373 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
12374 }
12375 return;
12376 }
12377 case Instruction::ShuffleVector: {
12378 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
12379 ReuseShuffleIndices);
12380 if (S.isAltShuffle()) {
12381 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
12382 TE->dump());
12383 } else {
12384 assert(SLPReVec && "Only supported by REVEC.");
12385 LLVM_DEBUG(
12386 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
12387 TE->dump());
12388 }
12389
12390 // Reorder operands if reordering would enable vectorization.
12391 auto *CI = dyn_cast<CmpInst>(Val: VL0);
12392 if (CI && any_of(Range&: VL, P: [](Value *V) {
12393 return !isa<PoisonValue>(Val: V) && !cast<CmpInst>(Val: V)->isCommutative();
12394 })) {
12395 auto *MainCI = cast<CmpInst>(Val: S.getMainOp());
12396 auto *AltCI = cast<CmpInst>(Val: S.getAltOp());
12397 CmpInst::Predicate MainP = MainCI->getPredicate();
12398 CmpInst::Predicate AltP = AltCI->getPredicate();
12399 assert(MainP != AltP &&
12400 "Expected different main/alternate predicates.");
12401 // Collect operands - commute if it uses the swapped predicate or
12402 // alternate operation.
12403 for (auto [Idx, V] : enumerate(First&: VL)) {
12404 if (isa<PoisonValue>(Val: V))
12405 continue;
12406 auto *Cmp = cast<CmpInst>(Val: V);
12407
12408 if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
12409 if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
12410 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
12411 } else {
12412 if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
12413 std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
12414 }
12415 }
12416 TE->setOperands(Operands);
12417 buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
12418 buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
12419 return;
12420 }
12421
12422 if (isa<BinaryOperator>(Val: VL0) || CI) {
12423 VLOperands Ops(VL, Operands, S, *this);
12424 Ops.reorder();
12425 Operands[0] = Ops.getVL(OpIdx: 0);
12426 Operands[1] = Ops.getVL(OpIdx: 1);
12427 }
12428 TE->setOperands(Operands);
12429 for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
12430 buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
12431 return;
12432 }
12433 default:
12434 break;
12435 }
12436 llvm_unreachable("Unexpected vectorization of the instructions.");
12437}
12438
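/// Added illustrative note: canMapToVector flattens a homogeneous aggregate
/// into a scalar element count, e.g. a [2 x <2 x float>] yields N = 4,
/// provided the widened <4 x float> fits between MinVecRegSize and
/// MaxVecRegSize and has the same store size as the original type;
/// otherwise it returns 0.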
12439unsigned BoUpSLP::canMapToVector(Type *T) const {
12440 unsigned N = 1;
12441 Type *EltTy = T;
12442
12443 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
12444 if (EltTy->isEmptyTy())
12445 return 0;
12446 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
12447 // Check that struct is homogeneous.
12448 for (const auto *Ty : ST->elements())
12449 if (Ty != *ST->element_begin())
12450 return 0;
12451 N *= ST->getNumElements();
12452 EltTy = *ST->element_begin();
12453 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
12454 N *= AT->getNumElements();
12455 EltTy = AT->getElementType();
12456 } else {
12457 auto *VT = cast<FixedVectorType>(Val: EltTy);
12458 N *= VT->getNumElements();
12459 EltTy = VT->getElementType();
12460 }
12461 }
12462
12463 if (!isValidElementType(Ty: EltTy))
12464 return 0;
12465 size_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
12466 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12467 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
12468 return 0;
12469 return N;
12470}
12471
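/// Illustrative example (added note), assuming a two-element bundle:
/// \code
///   %e0 = extractelement <2 x i32> %vec, i32 1
///   %e1 = extractelement <2 x i32> %vec, i32 0
/// \endcode
/// For VL = {%e0, %e1} the function fills CurrentOrder = {1, 0} and returns
/// false (a reorder is needed); for the identity order {%e1, %e0} it clears
/// CurrentOrder and returns true.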
12472bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
12473 SmallVectorImpl<unsigned> &CurrentOrder,
12474 bool ResizeAllowed) const {
12475 const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
12476 assert(It != VL.end() && "Expected at least one extract instruction.");
12477 auto *E0 = cast<Instruction>(Val: *It);
12478 assert(
12479 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
12480 "Invalid opcode");
12481 // Check if all of the extracts come from the same vector and from the
12482 // correct offset.
12483 Value *Vec = E0->getOperand(i: 0);
12484
12485 CurrentOrder.clear();
12486
12487 // We have to extract from a vector/aggregate with the same number of elements.
12488 unsigned NElts;
12489 if (E0->getOpcode() == Instruction::ExtractValue) {
12490 NElts = canMapToVector(T: Vec->getType());
12491 if (!NElts)
12492 return false;
12493 // Check if the load can be rewritten as a load of a vector.
12494 LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
12495 if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
12496 return false;
12497 } else {
12498 NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
12499 }
12500
12501 unsigned E = VL.size();
12502 if (!ResizeAllowed && NElts != E)
12503 return false;
12504 SmallVector<int> Indices(E, PoisonMaskElem);
12505 unsigned MinIdx = NElts, MaxIdx = 0;
12506 for (auto [I, V] : enumerate(First&: VL)) {
12507 auto *Inst = dyn_cast<Instruction>(Val: V);
12508 if (!Inst)
12509 continue;
12510 if (Inst->getOperand(i: 0) != Vec)
12511 return false;
12512 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
12513 if (isa<UndefValue>(Val: EE->getIndexOperand()))
12514 continue;
12515 std::optional<unsigned> Idx = getExtractIndex(E: Inst);
12516 if (!Idx)
12517 return false;
12518 const unsigned ExtIdx = *Idx;
12519 if (ExtIdx >= NElts)
12520 continue;
12521 Indices[I] = ExtIdx;
12522 if (MinIdx > ExtIdx)
12523 MinIdx = ExtIdx;
12524 if (MaxIdx < ExtIdx)
12525 MaxIdx = ExtIdx;
12526 }
12527 if (MaxIdx - MinIdx + 1 > E)
12528 return false;
12529 if (MaxIdx + 1 <= E)
12530 MinIdx = 0;
12531
12532 // Check that all of the indices extract from the correct offset.
12533 bool ShouldKeepOrder = true;
12534 // Assign to all items the initial value E so we can check if the extract
12535 // instruction index was used already.
12536 // Also, later we can check that all the indices are used and we have a
12537 // consecutive access in the extract instructions, by checking that no
12538 // element of CurrentOrder still has value E.
12539 CurrentOrder.assign(NumElts: E, Elt: E);
12540 for (unsigned I = 0; I < E; ++I) {
12541 if (Indices[I] == PoisonMaskElem)
12542 continue;
12543 const unsigned ExtIdx = Indices[I] - MinIdx;
12544 if (CurrentOrder[ExtIdx] != E) {
12545 CurrentOrder.clear();
12546 return false;
12547 }
12548 ShouldKeepOrder &= ExtIdx == I;
12549 CurrentOrder[ExtIdx] = I;
12550 }
12551 if (ShouldKeepOrder)
12552 CurrentOrder.clear();
12553
12554 return ShouldKeepOrder;
12555}
12556
12557bool BoUpSLP::areAllUsersVectorized(
12558 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12559 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
12560 all_of(Range: I->users(), P: [this](User *U) {
12561 return isVectorized(V: U) || isVectorLikeInstWithConstOps(V: U) ||
12562 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
12563 });
12564}
12565
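/// Illustrative example (added note): for an alternate-opcode entry with
/// Scalars = {add, sub, add, sub} (Sz = 4), no reordering/reuse and IsAltOp
/// matching the subs, the produced Mask is <0, Sz + 1, 2, Sz + 3> =
/// <0, 5, 2, 7>, picking the add lanes from the "main" vector and the sub
/// lanes from the "alternate" vector.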
12566void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
12567 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
12568 SmallVectorImpl<Value *> *OpScalars,
12569 SmallVectorImpl<Value *> *AltScalars) const {
12570 unsigned Sz = Scalars.size();
12571 Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
12572 SmallVector<int> OrderMask;
12573 if (!ReorderIndices.empty())
12574 inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
12575 for (unsigned I = 0; I < Sz; ++I) {
12576 unsigned Idx = I;
12577 if (!ReorderIndices.empty())
12578 Idx = OrderMask[I];
12579 if (isa<PoisonValue>(Val: Scalars[Idx]))
12580 continue;
12581 auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
12582 if (IsAltOp(OpInst)) {
12583 Mask[I] = Sz + Idx;
12584 if (AltScalars)
12585 AltScalars->push_back(Elt: OpInst);
12586 } else {
12587 Mask[I] = Idx;
12588 if (OpScalars)
12589 OpScalars->push_back(Elt: OpInst);
12590 }
12591 }
12592 if (!ReuseShuffleIndices.empty()) {
12593 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
12594 transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
12595 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
12596 });
12597 Mask.swap(RHS&: NewMask);
12598 }
12599}
12600
12601static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12602 Instruction *AltOp,
12603 const TargetLibraryInfo &TLI) {
12604 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12605}
12606
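// Illustrative example (added note): for a node with MainOp = add and
// AltOp = sub, an add instruction is classified as the main operation
// (returns false) and a sub as the alternate one (returns true); cmp
// instructions are additionally matched against their swapped predicates.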
12607static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
12608 Instruction *AltOp,
12609 const TargetLibraryInfo &TLI) {
12610 if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
12611 auto *AltCI = cast<CmpInst>(Val: AltOp);
12612 CmpInst::Predicate MainP = MainCI->getPredicate();
12613 [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
12614 assert(MainP != AltP && "Expected different main/alternate predicates.");
12615 auto *CI = cast<CmpInst>(Val: I);
12616 if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
12617 return false;
12618 if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
12619 return true;
12620 CmpInst::Predicate P = CI->getPredicate();
12621 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);
12622
12623 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
12624 "CmpInst expected to match either main or alternate predicate or "
12625 "their swap.");
12626 return MainP != P && MainP != SwappedP;
12627 }
12628 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
12629}
12630
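// Illustrative example (added note): Ops = {i32 8, i32 8} yields
// {OK_UniformConstantValue, OP_PowerOf2}; Ops = {i32 3, i32 5} yields
// {OK_NonUniformConstantValue, OP_None}; two identical non-constant values
// yield {OK_UniformValue, OP_None}.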
12631TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
12632 assert(!Ops.empty());
12633 const auto *Op0 = Ops.front();
12634
12635 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
12636 // TODO: We should allow undef elements here
12637 return isConstant(V) && !isa<UndefValue>(Val: V);
12638 });
12639 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
12640 // TODO: We should allow undef elements here
12641 return V == Op0;
12642 });
12643 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12644 // TODO: We should allow undef elements here
12645 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12646 return CI->getValue().isPowerOf2();
12647 return false;
12648 });
12649 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12650 // TODO: We should allow undef elements here
12651 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12652 return CI->getValue().isNegatedPowerOf2();
12653 return false;
12654 });
12655
12656 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12657 if (IsConstant && IsUniform)
12658 VK = TTI::OK_UniformConstantValue;
12659 else if (IsConstant)
12660 VK = TTI::OK_NonUniformConstantValue;
12661 else if (IsUniform)
12662 VK = TTI::OK_UniformValue;
12663
12664 TTI::OperandValueProperties VP = TTI::OP_None;
12665 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12666 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12667
12668 return {.Kind: VK, .Properties: VP};
12669}
12670
12671namespace {
12672/// The base class for shuffle instruction emission and shuffle cost estimation.
12673class BaseShuffleAnalysis {
12674protected:
12675 Type *ScalarTy = nullptr;
12676
12677 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
12678
12679 /// V is expected to be a vectorized value.
12680 /// When REVEC is disabled, there is no difference between VF and
12681 /// VNumElements.
12682 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
12683 /// e.g., if ScalarTy is <4 x Ty> and V is <8 x Ty>, 2 is returned instead
12684 /// of 8.
12685 unsigned getVF(Value *V) const {
12686 assert(V && "V cannot be nullptr");
12687 assert(isa<FixedVectorType>(V->getType()) &&
12688 "V does not have FixedVectorType");
12689 assert(ScalarTy && "ScalarTy cannot be nullptr");
12690 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
12691 unsigned VNumElements =
12692 cast<FixedVectorType>(Val: V->getType())->getNumElements();
12693 assert(VNumElements > ScalarTyNumElements &&
12694 "the number of elements of V is not large enough");
12695 assert(VNumElements % ScalarTyNumElements == 0 &&
12696 "the number of elements of V is not a vectorized value");
12697 return VNumElements / ScalarTyNumElements;
12698 }
12699
12700 /// Checks if the mask is an identity mask.
12701 /// \param IsStrict if true, the function returns false if the mask size
12702 /// does not match the vector size.
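  /// Illustrative example (added note): for a <4 x Ty> vector the mask
  /// <0, 1, 2, 3> is an identity; with \p IsStrict == false the longer mask
  /// <0, 1, 2, 3, poison, poison, poison, poison> is also accepted, since
  /// every VF-sized submask is either an identity or all-poison.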
12703 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
12704 bool IsStrict) {
12705 int Limit = Mask.size();
12706 int VF = VecTy->getNumElements();
12707 int Index = -1;
12708 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
12709 return true;
12710 if (!IsStrict) {
12711 // Consider extract subvector starting from index 0.
12712 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
12713 Index == 0)
12714 return true;
12715 // All VF-size submasks are identity (e.g.
12716 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
12717 if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
12718 ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
12719 return all_of(Range&: Slice, P: equal_to(Arg: PoisonMaskElem)) ||
12720 ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
12721 }))
12722 return true;
12723 }
12724 return false;
12725 }
12726
12727 /// Tries to combine 2 different masks into a single one.
12728 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
12729 /// change the size of the vector, \p LocalVF is the original size of the
12730 /// shuffled vector.
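  /// Illustrative example (added note): combining Mask = <1, 0> (LocalVF = 2)
  /// with ExtMask = <0, 1, 0, 1> produces Mask = <1, 0, 1, 0>, i.e. the outer
  /// permutation is folded into the inner one.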
12731 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
12732 ArrayRef<int> ExtMask) {
12733 unsigned VF = Mask.size();
12734 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
12735 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
12736 if (ExtMask[I] == PoisonMaskElem)
12737 continue;
12738 int MaskedIdx = Mask[ExtMask[I] % VF];
12739 NewMask[I] =
12740 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
12741 }
12742 Mask.swap(RHS&: NewMask);
12743 }
12744
12745 /// Looks through shuffles trying to reduce the final number of shuffles in
12746 /// the code. The function looks through the previously emitted shuffle
12747 /// instructions and properly marks indices in the mask as undef.
12748 /// For example, given the code
12749 /// \code
12750 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
12751 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
12752 /// \endcode
12753 /// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
12754 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12755 /// <0, 1, 2, 3> for the shuffle.
12756 /// If 2 operands are of different size, the smallest one will be resized and
12757 /// the mask recalculated properly.
12758 /// For example, given the code
12759 /// \code
12760 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
12761 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
12762 /// \endcode
12763 /// and a request to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
12764 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
12765 /// <0, 1, 2, 3> for the shuffle.
12766 /// So, it tries to transform permutations to simple vector merge, if
12767 /// possible.
12768 /// \param V The input vector which must be shuffled using the given \p Mask.
12769 /// If the better candidate is found, \p V is set to this best candidate
12770 /// vector.
12771 /// \param Mask The input mask for the shuffle. If the best candidate is found
12772 /// during looking-through-shuffles attempt, it is updated accordingly.
12773 /// \param SinglePermute true if the shuffle operation is originally a
12774 /// single-value-permutation. In this case the look-through-shuffles procedure
12775 /// may look for resizing shuffles as the best candidates.
12776 /// \return true if the shuffle results in the non-resizing identity shuffle
12777 /// (and thus can be ignored), false - otherwise.
12778 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
12779 bool SinglePermute) {
12780 Value *Op = V;
12781 ShuffleVectorInst *IdentityOp = nullptr;
12782 SmallVector<int> IdentityMask;
12783 while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
12784 // Exit if not a fixed vector type or changing size shuffle.
12785 auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
12786 if (!SVTy)
12787 break;
12788 // Remember the identity or broadcast mask, if it is not a resizing
12789 // shuffle. If no better candidates are found, this Op and Mask will be
12790 // used in the final shuffle.
12791 if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
12792 if (!IdentityOp || !SinglePermute ||
12793 (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
12794 !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
12795 NumSrcElts: IdentityMask.size()))) {
12796 IdentityOp = SV;
12797 // Store the current mask in IdentityMask so that we do not lose this
12798 // info if IdentityOp is selected as the best candidate for the
12799 // permutation.
12800 IdentityMask.assign(RHS: Mask);
12801 }
12802 }
12803 // Remember the broadcast mask. If no better candidates are found, this Op
12804 // and Mask will be used in the final shuffle.
12805 // Zero splat can be used as identity too, since it might be used with
12806 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
12807 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
12808 // is expensive, and the analysis finds out that the source vector is just
12809 // a broadcast, the original mask can be transformed to the identity mask
12810 // <0, 1, 2, 3>.
12811 // \code
12812 // %0 = shuffle %v, poison, zeroinitializer
12813 // %res = shuffle %0, poison, <3, 1, 2, 0>
12814 // \endcode
12815 // may be transformed to
12816 // \code
12817 // %0 = shuffle %v, poison, zeroinitializer
12818 // %res = shuffle %0, poison, <0, 1, 2, 3>
12819 // \endcode
12820 if (SV->isZeroEltSplat()) {
12821 IdentityOp = SV;
12822 IdentityMask.assign(RHS: Mask);
12823 }
12824 int LocalVF = Mask.size();
12825 if (auto *SVOpTy =
12826 dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
12827 LocalVF = SVOpTy->getNumElements();
12828 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
12829 for (auto [Idx, I] : enumerate(First&: Mask)) {
12830 if (I == PoisonMaskElem ||
12831 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
12832 continue;
12833 ExtMask[Idx] = SV->getMaskValue(Elt: I);
12834 }
12835 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
12836 V: SV->getOperand(i_nocapture: 0),
12837 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
12838 .all();
12839 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
12840 V: SV->getOperand(i_nocapture: 1),
12841 UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
12842 .all();
12843 if (!IsOp1Undef && !IsOp2Undef) {
12844 // Update mask and mark undef elems.
12845 for (int &I : Mask) {
12846 if (I == PoisonMaskElem)
12847 continue;
12848 if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
12849 PoisonMaskElem)
12850 I = PoisonMaskElem;
12851 }
12852 break;
12853 }
12854 SmallVector<int> ShuffleMask(SV->getShuffleMask());
12855 combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
12856 Mask.swap(RHS&: ShuffleMask);
12857 if (IsOp2Undef)
12858 Op = SV->getOperand(i_nocapture: 0);
12859 else
12860 Op = SV->getOperand(i_nocapture: 1);
12861 }
12862 if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
12863 !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
12864 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
12865 if (IdentityOp) {
12866 V = IdentityOp;
12867 assert(Mask.size() == IdentityMask.size() &&
12868 "Expected masks of same sizes.");
12869 // Clear known poison elements.
12870 for (auto [I, Idx] : enumerate(First&: Mask))
12871 if (Idx == PoisonMaskElem)
12872 IdentityMask[I] = PoisonMaskElem;
12873 Mask.swap(RHS&: IdentityMask);
12874 auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
12875 return SinglePermute &&
12876 (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
12877 /*IsStrict=*/true) ||
12878 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
12879 Shuffle->isZeroEltSplat() &&
12880 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size()) &&
12881 all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) {
12882 return P.value() == PoisonMaskElem ||
12883 Shuffle->getShuffleMask()[P.index()] == 0;
12884 })));
12885 }
12886 V = Op;
12887 return false;
12888 }
12889 V = Op;
12890 return true;
12891 }
12892
12893 /// Smart shuffle instruction emission, walks through shuffle trees and
12894 /// tries to find the best matching vector for the actual shuffle
12895 /// instruction.
12896 template <typename T, typename ShuffleBuilderTy>
12897 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
12898 ShuffleBuilderTy &Builder, Type *ScalarTy) {
12899 assert(V1 && "Expected at least one vector value.");
12900 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
12901 SmallVector<int> NewMask(Mask);
12902 if (ScalarTyNumElements != 1) {
12903 assert(SLPReVec && "FixedVectorType is not expected.");
12904 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask&: NewMask);
12905 Mask = NewMask;
12906 }
12907 if (V2)
12908 Builder.resizeToMatch(V1, V2);
12909 int VF = Mask.size();
12910 if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
12911 VF = FTy->getNumElements();
12912 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
12913 V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg))
12914 .all()) {
12915 // Peek through shuffles.
12916 Value *Op1 = V1;
12917 Value *Op2 = V2;
12918 int VF =
12919 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
12920 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
12921 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
12922 for (int I = 0, E = Mask.size(); I < E; ++I) {
12923 if (Mask[I] < VF)
12924 CombinedMask1[I] = Mask[I];
12925 else
12926 CombinedMask2[I] = Mask[I] - VF;
12927 }
12928 Value *PrevOp1;
12929 Value *PrevOp2;
12930 do {
12931 PrevOp1 = Op1;
12932 PrevOp2 = Op2;
12933 (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
12934 (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
12935 // Check if we have 2 resizing shuffles - need to peek through operands
12936 // again.
12937 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
12938 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
12939 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
12940 for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
12941 if (I == PoisonMaskElem)
12942 continue;
12943 ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
12944 }
12945 SmallBitVector UseMask1 = buildUseMask(
12946 VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
12947 ->getNumElements(),
12948 Mask: ExtMask1, MaskArg: UseMask::SecondArg);
12949 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
12950 for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
12951 if (I == PoisonMaskElem)
12952 continue;
12953 ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
12954 }
12955 SmallBitVector UseMask2 = buildUseMask(
12956 VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
12957 ->getNumElements(),
12958 Mask: ExtMask2, MaskArg: UseMask::SecondArg);
12959 if (SV1->getOperand(i_nocapture: 0)->getType() ==
12960 SV2->getOperand(i_nocapture: 0)->getType() &&
12961 SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
12962 isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
12963 isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
12964 Op1 = SV1->getOperand(i_nocapture: 0);
12965 Op2 = SV2->getOperand(i_nocapture: 0);
12966 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
12967 int LocalVF = ShuffleMask1.size();
12968 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
12969 LocalVF = FTy->getNumElements();
12970 combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
12971 CombinedMask1.swap(RHS&: ShuffleMask1);
12972 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
12973 LocalVF = ShuffleMask2.size();
12974 if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
12975 LocalVF = FTy->getNumElements();
12976 combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
12977 CombinedMask2.swap(RHS&: ShuffleMask2);
12978 }
12979 }
12980 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
12981 Builder.resizeToMatch(Op1, Op2);
12982 VF = std::max(a: cast<VectorType>(Val: Op1->getType())
12983 ->getElementCount()
12984 .getKnownMinValue(),
12985 b: cast<VectorType>(Val: Op2->getType())
12986 ->getElementCount()
12987 .getKnownMinValue());
12988 for (int I = 0, E = Mask.size(); I < E; ++I) {
12989 if (CombinedMask2[I] != PoisonMaskElem) {
12990 assert(CombinedMask1[I] == PoisonMaskElem &&
12991 "Expected undefined mask element");
12992 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
12993 }
12994 }
12995 if (Op1 == Op2 &&
12996 (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
12997 (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
12998 isa<ShuffleVectorInst>(Val: Op1) &&
12999 cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
13000 ArrayRef(CombinedMask1))))
13001 return Builder.createIdentity(Op1);
13002 return Builder.createShuffleVector(
13003 Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
13004 CombinedMask1);
13005 }
13006 if (isa<PoisonValue>(Val: V1))
13007 return Builder.createPoison(
13008 cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
13009 bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
13010 assert(V1 && "Expected non-null value after looking through shuffles.");
13011
13012 if (!IsIdentity)
13013 return Builder.createShuffleVector(V1, NewMask);
13014 return Builder.createIdentity(V1);
13015 }
13016
13017 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
13018 /// shuffle emission.
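  /// Illustrative example (added note): with a CommonMask of size 4 and
  /// Mask = <poison, 5, poison, 2>, lanes 1 and 3 are rewritten to their own
  /// indices, giving CommonMask = <CommonMask[0], 1, CommonMask[2], 3>.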
13019 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
13020 ArrayRef<int> Mask) {
13021 for (unsigned I : seq<unsigned>(Size: CommonMask.size()))
13022 if (Mask[I] != PoisonMaskElem)
13023 CommonMask[I] = I;
13024 }
13025};
13026} // namespace
13027
13028/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
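/// Illustrative sketch (added note): for four consecutive loads from
/// %base + 0..3, the scalar side is costed as a unit-stride pointer chain,
/// while the vector side only pays for the pointers that must stay around
/// after the scalar loads are replaced by a single wide load from \p BasePtr.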
13029static std::pair<InstructionCost, InstructionCost>
13030getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
13031 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
13032 Type *ScalarTy, VectorType *VecTy) {
13033 InstructionCost ScalarCost = 0;
13034 InstructionCost VecCost = 0;
13035 // Here we differentiate two cases: (1) when Ptrs represent a regular
13036 // vectorization tree node (as they are pointer arguments of scattered
13037 // loads) or (2) when Ptrs are the arguments of loads or stores being
13038 // vectorized as plain wide unit-stride load/store since all the
13039 // loads/stores are known to be from/to adjacent locations.
13040 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
13041 // Case 2: estimate costs for pointer related costs when vectorizing to
13042 // a wide load/store.
13043 // Scalar cost is estimated as a set of pointers with known relationship
13044 // between them.
13045 // For vector code we will use BasePtr as argument for the wide load/store
13046 // but we also need to account all the instructions which are going to
13047 // stay in vectorized code due to uses outside of these scalar
13048 // loads/stores.
13049 ScalarCost = TTI.getPointersChainCost(
13050 Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
13051 CostKind);
13052
13053 SmallVector<const Value *> PtrsRetainedInVecCode;
13054 for (Value *V : Ptrs) {
13055 if (V == BasePtr) {
13056 PtrsRetainedInVecCode.push_back(Elt: V);
13057 continue;
13058 }
13059 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
13060 // For simplicity, assume Ptr stays in vectorized code if it's not a
13061 // GEP instruction. We don't care, since its cost is considered free.
13062 // TODO: We should check for any uses outside of vectorizable tree
13063 // rather than just single use.
13064 if (!Ptr || !Ptr->hasOneUse())
13065 PtrsRetainedInVecCode.push_back(Elt: V);
13066 }
13067
13068 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
13069 // If all pointers stay in vectorized code then we don't have
13070 // any savings on that.
13071 return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
13072 }
13073 VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
13074 Info: TTI::PointersChainInfo::getKnownStride(),
13075 AccessTy: VecTy, CostKind);
13076 } else {
13077 // Case 1: Ptrs are the arguments of loads that we are going to transform
13078 // into masked gather load intrinsic.
13079 // All the scalar GEPs will be removed as a result of vectorization.
13080 // For any external uses of some lanes, extractelement instructions will
13081 // be generated (whose cost is estimated separately).
13082 TTI::PointersChainInfo PtrsInfo =
13083 all_of(Range&: Ptrs,
13084 P: [](const Value *V) {
13085 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
13086 return Ptr && !Ptr->hasAllConstantIndices();
13087 })
13088 ? TTI::PointersChainInfo::getUnknownStride()
13089 : TTI::PointersChainInfo::getKnownStride();
13090
13091 ScalarCost =
13092 TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
13093 auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
13094 if (!BaseGEP) {
13095 auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
13096 if (It != Ptrs.end())
13097 BaseGEP = cast<GEPOperator>(Val: *It);
13098 }
13099 if (BaseGEP) {
13100 SmallVector<const Value *> Indices(BaseGEP->indices());
13101 VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
13102 Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
13103 CostKind);
13104 }
13105 }
13106
13107 return std::make_pair(x&: ScalarCost, y&: VecCost);
13108}
13109
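/// Added illustrative note (non-authoritative sketch): this routine clusters
/// related scalars (e.g. loads from the same underlying pointer) next to each
/// other and records the permutation in ReorderIndices, then keeps or drops
/// the recorded order based on a comparison between the shuffle-based cost
/// and the cost of a plain buildvector.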
13110void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
13111 assert(TE.isGather() && TE.ReorderIndices.empty() &&
13112 "Expected gather node without reordering.");
13113 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
13114 SmallSet<size_t, 2> LoadKeyUsed;
13115
13116 // Do not reorder the node if it is small (just 2 elements), all-constant,
13117 // or all instructions already have the same opcode.
13118 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
13119 all_of(Range&: TE.Scalars, P: isConstant))
13120 return;
13121
13122 if (any_of(Range: seq<unsigned>(Size: TE.Idx), P: [&](unsigned Idx) {
13123 return VectorizableTree[Idx]->isSame(VL: TE.Scalars);
13124 }))
13125 return;
13126
13127 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
13128 Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
13129 Value *Ptr =
13130 getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
13131 if (LoadKeyUsed.contains(V: Key)) {
13132 auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
13133 if (LIt != LoadsMap.end()) {
13134 for (LoadInst *RLI : LIt->second) {
13135 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
13136 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL: *DL, SE&: *SE,
13137 /*StrictCheck=*/true))
13138 return hash_value(ptr: RLI->getPointerOperand());
13139 }
13140 for (LoadInst *RLI : LIt->second) {
13141 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
13142 Ptr2: LI->getPointerOperand(), TLI: *TLI)) {
13143 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
13144 return SubKey;
13145 }
13146 }
13147 if (LIt->second.size() > 2) {
13148 hash_code SubKey =
13149 hash_value(ptr: LIt->second.back()->getPointerOperand());
13150 return SubKey;
13151 }
13152 }
13153 }
13154 LoadKeyUsed.insert(V: Key);
13155 LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)).first->second.push_back(Elt: LI);
13156 return hash_value(ptr: LI->getPointerOperand());
13157 };
13158 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
13159 SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
13160 bool IsOrdered = true;
13161 unsigned NumInstructions = 0;
13162 // Try to "cluster" scalar instructions, to be able to build extra vectorized
13163 // nodes.
13164 for (auto [I, V] : enumerate(First&: TE.Scalars)) {
13165 size_t Key = 1, Idx = 1;
13166 if (auto *Inst = dyn_cast<Instruction>(Val: V);
13167 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(Val: V) &&
13168 !isDeleted(I: Inst) && !isVectorized(V)) {
13169 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
13170 /*AllowAlternate=*/false);
13171 ++NumInstructions;
13172 }
13173 auto &Container = SortedValues[Key];
13174 if (IsOrdered && !KeyToIndex.contains(Val: V) &&
13175 !(isa<Constant, ExtractElementInst>(Val: V) ||
13176 isVectorLikeInstWithConstOps(V)) &&
13177 ((Container.contains(Key: Idx) &&
13178 KeyToIndex.at(Val: Container[Idx].back()).back() != I - 1) ||
13179 (!Container.empty() && !Container.contains(Key: Idx) &&
13180 KeyToIndex.at(Val: Container.back().second.back()).back() != I - 1)))
13181 IsOrdered = false;
13182 auto &KTI = KeyToIndex[V];
13183 if (KTI.empty())
13184 Container[Idx].push_back(Elt: V);
13185 KTI.push_back(Elt: I);
13186 }
13187 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
13188 APInt DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
13189 if (!IsOrdered && NumInstructions > 1) {
13190 unsigned Cnt = 0;
13191 TE.ReorderIndices.resize(N: TE.Scalars.size(), NV: TE.Scalars.size());
13192 for (const auto &D : SortedValues) {
13193 for (const auto &P : D.second) {
13194 unsigned Sz = 0;
13195 for (Value *V : P.second) {
13196 ArrayRef<unsigned> Indices = KeyToIndex.at(Val: V);
13197 for (auto [K, Idx] : enumerate(First&: Indices)) {
13198 TE.ReorderIndices[Cnt + K] = Idx;
13199 TE.Scalars[Cnt + K] = V;
13200 }
13201 Sz += Indices.size();
13202 Cnt += Indices.size();
13203 }
13204 if (Sz > 1 && isa<Instruction>(Val: P.second.front())) {
13205 const unsigned SubVF = getFloorFullVectorNumberOfElements(
13206 TTI: *TTI, Ty: TE.Scalars.front()->getType(), Sz);
13207 SubVectors.emplace_back(Args: Cnt - Sz, Args: SubVF);
13208 for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt - Sz + SubVF))
13209 DemandedElts.clearBit(BitPosition: I);
13210 } else if (!P.second.empty() && isConstant(V: P.second.front())) {
13211 for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt))
13212 DemandedElts.clearBit(BitPosition: I);
13213 }
13214 }
13215 }
13216 }
13217 // Reuses always require shuffles, so consider it as profitable.
13218 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
13219 return;
13220 // Do simple cost estimation.
13221 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13222 InstructionCost Cost = 0;
13223 auto *ScalarTy = TE.Scalars.front()->getType();
13224 auto *VecTy = getWidenedType(ScalarTy, VF: TE.Scalars.size());
13225 for (auto [Idx, Sz] : SubVectors) {
13226 Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind,
13227 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: Sz));
13228 }
13229 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
13230 /*Insert=*/true,
13231 /*Extract=*/false, CostKind);
13232 int Sz = TE.Scalars.size();
13233 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
13234 TE.ReorderIndices.end());
13235 for (unsigned I : seq<unsigned>(Size: Sz)) {
13236 Value *V = TE.getOrdered(Idx: I);
13237 if (isa<PoisonValue>(Val: V)) {
13238 ReorderMask[I] = PoisonMaskElem;
13239 } else if (isConstant(V) || DemandedElts[I]) {
13240 ReorderMask[I] = I + TE.ReorderIndices.size();
13241 }
13242 }
13243 Cost += ::getShuffleCost(TTI: *TTI,
13244 Kind: any_of(Range&: ReorderMask, P: [&](int I) { return I >= Sz; })
13245 ? TTI::SK_PermuteTwoSrc
13246 : TTI::SK_PermuteSingleSrc,
13247 Tp: VecTy, Mask: ReorderMask);
13248 DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
13249 ReorderMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
13250 for (unsigned I : seq<unsigned>(Size: Sz)) {
13251 Value *V = TE.getOrdered(Idx: I);
13252 if (isConstant(V)) {
13253 DemandedElts.clearBit(BitPosition: I);
13254 if (!isa<PoisonValue>(Val: V))
13255 ReorderMask[I] = I;
13256 } else {
13257 ReorderMask[I] = I + Sz;
13258 }
13259 }
13260 InstructionCost BVCost =
13261 getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
13262 /*Insert=*/true, /*Extract=*/false, CostKind);
13263 if (!DemandedElts.isAllOnes())
13264 BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: ReorderMask);
13265 if (Cost >= BVCost) {
13266 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
13267 reorderScalars(Scalars&: TE.Scalars, Mask);
13268 TE.ReorderIndices.clear();
13269 }
13270}
13271
13272 /// Check if we can convert an fadd/fsub sequence to FMA (fmuladd).
13273 /// \returns Cost of the FMA if the conversion is possible, invalid cost otherwise.
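/// Illustrative example (added note): a contractable pair such as
/// \code
///   %m = fmul contract float %a, %b
///   %r = fadd contract float %m, %c
/// \endcode
/// may be costed as a single llvm.fmuladd(%a, %b, %c); the function returns
/// that FMA cost only when it is cheaper than the separate fmul + fadd costs.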
13274static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
13275 const InstructionsState &S,
13276 DominatorTree &DT, const DataLayout &DL,
13277 TargetTransformInfo &TTI,
13278 const TargetLibraryInfo &TLI) {
13279 assert(all_of(VL,
13280 [](Value *V) {
13281 return V->getType()->getScalarType()->isFloatingPointTy();
13282 }) &&
13283 "Can only convert to FMA for floating point types");
13284 assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
13285
13286 auto CheckForContractable = [&](ArrayRef<Value *> VL) {
13287 FastMathFlags FMF;
13288 FMF.set();
13289 for (Value *V : VL) {
13290 auto *I = dyn_cast<Instruction>(Val: V);
13291 if (!I)
13292 continue;
13293 if (S.isCopyableElement(V: I))
13294 continue;
13295 Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
13296 if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
13297 continue;
13298 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
13299 FMF &= FPCI->getFastMathFlags();
13300 }
13301 return FMF.allowContract();
13302 };
13303 if (!CheckForContractable(VL))
13304 return InstructionCost::getInvalid();
13305 // The fmul operand should also be contractable.
13306 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
13307 SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
13308
13309 InstructionsState OpS = getSameOpcode(VL: Operands.front(), TLI);
13310 if (!OpS.valid())
13311 return InstructionCost::getInvalid();
13312
13313 if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
13314 return InstructionCost::getInvalid();
13315 if (!CheckForContractable(Operands.front()))
13316 return InstructionCost::getInvalid();
13317 // Compare the costs.
13318 InstructionCost FMulPlusFAddCost = 0;
13319 InstructionCost FMACost = 0;
13320 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13321 FastMathFlags FMF;
13322 FMF.set();
13323 for (Value *V : VL) {
13324 auto *I = dyn_cast<Instruction>(Val: V);
13325 if (!I)
13326 continue;
13327 if (!S.isCopyableElement(V: I))
13328 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
13329 FMF &= FPCI->getFastMathFlags();
13330 FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
13331 }
13332 unsigned NumOps = 0;
13333 for (auto [V, Op] : zip(t&: VL, u&: Operands.front())) {
13334 if (S.isCopyableElement(V))
13335 continue;
13336 auto *I = dyn_cast<Instruction>(Val: Op);
13337 if (!I || !I->hasOneUse() || OpS.isCopyableElement(V: I)) {
13338 if (auto *OpI = dyn_cast<Instruction>(Val: V))
13339 FMACost += TTI.getInstructionCost(U: OpI, CostKind);
13340 if (I)
13341 FMACost += TTI.getInstructionCost(U: I, CostKind);
13342 continue;
13343 }
13344 ++NumOps;
13345 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
13346 FMF &= FPCI->getFastMathFlags();
13347 FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
13348 }
13349 Type *Ty = VL.front()->getType();
13350 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
13351 FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
13352 return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
13353}
13354
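/// Added illustrative sketch (non-authoritative): this matcher recognizes an
/// or-reduction over lanes of the form
/// \code
///   shl (zext i<Stride> %lane to i<Sz>), (LaneIndex * Stride)
/// \endcode
/// which assembles a wide integer from narrow pieces. Such a pattern may be
/// cheaper as a bitcast of the narrow vector to the wide scalar type, plus a
/// shuffle for the recorded \p Order, or a bswap when the byte lanes are
/// exactly reversed; the function compares both cost models and reports
/// whether the bitcast form wins.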
13355bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
13356 bool &IsBSwap) const {
13357 assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
13358 "Expected Shl node.");
13359 IsBSwap = false;
13360 if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
13361 !TE.ReuseShuffleIndices.empty() || MinBWs.contains(Val: &TE) ||
13362 any_of(Range: TE.Scalars, P: [](Value *V) { return !V->hasOneUse(); }))
13363 return false;
13364 Type *ScalarTy = TE.getMainOp()->getType();
13365 // TODO: Check if same can be done for the vector types.
13366 if (!ScalarTy->isIntegerTy())
13367 return false;
13368 if (ScalarTy->isVectorTy())
13369 return false;
13370 const unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
13371 if (!isPowerOf2_64(Value: Sz))
13372 return false;
13373 const TreeEntry *LhsTE = getOperandEntry(E: &TE, /*Idx=*/0);
13374 const TreeEntry *RhsTE = getOperandEntry(E: &TE, /*Idx=*/1);
13375 // Lhs should be zext i<stride> to I<sz>.
13376 if (!(LhsTE->State == TreeEntry::Vectorize &&
13377 LhsTE->getOpcode() == Instruction::ZExt &&
13378 LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
13379 !MinBWs.contains(Val: LhsTE) &&
13380 all_of(Range: LhsTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })))
13381 return false;
13382 Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
13383 unsigned Stride = DL->getTypeSizeInBits(Ty: SrcScalarTy);
13384 if (!isPowerOf2_64(Value: Stride) || Stride >= Sz)
13385 return false;
13386 if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
13387 RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(Val: RhsTE)))
13388 return false;
13389 Order.clear();
13390 unsigned CurrentValue = 0;
13391 // Rhs should be (0, Stride, 2 * Stride, ..., Sz-Stride).
13392 if (all_of(Range: RhsTE->Scalars,
13393 P: [&](Value *V) {
13394 CurrentValue += Stride;
13395 if (isa<UndefValue>(Val: V))
13396 return true;
13397 auto *C = dyn_cast<Constant>(Val: V);
13398 if (!C)
13399 return false;
13400 return C->getUniqueInteger() == CurrentValue - Stride;
13401 }) &&
13402 CurrentValue == Sz) {
13403 Order.clear();
13404 } else {
13405 const unsigned VF = RhsTE->getVectorFactor();
13406 Order.assign(NumElts: VF, Elt: VF);
13407 // Check if we need to reorder Rhs to put it in the form (0, Stride,
13408 // 2 * Stride, ..., Sz-Stride).
13409 if (VF * Stride != Sz)
13410 return false;
13411 for (const auto [Idx, V] : enumerate(First: RhsTE->Scalars)) {
13412 if (isa<UndefValue>(Val: V))
13413 continue;
13414 auto *C = dyn_cast<Constant>(Val: V);
13415 if (!C)
13416 return false;
13417 const APInt &Val = C->getUniqueInteger();
13418 if (Val.isNegative() || Val.uge(RHS: Sz) || Val.getZExtValue() % Stride != 0)
13419 return false;
13420 unsigned Pos = Val.getZExtValue() / Stride;
13421 // TODO: Support Pos >= VF, in this case need to shift the final value.
13422 if (Order[Idx] != VF || Pos >= VF)
13423 return false;
13424 Order[Idx] = Pos;
13425 }
13426 // One of the indices is not set - exit.
13427 if (is_contained(Range&: Order, Element: VF))
13428 return false;
13429 }
13430 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13431 FastMathFlags FMF;
13432 SmallPtrSet<Value *, 4> CheckedExtracts;
13433 auto *VecTy = getWidenedType(ScalarTy, VF: TE.getVectorFactor());
13434 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
13435 TTI::CastContextHint CastCtx =
13436 getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
13437 InstructionCost VecCost =
13438 TTI->getArithmeticReductionCost(Opcode: Instruction::Or, Ty: VecTy, FMF, CostKind) +
13439 TTI->getArithmeticInstrCost(Opcode: Instruction::Shl, Ty: VecTy, CostKind,
13440 Opd1Info: getOperandInfo(Ops: LhsTE->Scalars)) +
13441 TTI->getCastInstrCost(
13442 Opcode: Instruction::ZExt, Dst: VecTy,
13443 Src: getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor()), CCH: CastCtx,
13444 CostKind);
13445 InstructionCost BitcastCost = TTI->getCastInstrCost(
13446 Opcode: Instruction::BitCast, Dst: ScalarTy, Src: SrcVecTy, CCH: CastCtx, CostKind);
13447 if (!Order.empty()) {
13448 fixupOrderingIndices(Order);
13449 SmallVector<int> Mask;
13450 inversePermutation(Indices: Order, Mask);
13451 BitcastCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SrcVecTy,
13452 Mask, CostKind);
13453 }
13454 // Check if the combination can be modeled as a bitcast+byteswap operation.
13455 constexpr unsigned ByteSize = 8;
13456 if (!Order.empty() && isReverseOrder(Order) &&
13457 DL->getTypeSizeInBits(Ty: SrcScalarTy) == ByteSize) {
13458 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, ScalarTy, {ScalarTy});
13459 InstructionCost BSwapCost =
13460 TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: ScalarTy, Src: SrcVecTy, CCH: CastCtx,
13461 CostKind) +
13462 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
13463 if (BSwapCost <= BitcastCost) {
13464 BitcastCost = BSwapCost;
13465 IsBSwap = true;
13466 }
13467 }
13468 return BitcastCost < VecCost;
13469}
13470
13471void BoUpSLP::transformNodes() {
13472 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13473 BaseGraphSize = VectorizableTree.size();
13474 // Turn graph transforming mode on; turn it off when done.
13475 class GraphTransformModeRAAI {
13476 bool &SavedIsGraphTransformMode;
13477
13478 public:
13479 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13480 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13481 IsGraphTransformMode = true;
13482 }
13483 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13484 } TransformContext(IsGraphTransformMode);
13485 // Operands are profitable if they are:
13486 // 1. At least one constant
13487 // or
13488 // 2. Splats
13489 // or
13490 // 3. Result in a good vectorization opportunity, i.e. may generate vector
13491 // nodes and reduce the cost of the graph.
13492 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13493 const InstructionsState &S) {
13494 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13495 for (unsigned Op : seq<unsigned>(Size: S.getMainOp()->getNumOperands()))
13496 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
13497 Args: I2->getOperand(i: Op));
13498 return all_of(
13499 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13500 return all_of(Range&: Cand,
13501 P: [](const std::pair<Value *, Value *> &P) {
13502 return isa<Constant>(Val: P.first) ||
13503 isa<Constant>(Val: P.second) || P.first == P.second;
13504 }) ||
13505 findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplatLoads);
13506 });
13507 };
13508
13509 // Try to reorder gather nodes for better vectorization opportunities.
13510 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
13511 TreeEntry &E = *VectorizableTree[Idx];
13512 if (E.isGather())
13513 reorderGatherNode(TE&: E);
13514 }
13515
13516 // It is better to use the full gathered-loads analysis if there are only 2
13517 // gathered load nodes, each having fewer than 16 elements.
13518 constexpr unsigned VFLimit = 16;
13519 bool ForceLoadGather =
13520 count_if(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
13521 return TE->isGather() && TE->hasState() &&
13522 TE->getOpcode() == Instruction::Load &&
13523 TE->getVectorFactor() < VFLimit;
13524 }) == 2;
13525
13526 // Checks if the scalars are used in another node.
13527 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13528 function_ref<bool(Value *)> CheckContainer) {
13529 return TE->isSame(VL) || all_of(Range&: VL, P: [&](Value *V) {
13530 if (isa<PoisonValue>(Val: V))
13531 return true;
13532 auto *I = dyn_cast<Instruction>(Val: V);
13533 if (!I)
13534 return false;
13535 return is_contained(Range: TE->Scalars, Element: I) || CheckContainer(I);
13536 });
13537 };
13538 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13539 if (E.hasState()) {
13540 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: E.getMainOp());
13541 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13542 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13543 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13544 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13545 return is_contained(Range&: TEs, Element: TE);
13546 });
13547 });
13548 }))
13549 return true;
13551 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: E.getMainOp());
13552 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13553 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13554 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13555 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13556 return is_contained(Range&: TEs, Element: TE);
13557 });
13558 });
13559 }))
13560 return true;
13561 } else {
13562 // Check if the gather node is a full copy of a split node.
13563 auto *It = find_if(Range: E.Scalars, P: IsaPred<Instruction>);
13564 if (It != E.Scalars.end()) {
13565 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: *It);
13566 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13567 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13568 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13569 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13570 return is_contained(Range&: TEs, Element: TE);
13571 });
13572 });
13573 }))
13574 return true;
13575 }
13576 }
13577 return false;
13578 };
13579 // The tree may grow here, so iterate over the nodes built before.
13580 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
13581 TreeEntry &E = *VectorizableTree[Idx];
13582 if (E.isGather()) {
13583 ArrayRef<Value *> VL = E.Scalars;
13584 const unsigned Sz = getVectorElementSize(V: VL.front());
13585 unsigned MinVF = getMinVF(Sz: 2 * Sz);
13586 // Do not try partial vectorization for small nodes (<= 2 elements), nodes
13587 // with the same opcode and same parent block, or all constants.
13588 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(key: Idx) ||
13589 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13590 // We use allSameOpcode instead of isAltShuffle because we don't
13591 // want to use interchangeable instructions here.
13592 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13593 allConstant(VL) || isSplat(VL))
13594 continue;
13595 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13596 continue;
13597 // Check if the node is a copy of other vector nodes.
13598 if (CheckForSameVectorNodes(E))
13599 continue;
13600 // Try to find vectorizable sequences and transform them into a series of
13601 // insertvector instructions.
13602 unsigned StartIdx = 0;
13603 unsigned End = VL.size();
13604 SmallBitVector Processed(End);
13605 for (unsigned VF = getFloorFullVectorNumberOfElements(
13606 TTI: *TTI, Ty: VL.front()->getType(), Sz: VL.size() - 1);
13607 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13608 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
13609 if (StartIdx + VF > End)
13610 continue;
13611 SmallVector<std::pair<unsigned, unsigned>> Slices;
13612 bool AllStrided = true;
13613 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13614 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
13615 // If any instruction is vectorized already - do not try again.
13616 // Reuse the existing node, if it fully matches the slice.
13617 if ((Processed.test(Idx: Cnt) || isVectorized(V: Slice.front())) &&
13618 !getSameValuesTreeEntry(V: Slice.front(), VL: Slice, /*SameVF=*/true))
13619 continue;
13620 // Constants are already handled effectively - skip.
13621 if (allConstant(VL: Slice))
13622 continue;
13623 // Do not try to vectorize small splats (smaller than a vector register
13624 // and with only a single non-undef element).
13625 bool IsSplat = isSplat(VL: Slice);
13626 bool IsTwoRegisterSplat = true;
13627 if (IsSplat && VF == 2) {
13628 unsigned NumRegs2VF = ::getNumberOfParts(
13629 TTI: *TTI, VecTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF: 2 * VF));
13630 IsTwoRegisterSplat = NumRegs2VF == 2;
13631 }
13632 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13633 count(Range&: Slice, Element: Slice.front()) ==
13634 static_cast<long>(isa<UndefValue>(Val: Slice.front()) ? VF - 1
13635 : 1)) {
13636 if (IsSplat)
13637 continue;
13638 InstructionsState S = getSameOpcode(VL: Slice, TLI: *TLI);
13639 if (!S || !allSameOpcode(VL: Slice) || !allSameBlock(VL: Slice) ||
13640 (S.getOpcode() == Instruction::Load &&
13641 areKnownNonVectorizableLoads(VL: Slice)) ||
13642 (S.getOpcode() != Instruction::Load &&
13643 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), Sz: VF)))
13644 continue;
13645 if (VF == 2) {
13646 // Try to vectorize reduced values or if all users are vectorized.
13647 // For expensive instructions extra extracts might be profitable.
13648 if ((!UserIgnoreList || E.Idx != 0) &&
13649 TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
13650 TTI::TCC_Expensive &&
13651 !all_of(Range&: Slice, P: [&](Value *V) {
13652 if (isa<PoisonValue>(Val: V))
13653 return true;
13654 return areAllUsersVectorized(I: cast<Instruction>(Val: V),
13655 VectorizedVals: UserIgnoreList);
13656 }))
13657 continue;
13658 if (S.getOpcode() == Instruction::Load) {
13659 OrdersType Order;
13660 SmallVector<Value *> PointerOps;
13661 StridedPtrInfo SPtrInfo;
13662 LoadsState Res = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
13663 PointerOps, SPtrInfo);
13664 AllStrided &= Res == LoadsState::StridedVectorize ||
13665 Res == LoadsState::ScatterVectorize ||
13666 Res == LoadsState::Gather;
13667 // Do not vectorize gathers.
13668 if (Res == LoadsState::ScatterVectorize ||
13669 Res == LoadsState::Gather) {
13670 if (Res == LoadsState::Gather) {
13671 registerNonVectorizableLoads(VL: Slice);
13672 // If we are vectorizing a reduction and the scalars come from the
13673 // root node, mark them as non-vectorizable reduction values.
13674 if (UserIgnoreList && E.Idx == 0)
13675 analyzedReductionVals(VL: Slice);
13676 }
13677 continue;
13678 }
13679 } else if (S.getOpcode() == Instruction::ExtractElement ||
13680 (TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
13681 TTI::TCC_Expensive &&
13682 !CheckOperandsProfitability(
13683 S.getMainOp(),
13684 cast<Instruction>(Val: *find_if(Range: reverse(C&: Slice),
13685 P: IsaPred<Instruction>)),
13686 S))) {
13687 // Do not vectorize extractelements (handled effectively
13688 // already). Do not vectorize non-profitable instructions (with
13689 // low cost and non-vectorizable operands).
13690 continue;
13691 }
13692 }
13693 }
13694 Slices.emplace_back(Args&: Cnt, Args: Slice.size());
13695 }
13696 // Do not try to vectorize if all slices are strided or gathered with
13697 // vector factor 2 and there are more than 2 slices. It is better to
13698 // handle them in the gathered loads analysis; it may result in better vectorization.
13699 if (VF == 2 && AllStrided && Slices.size() > 2)
13700 continue;
13701 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13702 E.CombinedEntriesWithIndices.emplace_back(Args&: Idx, Args&: Cnt);
13703 Processed.set(I: Cnt, E: Cnt + Sz);
13704 if (StartIdx == Cnt)
13705 StartIdx = Cnt + Sz;
13706 if (End == Cnt + Sz)
13707 End = Cnt;
13708 };
13709 for (auto [Cnt, Sz] : Slices) {
13710 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: Sz);
13711 const TreeEntry *SameTE = nullptr;
13712 if (const auto *It = find_if(Range&: Slice, P: IsaPred<Instruction>);
13713 It != Slice.end()) {
13714 // If any instruction is vectorized already - do not try again.
13715 SameTE = getSameValuesTreeEntry(V: *It, VL: Slice);
13716 }
13717 unsigned PrevSize = VectorizableTree.size();
13718 [[maybe_unused]] unsigned PrevEntriesSize =
13719 LoadEntriesToVectorize.size();
13720 buildTreeRec(VLRef: Slice, Depth: 0, UserTreeIdx: EdgeInfo(&E, UINT_MAX));
13721 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13722 VectorizableTree[PrevSize]->isGather() &&
13723 VectorizableTree[PrevSize]->hasState() &&
13724 VectorizableTree[PrevSize]->getOpcode() !=
13725 Instruction::ExtractElement &&
13726 !isSplat(VL: Slice)) {
13727 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13728 analyzedReductionVals(VL: Slice);
13729 VectorizableTree.pop_back();
13730 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13731 "LoadEntriesToVectorize expected to remain the same");
13732 continue;
13733 }
13734 AddCombinedNode(PrevSize, Cnt, Sz);
13735 }
13736 }
13737 // Restore ordering, if no extra vectorization happened.
13738 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13739 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13740 reorderScalars(Scalars&: E.Scalars, Mask);
13741 E.ReorderIndices.clear();
13742 }
13743 }
13744 if (!E.hasState())
13745 continue;
13746 switch (E.getOpcode()) {
13747 case Instruction::Load: {
13748 // No need to reorder masked gather loads, just reorder the scalar
13749 // operands.
13750 if (E.State != TreeEntry::Vectorize)
13751 break;
13752 Type *ScalarTy = E.getMainOp()->getType();
13753 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
13754 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
13755 // Check if profitable to represent consecutive load + reverse as strided
13756 // load with stride -1.
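// Illustrative sketch (IR details elided): a pattern like
//   %v = load <4 x i32>, ptr %p
//   %r = shufflevector <4 x i32> %v, <4 x i32> poison, <3, 2, 1, 0>
// may be cheaper as a single @llvm.experimental.vp.strided.load starting at
// the last element with a negative stride, if the target supports it.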
13757 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
13758 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
13759 SmallVector<int> Mask;
13760 inversePermutation(Indices: E.ReorderIndices, Mask);
13761 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
13762 InstructionCost OriginalVecCost =
13763 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
13764 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
13765 OpdInfo: TTI::OperandValueInfo()) +
13766 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
13767 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13768 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13769 VecTy, BaseLI->getPointerOperand(),
13770 /*VariableMask=*/false, CommonAlignment,
13771 BaseLI),
13772 CostKind);
13773 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13774 // Strided load is more profitable than consecutive load + reverse -
13775 // transform the node to strided load.
13776 Type *StrideTy = DL->getIndexType(PtrTy: cast<LoadInst>(Val: E.Scalars.front())
13777 ->getPointerOperand()
13778 ->getType());
13779 StridedPtrInfo SPtrInfo;
13780 SPtrInfo.StrideVal = ConstantInt::get(Ty: StrideTy, V: 1);
13781 SPtrInfo.Ty = VecTy;
13782 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13783 E.State = TreeEntry::StridedVectorize;
13784 }
13785 }
13786 break;
13787 }
13788 case Instruction::Store: {
13789 Type *ScalarTy =
13790 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
13791 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
13792 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
13793 // Check if profitable to represent reverse + consecutive store as strided
13794 // store with stride -1.
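// Analogous to the load case above (illustrative): a reverse shuffle feeding
// a consecutive store may be replaced by a single
// @llvm.experimental.vp.strided.store with a negative stride.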
13795 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
13796 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
13797 SmallVector<int> Mask;
13798 inversePermutation(Indices: E.ReorderIndices, Mask);
13799 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
13800 InstructionCost OriginalVecCost =
13801 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
13802 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
13803 OpdInfo: TTI::OperandValueInfo()) +
13804 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
13805 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13806 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13807 VecTy, BaseSI->getPointerOperand(),
13808 /*VariableMask=*/false, CommonAlignment,
13809 BaseSI),
13810 CostKind);
13811 if (StridedCost < OriginalVecCost)
13812 // Strided store is more profitable than reverse + consecutive store -
13813 // transform the node to strided store.
13814 E.State = TreeEntry::StridedVectorize;
13815 } else if (!E.ReorderIndices.empty()) {
13816 // Check for interleaved stores.
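// E.g. (illustrative): a reorder mask of <0, 4, 1, 5, 2, 6, 3, 7> interleaves
// two groups of 4 consecutive elements with Factor == 2 and can be emitted as
// an interleaved store on targets where such an access is legal.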
13817 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13818 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.front());
13819 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13820 if (Mask.size() < 4)
13821 return 0u;
13822 for (unsigned Factor : seq<unsigned>(Begin: 2, End: Mask.size() / 2 + 1)) {
13823 if (ShuffleVectorInst::isInterleaveMask(
13824 Mask, Factor, NumInputElts: VecTy->getElementCount().getFixedValue()) &&
13825 TTI.isLegalInterleavedAccessType(
13826 VTy: VecTy, Factor, Alignment: BaseSI->getAlign(),
13827 AddrSpace: BaseSI->getPointerAddressSpace()))
13828 return Factor;
13829 }
13830
13831 return 0u;
13832 };
13833 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13834 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13835 if (InterleaveFactor != 0)
13836 E.setInterleave(InterleaveFactor);
13837 }
13838 break;
13839 }
13840 case Instruction::Select: {
13841 if (E.State != TreeEntry::Vectorize)
13842 break;
13843 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: E.Scalars);
13844 if (MinMaxID == Intrinsic::not_intrinsic)
13845 break;
13846 // This node is a minmax node.
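// E.g. (illustrative): scalars of the form
//   select (icmp slt %a, %b), %a, %b
// are modeled as @llvm.smin(%a, %b).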
13847 E.CombinedOp = TreeEntry::MinMax;
13848 TreeEntry *CondEntry = getOperandEntry(E: &E, Idx: 0);
13849 if (SelectOnly && CondEntry->UserTreeIndex &&
13850 CondEntry->State == TreeEntry::Vectorize) {
13851 // The condition node is part of the combined minmax node.
13852 CondEntry->State = TreeEntry::CombinedVectorize;
13853 }
13854 break;
13855 }
13856 case Instruction::FSub:
13857 case Instruction::FAdd: {
13858 // Check if possible to convert (a*b)+c to fma.
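// E.g. (illustrative): fadd (fmul %a, %b), %c with suitable fast-math flags
// can be emitted as @llvm.fmuladd(%a, %b, %c).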
13859 if (E.State != TreeEntry::Vectorize ||
13860 !E.getOperations().isAddSubLikeOp())
13861 break;
13862 if (!canConvertToFMA(VL: E.Scalars, S: E.getOperations(), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
13863 .isValid())
13864 break;
13865 // This node is a fmuladd node.
13866 E.CombinedOp = TreeEntry::FMulAdd;
13867 TreeEntry *FMulEntry = getOperandEntry(E: &E, Idx: 0);
13868 if (FMulEntry->UserTreeIndex &&
13869 FMulEntry->State == TreeEntry::Vectorize) {
13870 // The FMul node is part of the combined fmuladd node.
13871 FMulEntry->State = TreeEntry::CombinedVectorize;
13872 }
13873 break;
13874 }
13875 case Instruction::Shl: {
13876 if (E.Idx != 0 || DL->isBigEndian())
13877 break;
13878 if (!UserIgnoreList)
13879 break;
13880 // Check that all reduction operands are disjoint-or instructions.
13881 if (any_of(Range: *UserIgnoreList, P: [](Value *V) {
13882 return !match(V, P: m_DisjointOr(L: m_Value(), R: m_Value()));
13883 }))
13884 break;
13885 OrdersType Order;
13886 bool IsBSwap;
13887 if (!matchesShlZExt(TE: E, Order, IsBSwap))
13888 break;
13889 // This node is a (reduced disjoint-or) bitcast node.
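// E.g. (little-endian, illustrative): an or-reduction of
//   zext i8 %b0 to i32
//   (zext i8 %b1 to i32) << 8
//   (zext i8 %b2 to i32) << 16
//   (zext i8 %b3 to i32) << 24
// is equivalent to bitcasting <4 x i8> {%b0, %b1, %b2, %b3} to i32 (or a
// bswap of that bitcast when the bytes appear in reversed order).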
13890 TreeEntry::CombinedOpcode Code =
13891 IsBSwap ? TreeEntry::ReducedBitcastBSwap : TreeEntry::ReducedBitcast;
13892 E.CombinedOp = Code;
13893 if (!IsBSwap)
13894 E.ReorderIndices = std::move(Order);
13895 TreeEntry *ZExtEntry = getOperandEntry(E: &E, Idx: 0);
13896 assert(ZExtEntry->UserTreeIndex &&
13897 ZExtEntry->State == TreeEntry::Vectorize &&
13898 ZExtEntry->getOpcode() == Instruction::ZExt &&
13899 "Expected ZExt node.");
13900 // The ZExt node is part of the combined node.
13901 ZExtEntry->State = TreeEntry::CombinedVectorize;
13902 ZExtEntry->CombinedOp = Code;
13903 TreeEntry *ConstEntry = getOperandEntry(E: &E, Idx: 1);
13904 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
13905 "Expected ZExt node.");
13906 // The ConstNode node is part of the combined node.
13907 ConstEntry->State = TreeEntry::CombinedVectorize;
13908 ConstEntry->CombinedOp = Code;
13909 break;
13910 }
13911 default:
13912 break;
13913 }
13914 }
13915
13916 if (LoadEntriesToVectorize.empty()) {
13917 // Single load node - exit.
13918 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
13919 VectorizableTree.front()->getOpcode() == Instruction::Load)
13920 return;
13921 // Small graph with small VF - exit.
13922 constexpr unsigned SmallTree = 3;
13923 constexpr unsigned SmallVF = 2;
13924 if ((VectorizableTree.size() <= SmallTree &&
13925 VectorizableTree.front()->Scalars.size() == SmallVF) ||
13926 (VectorizableTree.size() <= 2 && UserIgnoreList))
13927 return;
13928
13929 if (VectorizableTree.front()->isNonPowOf2Vec() &&
13930 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
13931 getCanonicalGraphSize() <= SmallTree &&
13932 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
13933 P: [](const std::unique_ptr<TreeEntry> &TE) {
13934 return TE->isGather() && TE->hasState() &&
13935 TE->getOpcode() == Instruction::Load &&
13936 !allSameBlock(VL: TE->Scalars);
13937 }) == 1)
13938 return;
13939 }
13940
13941 // A list of loads to be gathered during the vectorization process. We can
13942 // try to vectorize them at the end, if profitable.
13943 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
13944 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
13945 GatheredLoads;
13946
13947 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
13948 TreeEntry &E = *TE;
13949 if (E.isGather() &&
13950 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
13951 (!E.hasState() && any_of(Range&: E.Scalars,
13952 P: [&](Value *V) {
13953 return isa<LoadInst>(Val: V) &&
13954 !isVectorized(V) &&
13955 !isDeleted(I: cast<Instruction>(Val: V));
13956 }))) &&
13957 !isSplat(VL: E.Scalars)) {
13958 for (Value *V : E.Scalars) {
13959 auto *LI = dyn_cast<LoadInst>(Val: V);
13960 if (!LI)
13961 continue;
13962 if (isDeleted(I: LI) || isVectorized(V: LI) || !LI->isSimple())
13963 continue;
13964 gatherPossiblyVectorizableLoads(
13965 R: *this, VL: V, DL: *DL, SE&: *SE, TTI: *TTI,
13966 GatheredLoads&: GatheredLoads[std::make_tuple(
13967 args: LI->getParent(),
13968 args: getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth),
13969 args: LI->getType())]);
13970 }
13971 }
13972 }
13973 // Try to vectorize gathered loads if this is not just a gather of loads.
13974 if (!GatheredLoads.empty())
13975 tryToVectorizeGatheredLoads(GatheredLoads);
13976}
13977
13978 /// Merges shuffle masks and emits the final shuffle instruction, if required.
13979 /// It supports shuffling of 2 input vectors. It implements lazy shuffle
13980 /// emission: the actual shuffle instruction is generated only if it is
13981 /// actually required. Otherwise, shuffle instruction emission is delayed until
13982 /// the end of the process, to reduce the number of emitted instructions and
13983 /// simplify further analysis/transformations.
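/// This cost-estimation variant only accumulates TTI costs for the virtual
/// shuffles; no shuffle instructions are actually created.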
13984class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
13985 bool IsFinalized = false;
13986 SmallVector<int> CommonMask;
13987 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
13988 const TargetTransformInfo &TTI;
13989 InstructionCost Cost = 0;
13990 SmallDenseSet<Value *> VectorizedVals;
13991 BoUpSLP &R;
13992 SmallPtrSetImpl<Value *> &CheckedExtracts;
13993 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13994 /// While set, we are still trying to estimate the cost for the same nodes and
13995 /// can delay the actual cost estimation (virtual shuffle instruction emission).
13996 /// This may help to better estimate the cost if the same nodes must be permuted
13997 /// and allows moving most of the long-shuffle cost estimation to TTI.
13998 bool SameNodesEstimated = true;
13999
14000 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
14001 if (Ty->getScalarType()->isPointerTy()) {
14002 Constant *Res = ConstantExpr::getIntToPtr(
14003 C: ConstantInt::getAllOnesValue(
14004 Ty: IntegerType::get(C&: Ty->getContext(),
14005 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
14006 Ty: Ty->getScalarType());
14007 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
14008 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
14009 return Res;
14010 }
14011 return Constant::getAllOnesValue(Ty);
14012 }
14013
14014 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
14015 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
14016 return TTI::TCC_Free;
14017 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
14018 InstructionCost GatherCost = 0;
14019 SmallVector<Value *> Gathers(VL);
14020 if (!Root && isSplat(VL)) {
14021 // Found a broadcast of a single scalar - calculate the cost as
14022 // the broadcast.
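// E.g. (illustrative): gathering {%x, %x, %x, %x} is costed as a single
// insertelement into lane 0 plus an SK_Broadcast shuffle, rather than as
// four independent inserts.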
14023 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
14024 assert(It != VL.end() && "Expected at least one non-undef value.");
14025 // Add broadcast for non-identity shuffle only.
14026 bool NeedShuffle =
14027 count(Range&: VL, Element: *It) > 1 &&
14028 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
14029 if (!NeedShuffle) {
14030 if (isa<FixedVectorType>(Val: ScalarTy)) {
14031 assert(SLPReVec && "FixedVectorType is not expected.");
14032 return TTI.getShuffleCost(
14033 Kind: TTI::SK_InsertSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind,
14034 Index: std::distance(first: VL.begin(), last: It) * getNumElements(Ty: ScalarTy),
14035 SubTp: cast<FixedVectorType>(Val: ScalarTy));
14036 }
14037 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
14038 CostKind, Index: std::distance(first: VL.begin(), last: It),
14039 Op0: PoisonValue::get(T: VecTy), Op1: *It);
14040 }
14041
14042 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
14043 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
14044 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
14045 });
14046 InstructionCost InsertCost =
14047 TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
14048 Op0: PoisonValue::get(T: VecTy), Op1: *It);
14049 return InsertCost + ::getShuffleCost(TTI,
14050 Kind: TargetTransformInfo::SK_Broadcast,
14051 Tp: VecTy, Mask: ShuffleMask, CostKind,
14052 /*Index=*/0, /*SubTp=*/nullptr,
14053 /*Args=*/*It);
14054 }
14055 return GatherCost +
14056 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
14057 ? TTI::TCC_Free
14058 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
14059 ScalarTy));
14060 };
14061
14062 /// Compute the cost of creating a vector containing the extracted values from
14063 /// \p VL.
14064 InstructionCost
14065 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
14066 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14067 unsigned NumParts) {
14068 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
14069 unsigned NumElts =
14070 std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
14071 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
14072 if (!EE)
14073 return Sz;
14074 auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
14075 if (!VecTy)
14076 return Sz;
14077 return std::max(a: Sz, b: VecTy->getNumElements());
14078 });
14079 // FIXME: this must be moved to TTI for better estimation.
14080 unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
14081 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
14082 SmallVectorImpl<unsigned> &Indices,
14083 SmallVectorImpl<unsigned> &SubVecSizes)
14084 -> std::optional<TTI::ShuffleKind> {
14085 if (NumElts <= EltsPerVector)
14086 return std::nullopt;
14087 int OffsetReg0 =
14088 alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
14089 binary_op: [](int S, int I) {
14090 if (I == PoisonMaskElem)
14091 return S;
14092 return std::min(a: S, b: I);
14093 }),
14094 Align: EltsPerVector);
14095 int OffsetReg1 = OffsetReg0;
14096 DenseSet<int> RegIndices;
14097 // Check if we are trying to permute the same single/2 input vectors.
14098 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
14099 int FirstRegId = -1;
14100 Indices.assign(NumElts: 1, Elt: OffsetReg0);
14101 for (auto [Pos, I] : enumerate(First&: Mask)) {
14102 if (I == PoisonMaskElem)
14103 continue;
14104 int Idx = I - OffsetReg0;
14105 int RegId =
14106 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
14107 if (FirstRegId < 0)
14108 FirstRegId = RegId;
14109 RegIndices.insert(V: RegId);
14110 if (RegIndices.size() > 2)
14111 return std::nullopt;
14112 if (RegIndices.size() == 2) {
14113 ShuffleKind = TTI::SK_PermuteTwoSrc;
14114 if (Indices.size() == 1) {
14115 OffsetReg1 = alignDown(
14116 Value: std::accumulate(
14117 first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
14118 binary_op: [&](int S, int I) {
14119 if (I == PoisonMaskElem)
14120 return S;
14121 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
14122 ((I - OffsetReg0) % NumElts) / EltsPerVector;
14123 if (RegId == FirstRegId)
14124 return S;
14125 return std::min(a: S, b: I);
14126 }),
14127 Align: EltsPerVector);
14128 unsigned Index = OffsetReg1 % NumElts;
14129 Indices.push_back(Elt: Index);
14130 SubVecSizes.push_back(Elt: std::min(a: NumElts - Index, b: EltsPerVector));
14131 }
14132 Idx = I - OffsetReg1;
14133 }
14134 I = (Idx % NumElts) % EltsPerVector +
14135 (RegId == FirstRegId ? 0 : EltsPerVector);
14136 }
14137 return ShuffleKind;
14138 };
14139 InstructionCost Cost = 0;
14140
14141 // Process extracts in blocks of EltsPerVector to check if the source vector
14142 // operand can be re-used directly. If not, add the cost of creating a
14143 // shuffle to extract the values into a vector register.
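// E.g. (illustrative, 4 elements per register): for a 16-element mask the
// loop below costs each 4-element sub-mask separately; if a sub-mask touches
// at most two source registers it is modeled as a narrow per-register shuffle
// plus SK_ExtractSubvector costs, and that estimate is then compared against
// the cost of a single wide permutation.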
14144 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
14145 if (!ShuffleKinds[Part])
14146 continue;
14147 ArrayRef<int> MaskSlice = Mask.slice(
14148 N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
14149 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
14150 copy(Range&: MaskSlice, Out: SubMask.begin());
14151 SmallVector<unsigned, 2> Indices;
14152 SmallVector<unsigned, 2> SubVecSizes;
14153 std::optional<TTI::ShuffleKind> RegShuffleKind =
14154 CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
14155 if (!RegShuffleKind) {
14156 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
14157 !ShuffleVectorInst::isIdentityMask(
14158 Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
14159 Cost +=
14160 ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
14161 Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
14162 continue;
14163 }
14164 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
14165 !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
14166 Cost +=
14167 ::getShuffleCost(TTI, Kind: *RegShuffleKind,
14168 Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
14169 }
14170 const unsigned BaseVF = getFullVectorNumberOfElements(
14171 TTI: *R.TTI, Ty: VL.front()->getType(), Sz: alignTo(Value: NumElts, Align: EltsPerVector));
14172 for (const auto [Idx, SubVecSize] : zip(t&: Indices, u&: SubVecSizes)) {
14173 assert((Idx + SubVecSize) <= BaseVF &&
14174 "SK_ExtractSubvector index out of range");
14175 Cost += ::getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
14176 Tp: getWidenedType(ScalarTy, VF: BaseVF), Mask: {}, CostKind,
14177 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: SubVecSize));
14178 }
14179 // Second attempt to check if just a permute has a better cost estimate
14180 // than a subvector extract.
14181 SubMask.assign(NumElts, Elt: PoisonMaskElem);
14182 copy(Range&: MaskSlice, Out: SubMask.begin());
14183 InstructionCost OriginalCost = ::getShuffleCost(
14184 TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
14185 if (OriginalCost < Cost)
14186 Cost = OriginalCost;
14187 }
14188 return Cost;
14189 }
14190 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using the given
14191 /// mask \p Mask and register number \p Part, which includes \p SliceSize
14192 /// elements.
14193 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
14194 ArrayRef<int> Mask, unsigned Part,
14195 unsigned SliceSize) {
14196 if (SameNodesEstimated) {
14197 // Delay the cost estimation if the same nodes are being reshuffled.
14198 // If we already requested the cost of reshuffling E1 and E2 before, there is
14199 // no need to estimate another cost with the sub-Mask; instead, include this
14200 // sub-Mask into the CommonMask to estimate it later and avoid double cost
14201 // estimation.
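// E.g. (illustrative): two consecutive sub-masks that both permute E1/E2 are
// merged into CommonMask and costed as one wider shuffle later, instead of
// two narrow shuffles now.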
14202 if ((InVectors.size() == 2 &&
14203 cast<const TreeEntry *>(Val&: InVectors.front()) == &E1 &&
14204 cast<const TreeEntry *>(Val&: InVectors.back()) == E2) ||
14205 (!E2 && cast<const TreeEntry *>(Val&: InVectors.front()) == &E1)) {
14206 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
14207 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
14208 [](int Idx) { return Idx == PoisonMaskElem; }) &&
14209 "Expected all poisoned elements.");
14210 ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
14211 copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
14212 return;
14213 }
14214 // Found non-matching nodes - need to estimate the cost for the matched
14215 // nodes and transform the mask.
14216 Cost += createShuffle(P1: InVectors.front(),
14217 P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
14218 Mask: CommonMask);
14219 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14220 } else if (InVectors.size() == 2) {
14221 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
14222 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14223 }
14224 SameNodesEstimated = false;
14225 if (!E2 && InVectors.size() == 1) {
14226 unsigned VF = E1.getVectorFactor();
14227 if (Value *V1 = dyn_cast<Value *>(Val&: InVectors.front())) {
14228 VF = std::max(a: VF, b: getVF(V: V1));
14229 } else {
14230 const auto *E = cast<const TreeEntry *>(Val&: InVectors.front());
14231 VF = std::max(a: VF, b: E->getVectorFactor());
14232 }
14233 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14234 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14235 CommonMask[Idx] = Mask[Idx] + VF;
14236 Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
14237 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14238 } else {
14239 auto P = InVectors.front();
14240 Cost += createShuffle(P1: &E1, P2: E2, Mask);
14241 unsigned VF = Mask.size();
14242 if (Value *V1 = dyn_cast<Value *>(Val&: P)) {
14243 VF = std::max(a: VF,
14244 b: getNumElements(Ty: V1->getType()));
14245 } else {
14246 const auto *E = cast<const TreeEntry *>(Val&: P);
14247 VF = std::max(a: VF, b: E->getVectorFactor());
14248 }
14249 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14250 if (Mask[Idx] != PoisonMaskElem)
14251 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
14252 Cost += createShuffle(P1: P, P2: InVectors.front(), Mask: CommonMask);
14253 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14254 }
14255 }
14256
14257 class ShuffleCostBuilder {
14258 const TargetTransformInfo &TTI;
14259
14260 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14261 int Index = -1;
14262 return Mask.empty() ||
14263 (VF == Mask.size() &&
14264 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
14265 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
14266 Index == 0);
14267 }
14268
14269 public:
14270 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14271 ~ShuffleCostBuilder() = default;
14272 InstructionCost createShuffleVector(Value *V1, Value *,
14273 ArrayRef<int> Mask) const {
14274 // Empty mask or identity mask are free.
14275 unsigned VF =
14276 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14277 if (isEmptyOrIdentity(Mask, VF))
14278 return TTI::TCC_Free;
14279 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
14280 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14281 }
14282 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14283 // Empty mask or identity mask are free.
14284 unsigned VF =
14285 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14286 if (isEmptyOrIdentity(Mask, VF))
14287 return TTI::TCC_Free;
14288 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
14289 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14290 }
14291 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14292 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14293 return TTI::TCC_Free;
14294 }
14295 void resizeToMatch(Value *&, Value *&) const {}
14296 };
14297
14298 /// Smart shuffle instruction emission, walks through the shuffle trees and
14299 /// tries to find the best matching vector for the actual shuffle
14300 /// instruction.
14301 InstructionCost
14302 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14303 const PointerUnion<Value *, const TreeEntry *> &P2,
14304 ArrayRef<int> Mask) {
14305 ShuffleCostBuilder Builder(TTI);
14306 SmallVector<int> CommonMask(Mask);
14307 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14308 unsigned CommonVF = Mask.size();
14309 InstructionCost ExtraCost = 0;
14310 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14311 unsigned VF) -> InstructionCost {
14312 if (E.isGather() && allConstant(VL: E.Scalars))
14313 return TTI::TCC_Free;
14314 Type *EScalarTy = E.Scalars.front()->getType();
14315 bool IsSigned = true;
14316 if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
14317 EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
14318 IsSigned = It->second.second;
14319 }
14320 if (EScalarTy != ScalarTy) {
14321 unsigned CastOpcode = Instruction::Trunc;
14322 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
14323 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
14324 if (DstSz > SrcSz)
14325 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14326 return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
14327 Src: getWidenedType(ScalarTy: EScalarTy, VF),
14328 CCH: TTI::CastContextHint::None, CostKind);
14329 }
14330 return TTI::TCC_Free;
14331 };
14332 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14333 if (isa<Constant>(Val: V))
14334 return TTI::TCC_Free;
14335 auto *VecTy = cast<VectorType>(Val: V->getType());
14336 Type *EScalarTy = VecTy->getElementType();
14337 if (EScalarTy != ScalarTy) {
14338 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
14339 unsigned CastOpcode = Instruction::Trunc;
14340 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
14341 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
14342 if (DstSz > SrcSz)
14343 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14344 return TTI.getCastInstrCost(
14345 Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
14346 Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
14347 }
14348 return TTI::TCC_Free;
14349 };
14350 if (!V1 && !V2 && !P2.isNull()) {
14351 // Shuffle 2 entry nodes.
14352 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
14353 unsigned VF = E->getVectorFactor();
14354 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
14355 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
14356 assert(all_of(Mask,
14357 [=](int Idx) {
14358 return Idx < 2 * static_cast<int>(CommonVF);
14359 }) &&
14360 "All elements in mask must be less than 2 * CommonVF.");
14361 if (E->Scalars.size() == E2->Scalars.size()) {
14362 SmallVector<int> EMask = E->getCommonMask();
14363 SmallVector<int> E2Mask = E2->getCommonMask();
14364 if (!EMask.empty() || !E2Mask.empty()) {
14365 for (int &Idx : CommonMask) {
14366 if (Idx == PoisonMaskElem)
14367 continue;
14368 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14369 Idx = EMask[Idx];
14370 else if (Idx >= static_cast<int>(CommonVF))
14371 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14372 E->Scalars.size();
14373 }
14374 }
14375 CommonVF = E->Scalars.size();
14376 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14377 GetNodeMinBWAffectedCost(*E2, CommonVF);
14378 } else {
14379 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14380 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14381 }
14382 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14383 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14384 } else if (!V1 && P2.isNull()) {
14385 // Shuffle single entry node.
14386 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
14387 unsigned VF = E->getVectorFactor();
14388 CommonVF = VF;
14389 assert(
14390 all_of(Mask,
14391 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14392 "All elements in mask must be less than CommonVF.");
14393 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14394 SmallVector<int> EMask = E->getCommonMask();
14395 assert(!EMask.empty() && "Expected non-empty common mask.");
14396 for (int &Idx : CommonMask) {
14397 if (Idx != PoisonMaskElem)
14398 Idx = EMask[Idx];
14399 }
14400 CommonVF = E->Scalars.size();
14401 } else if (unsigned Factor = E->getInterleaveFactor();
14402 Factor > 0 && E->Scalars.size() != Mask.size() &&
14403 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask: CommonMask,
14404 Factor)) {
14405 // Deinterleaved nodes are free.
14406 std::iota(first: CommonMask.begin(), last: CommonMask.end(), value: 0);
14407 }
14408 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14409 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14410 // Not identity/broadcast? Try to see if the original vector is better.
14411 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14412 CommonVF == CommonMask.size() &&
14413 any_of(Range: enumerate(First&: CommonMask),
14414 P: [](const auto &&P) {
14415 return P.value() != PoisonMaskElem &&
14416 static_cast<unsigned>(P.value()) != P.index();
14417 }) &&
14418 any_of(Range&: CommonMask,
14419 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14420 SmallVector<int> ReorderMask;
14421 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
14422 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
14423 }
14424 } else if (V1 && P2.isNull()) {
14425 // Shuffle single vector.
14426 ExtraCost += GetValueMinBWAffectedCost(V1);
14427 CommonVF = getVF(V: V1);
14428 assert(
14429 all_of(Mask,
14430 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14431 "All elements in mask must be less than CommonVF.");
14432 } else if (V1 && !V2) {
14433 // Shuffle vector and tree node.
14434 unsigned VF = getVF(V: V1);
14435 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
14436 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
14437 assert(all_of(Mask,
14438 [=](int Idx) {
14439 return Idx < 2 * static_cast<int>(CommonVF);
14440 }) &&
14441 "All elements in mask must be less than 2 * CommonVF.");
14442 if (E2->Scalars.size() == VF && VF != CommonVF) {
14443 SmallVector<int> E2Mask = E2->getCommonMask();
14444 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14445 for (int &Idx : CommonMask) {
14446 if (Idx == PoisonMaskElem)
14447 continue;
14448 if (Idx >= static_cast<int>(CommonVF))
14449 Idx = E2Mask[Idx - CommonVF] + VF;
14450 }
14451 CommonVF = VF;
14452 }
14453 ExtraCost += GetValueMinBWAffectedCost(V1);
14454 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14455 ExtraCost += GetNodeMinBWAffectedCost(
14456 *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
14457 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14458 } else if (!V1 && V2) {
14459 // Shuffle vector and tree node.
14460 unsigned VF = getVF(V: V2);
14461 const TreeEntry *E1 = cast<const TreeEntry *>(Val: P1);
14462 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
14463 assert(all_of(Mask,
14464 [=](int Idx) {
14465 return Idx < 2 * static_cast<int>(CommonVF);
14466 }) &&
14467 "All elements in mask must be less than 2 * CommonVF.");
14468 if (E1->Scalars.size() == VF && VF != CommonVF) {
14469 SmallVector<int> E1Mask = E1->getCommonMask();
14470 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14471 for (int &Idx : CommonMask) {
14472 if (Idx == PoisonMaskElem)
14473 continue;
14474 if (Idx >= static_cast<int>(CommonVF))
14475 Idx = E1Mask[Idx - CommonVF] + VF;
14476 else
14477 Idx = E1Mask[Idx];
14478 }
14479 CommonVF = VF;
14480 }
14481 ExtraCost += GetNodeMinBWAffectedCost(
14482 *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
14483 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14484 ExtraCost += GetValueMinBWAffectedCost(V2);
14485 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14486 } else {
14487 assert(V1 && V2 && "Expected both vectors.");
14488 unsigned VF = getVF(V: V1);
14489 CommonVF = std::max(a: VF, b: getVF(V: V2));
14490 assert(all_of(Mask,
14491 [=](int Idx) {
14492 return Idx < 2 * static_cast<int>(CommonVF);
14493 }) &&
14494 "All elements in mask must be less than 2 * CommonVF.");
14495 ExtraCost +=
14496 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14497 if (V1->getType() != V2->getType()) {
14498 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14499 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14500 } else {
14501 if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
14502 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14503 if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
14504 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14505 }
14506 }
14507 InVectors.front() =
14508 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
14509 if (InVectors.size() == 2)
14510 InVectors.pop_back();
14511 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14512 V1, V2, Mask: CommonMask, Builder, ScalarTy);
14513 }
14514
14515public:
14516 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
14517 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
14518 SmallPtrSetImpl<Value *> &CheckedExtracts)
14519 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
14520 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
14521 CheckedExtracts(CheckedExtracts) {}
14522 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14523 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14524 unsigned NumParts, bool &UseVecBaseAsInput) {
14525 UseVecBaseAsInput = false;
14526 if (Mask.empty())
14527 return nullptr;
14528 Value *VecBase = nullptr;
14529 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14530 if (!E->ReorderIndices.empty()) {
14531 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14532 E->ReorderIndices.end());
14533 reorderScalars(Scalars&: VL, Mask: ReorderMask);
14534 }
14535 // Check if this can be considered a reuse - i.e. the same extractelements
14536 // were vectorized already.
14537 bool PrevNodeFound = any_of(
14538 Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
14539 P: [&](const std::unique_ptr<TreeEntry> &TE) {
14540 return ((TE->hasState() && !TE->isAltShuffle() &&
14541 TE->getOpcode() == Instruction::ExtractElement) ||
14542 TE->isGather()) &&
14543 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
14544 return VL.size() > Data.index() &&
14545 (Mask[Data.index()] == PoisonMaskElem ||
14546 isa<UndefValue>(VL[Data.index()]) ||
14547 Data.value() == VL[Data.index()]);
14548 });
14549 });
14550 SmallPtrSet<Value *, 4> UniqueBases;
14551 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
14552 SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
14553 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
14554 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
14555 ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
14556 for (auto [I, V] :
14557 enumerate(First: ArrayRef(VL).slice(N: Part * SliceSize, M: Limit))) {
14558 // Ignore non-extractelement scalars.
14559 if (isa<UndefValue>(Val: V) ||
14560 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
14561 continue;
14562 // If all users of the instruction are going to be vectorized and this
14563 // instruction itself is not going to be vectorized, consider this
14564 // instruction dead and remove its cost from the final cost of the
14565 // vectorized tree.
14566 // Also, avoid adjusting the cost for extractelements with multiple uses
14567 // in different graph entries.
14568 auto *EE = cast<ExtractElementInst>(Val: V);
14569 VecBase = EE->getVectorOperand();
14570 UniqueBases.insert(Ptr: VecBase);
14571 ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
14572 if (!CheckedExtracts.insert(Ptr: V).second ||
14573 !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
14574 any_of(Range&: VEs,
14575 P: [&](const TreeEntry *TE) {
14576 return R.DeletedNodes.contains(Ptr: TE) ||
14577 R.TransformedToGatherNodes.contains(Val: TE);
14578 }) ||
14579 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
14580 !R.isVectorized(V: EE) &&
14581 count_if(Range: E->Scalars, P: [&](Value *V) { return V == EE; }) !=
14582 count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
14583 P: [&](Value *V) { return V == EE; })) ||
14584 any_of(Range: EE->users(),
14585 P: [&](User *U) {
14586 return isa<GetElementPtrInst>(Val: U) &&
14587 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
14588 VectorizedVals: &VectorizedVals);
14589 }) ||
14590 (!VEs.empty() && !is_contained(Range&: VEs, Element: E)))
14591 continue;
14592 std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
14593 if (!EEIdx)
14594 continue;
14595 unsigned Idx = *EEIdx;
14596 // Take credit for instruction that will become dead.
14597 if (EE->hasOneUse() || !PrevNodeFound) {
14598 Instruction *Ext = EE->user_back();
14599 if (isa<SExtInst, ZExtInst>(Val: Ext) &&
14600 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
14601 // Use getExtractWithExtendCost() to calculate the cost of
14602 // extractelement/ext pair.
14603 Cost -= TTI.getExtractWithExtendCost(
14604 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: EE->getVectorOperandType(),
14605 Index: Idx, CostKind);
14606 // Add back the cost of s|zext which is subtracted separately.
14607 Cost += TTI.getCastInstrCost(
14608 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
14609 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
14610 continue;
14611 }
14612 }
14613 APInt &DemandedElts =
14614 VectorOpsToExtracts
14615 .try_emplace(Key: VecBase,
14616 Args: APInt::getZero(numBits: getNumElements(Ty: VecBase->getType())))
14617 .first->getSecond();
14618 DemandedElts.setBit(Idx);
14619 }
14620 }
14621 for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
14622 Cost -= TTI.getScalarizationOverhead(Ty: cast<VectorType>(Val: Vec->getType()),
14623 DemandedElts, /*Insert=*/false,
14624 /*Extract=*/true, CostKind);
14625 // Check that the gather of extractelements can be represented as just a
14626 // shuffle of a single vector or of two vectors the scalars are extracted
14627 // from: we found a bunch of extractelement instructions that must be
14628 // gathered into a vector and can be represented as a permutation of the
14629 // elements of one or two input vectors. The extract shuffle cost is not
14630 // added again if the same extractelements were already vectorized before.
14631 if (!PrevNodeFound)
14632 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
14633 InVectors.assign(NumElts: 1, Elt: E);
14634 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
14635 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14636 SameNodesEstimated = false;
14637 if (NumParts != 1 && UniqueBases.size() != 1) {
14638 UseVecBaseAsInput = true;
14639 VecBase =
14640 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
14641 }
14642 return VecBase;
14643 }
14644 /// Checks if the specified entry \p E needs to be delayed because of its
14645 /// dependency nodes.
14646 std::optional<InstructionCost>
14647 needToDelay(const TreeEntry *,
14648 ArrayRef<SmallVector<const TreeEntry *>>) const {
14649 // No need to delay the cost estimation during analysis.
14650 return std::nullopt;
14651 }
14652 /// Reset the builder to handle perfect diamond match.
14653 void resetForSameNode() {
14654 IsFinalized = false;
14655 CommonMask.clear();
14656 InVectors.clear();
14657 Cost = 0;
14658 VectorizedVals.clear();
14659 SameNodesEstimated = true;
14660 }
14661 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14662 if (&E1 == &E2) {
14663 assert(all_of(Mask,
14664 [&](int Idx) {
14665 return Idx < static_cast<int>(E1.getVectorFactor());
14666 }) &&
14667 "Expected single vector shuffle mask.");
14668 add(E1, Mask);
14669 return;
14670 }
14671 if (InVectors.empty()) {
14672 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
14673 InVectors.assign(IL: {&E1, &E2});
14674 return;
14675 }
14676 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14677 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
14678 unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
14679 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
14680 const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
14681 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
14682 estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
14683 }
14684 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14685 if (InVectors.empty()) {
14686 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
14687 InVectors.assign(NumElts: 1, Elt: &E1);
14688 return;
14689 }
14690 assert(!CommonMask.empty() && "Expected non-empty common mask.");
14691 auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
14692 unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
14693 unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
14694 const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
14695 unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
14696 estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
14697 if (!SameNodesEstimated && InVectors.size() == 1)
14698 InVectors.emplace_back(Args: &E1);
14699 }
14700 /// Adds 2 input vectors and the mask for their shuffling.
14701 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14702 // May come only for shuffling of 2 vectors with extractelements, already
14703 // handled in adjustExtracts.
14704 assert(InVectors.size() == 1 &&
14705 all_of(enumerate(CommonMask),
14706 [&](auto P) {
14707 if (P.value() == PoisonMaskElem)
14708 return Mask[P.index()] == PoisonMaskElem;
14709 auto *EI = cast<ExtractElementInst>(
14710 cast<const TreeEntry *>(InVectors.front())
14711 ->getOrdered(P.index()));
14712 return EI->getVectorOperand() == V1 ||
14713 EI->getVectorOperand() == V2;
14714 }) &&
14715 "Expected extractelement vectors.");
14716 }
14717 /// Adds another input vector and the mask for the shuffling.
14718 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
14719 if (InVectors.empty()) {
14720 assert(CommonMask.empty() && !ForExtracts &&
14721 "Expected empty input mask/vectors.");
14722 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
14723 InVectors.assign(NumElts: 1, Elt: V1);
14724 return;
14725 }
14726 if (ForExtracts) {
14727 // No need to add vectors here, already handled them in adjustExtracts.
14728 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
14729 !CommonMask.empty() &&
14730 all_of(enumerate(CommonMask),
14731 [&](auto P) {
14732 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
14733 ->getOrdered(P.index());
14734 if (P.value() == PoisonMaskElem)
14735 return P.value() == Mask[P.index()] ||
14736 isa<UndefValue>(Scalar);
14737 if (isa<Constant>(V1))
14738 return true;
14739 auto *EI = cast<ExtractElementInst>(Scalar);
14740 return EI->getVectorOperand() == V1;
14741 }) &&
14742 "Expected only tree entry for extractelement vectors.");
14743 return;
14744 }
14745 assert(!InVectors.empty() && !CommonMask.empty() &&
14746 "Expected only tree entries from extracts/reused buildvectors.");
14747 unsigned VF = getVF(V: V1);
14748 if (InVectors.size() == 2) {
14749 Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
14750 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14751 VF = std::max<unsigned>(a: VF, b: CommonMask.size());
14752 } else if (const auto *InTE =
14753 InVectors.front().dyn_cast<const TreeEntry *>()) {
14754 VF = std::max(a: VF, b: InTE->getVectorFactor());
14755 } else {
14756 VF = std::max(
14757 a: VF, b: cast<FixedVectorType>(Val: cast<Value *>(Val&: InVectors.front())->getType())
14758 ->getNumElements());
14759 }
14760 InVectors.push_back(Elt: V1);
14761 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14762 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14763 CommonMask[Idx] = Mask[Idx] + VF;
14764 }
14765 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14766 Value *Root = nullptr) {
14767 Cost += getBuildVectorCost(VL, Root);
14768 if (!Root) {
14769 // FIXME: Need to find a way to avoid use of getNullValue here.
14770 SmallVector<Constant *> Vals;
14771 unsigned VF = VL.size();
14772 if (MaskVF != 0)
14773 VF = std::min(a: VF, b: MaskVF);
14774 Type *VLScalarTy = VL.front()->getType();
14775 for (Value *V : VL.take_front(N: VF)) {
14776 Type *ScalarTy = VLScalarTy->getScalarType();
14777 if (isa<PoisonValue>(Val: V)) {
14778 Vals.push_back(Elt: PoisonValue::get(T: ScalarTy));
14779 continue;
14780 }
14781 if (isa<UndefValue>(Val: V)) {
14782 Vals.push_back(Elt: UndefValue::get(T: ScalarTy));
14783 continue;
14784 }
14785 Vals.push_back(Elt: Constant::getNullValue(Ty: ScalarTy));
14786 }
14787 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: VLScalarTy)) {
14788 assert(SLPReVec && "FixedVectorType is not expected.");
14789 // When REVEC is enabled, we need to expand vector types into scalar
14790 // types.
14791 Vals = replicateMask(Val: Vals, VF: VecTy->getNumElements());
14792 }
14793 return ConstantVector::get(V: Vals);
14794 }
14795 return ConstantVector::getSplat(
14796 EC: ElementCount::getFixed(
14797 MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
14798 Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy->getScalarType()));
14799 }
14800 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
14801 /// Finalize emission of the shuffles.
14802 InstructionCost finalize(
14803 ArrayRef<int> ExtMask,
14804 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14805 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14806 function_ref<void(Value *&, SmallVectorImpl<int> &,
14807 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
14808 Action = {}) {
14809 IsFinalized = true;
14810 if (Action) {
14811 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14812 if (InVectors.size() == 2)
14813 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
14814 else
14815 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
14816 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14817 assert(VF > 0 &&
14818 "Expected vector length for the final value before action.");
14819 Value *V = cast<Value *>(Val: Vec);
14820 Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
14821 Cost += createShuffle(P1: V1, P2: V2, Mask);
14822 return V1;
14823 });
14824 InVectors.front() = V;
14825 }
14826 if (!SubVectors.empty()) {
14827 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
14828 if (InVectors.size() == 2)
14829 Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
14830 else
14831 Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
14832 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
14833 // Add subvectors permutation cost.
14834 if (!SubVectorsMask.empty()) {
14835 assert(SubVectorsMask.size() <= CommonMask.size() &&
14836 "Expected same size of masks for subvectors and common mask.");
14837 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14838 copy(Range&: SubVectorsMask, Out: SVMask.begin());
14839 for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
14840 if (I2 != PoisonMaskElem) {
14841 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14842 I1 = I2 + CommonMask.size();
14843 }
14844 }
14845 Cost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
14846 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()),
14847 Mask: SVMask, CostKind);
14848 }
14849 for (auto [E, Idx] : SubVectors) {
14850 Type *EScalarTy = E->Scalars.front()->getType();
14851 bool IsSigned = true;
14852 if (auto It = R.MinBWs.find(Val: E); It != R.MinBWs.end()) {
14853 EScalarTy =
14854 IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
14855 IsSigned = It->second.second;
14856 }
14857 if (ScalarTy != EScalarTy) {
14858 unsigned CastOpcode = Instruction::Trunc;
14859 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
14860 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
14861 if (DstSz > SrcSz)
14862 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14863 Cost += TTI.getCastInstrCost(
14864 Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF: E->getVectorFactor()),
14865 Src: getWidenedType(ScalarTy: EScalarTy, VF: E->getVectorFactor()),
14866 CCH: TTI::CastContextHint::Normal, CostKind);
14867 }
14868 Cost += ::getShuffleCost(
14869 TTI, Kind: TTI::SK_InsertSubvector,
14870 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), Mask: {}, CostKind, Index: Idx,
14871 SubTp: getWidenedType(ScalarTy, VF: E->getVectorFactor()));
14872 if (!CommonMask.empty()) {
14873 std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
14874 last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
14875 value: Idx);
14876 }
14877 }
14878 }
14879
14880 if (!ExtMask.empty()) {
14881 if (CommonMask.empty()) {
14882 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
14883 } else {
14884 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14885 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14886 if (ExtMask[I] == PoisonMaskElem)
14887 continue;
14888 NewMask[I] = CommonMask[ExtMask[I]];
14889 }
14890 CommonMask.swap(RHS&: NewMask);
14891 }
14892 }
14893 if (CommonMask.empty()) {
14894 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14895 return Cost;
14896 }
14897 return Cost +
14898 createShuffle(P1: InVectors.front(),
14899 P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
14900 Mask: CommonMask);
14901 }
14902
14903 ~ShuffleCostEstimator() {
14904 assert((IsFinalized || CommonMask.empty()) &&
14905 "Shuffle construction must be finalized.");
14906 }
14907};
14908
14909const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
14910 unsigned Idx) const {
14911 TreeEntry *Op = OperandsToTreeEntry.at(Val: {E, Idx});
14912 assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
14913 return Op;
14914}
14915
14916TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
14917 if (TE.State == TreeEntry::ScatterVectorize ||
14918 TE.State == TreeEntry::StridedVectorize)
14919 return TTI::CastContextHint::GatherScatter;
14920 if (TE.State == TreeEntry::CompressVectorize)
14921 return TTI::CastContextHint::Masked;
14922 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
14923 !TE.isAltShuffle()) {
14924 if (TE.ReorderIndices.empty())
14925 return TTI::CastContextHint::Normal;
14926 SmallVector<int> Mask;
14927 inversePermutation(Indices: TE.ReorderIndices, Mask);
14928 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
14929 return TTI::CastContextHint::Reversed;
14930 }
14931 return TTI::CastContextHint::None;
14932}
14933
14934InstructionCost
14935BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
14936 SmallPtrSetImpl<Value *> &CheckedExtracts) {
14937 ArrayRef<Value *> VL = E->Scalars;
14938
14939 Type *ScalarTy = getValueType(V: VL[0]);
14940 if (!isValidElementType(Ty: ScalarTy))
14941 return InstructionCost::getInvalid();
14942 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14943
14944 // If we have computed a smaller type for the expression, update VecTy so
14945 // that the costs will be accurate.
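// E.g. (illustrative): if the computation was proven to require only 8 bits,
// an i32-typed entry is costed here as a vector of i8 elements.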
14946 auto It = MinBWs.find(Val: E);
14947 Type *OrigScalarTy = ScalarTy;
14948 if (It != MinBWs.end()) {
14949 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
14950 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
14951 if (VecTy)
14952 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
14953 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
14954 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
14955 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
14956 }
14957 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
14958 unsigned EntryVF = E->getVectorFactor();
14959 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
14960
14961 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
14962 if (allConstant(VL))
14963 return 0;
14964 if (isa<InsertElementInst>(Val: VL[0]))
14965 return InstructionCost::getInvalid();
14966 if (isa<CmpInst>(Val: VL.front()))
14967 ScalarTy = VL.front()->getType();
14968 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
14969 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
14970 }
14971 if (E->State == TreeEntry::SplitVectorize) {
14972 assert(E->CombinedEntriesWithIndices.size() == 2 &&
14973 "Expected exactly 2 combined entries.");
14974 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
14975 InstructionCost VectorCost = 0;
14976 if (E->ReorderIndices.empty()) {
14977 VectorCost = ::getShuffleCost(
14978 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: FinalVecTy, Mask: {}, CostKind,
14979 Index: E->CombinedEntriesWithIndices.back().second,
14980 SubTp: getWidenedType(
14981 ScalarTy,
14982 VF: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14983 ->getVectorFactor()));
14984 } else {
14985 unsigned CommonVF =
14986 std::max(a: VectorizableTree[E->CombinedEntriesWithIndices.front().first]
14987 ->getVectorFactor(),
14988 b: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
14989 ->getVectorFactor());
14990 VectorCost = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
14991 Tp: getWidenedType(ScalarTy, VF: CommonVF),
14992 Mask: E->getSplitMask(), CostKind);
14993 }
14994 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
14995 return VectorCost;
14996 }
14997 InstructionCost CommonCost = 0;
14998 SmallVector<int> Mask;
14999 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
15000 (E->State != TreeEntry::StridedVectorize ||
15001 !isReverseOrder(Order: E->ReorderIndices))) {
15002 SmallVector<int> NewMask;
15003 if (E->getOpcode() == Instruction::Store) {
15004 // For stores the order is actually a mask.
15005 NewMask.resize(N: E->ReorderIndices.size());
15006 copy(Range: E->ReorderIndices, Out: NewMask.begin());
15007 } else {
15008 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
15009 }
15010 ::addMask(Mask, SubMask: NewMask);
15011 }
15012 if (!E->ReuseShuffleIndices.empty())
15013 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
15014 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
15015 CommonCost =
15016 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
15017 assert((E->State == TreeEntry::Vectorize ||
15018 E->State == TreeEntry::ScatterVectorize ||
15019 E->State == TreeEntry::StridedVectorize ||
15020 E->State == TreeEntry::CompressVectorize) &&
15021 "Unhandled state");
15022 assert(E->getOpcode() &&
15023 ((allSameType(VL) && allSameBlock(VL)) ||
15024 (E->getOpcode() == Instruction::GetElementPtr &&
15025 E->getMainOp()->getType()->isPointerTy()) ||
15026 E->hasCopyableElements()) &&
15027 "Invalid VL");
15028 Instruction *VL0 = E->getMainOp();
15029 unsigned ShuffleOrOp =
15030 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15031 if (E->CombinedOp != TreeEntry::NotCombinedOp)
15032 ShuffleOrOp = E->CombinedOp;
15033 SmallSetVector<Value *, 16> UniqueValues;
15034 SmallVector<unsigned, 16> UniqueIndexes;
15035 for (auto [Idx, V] : enumerate(First&: VL))
15036 if (UniqueValues.insert(X: V))
15037 UniqueIndexes.push_back(Elt: Idx);
15038 const unsigned Sz = UniqueValues.size();
15039 SmallBitVector UsedScalars(Sz, false);
15040 for (unsigned I = 0; I < Sz; ++I) {
15041 if (isa<Instruction>(Val: UniqueValues[I]) &&
15042 !E->isCopyableElement(V: UniqueValues[I]) &&
15043 getTreeEntries(V: UniqueValues[I]).front() == E)
15044 continue;
15045 UsedScalars.set(I);
15046 }
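  // Pick the cast context hint for an operand value: if it belongs to exactly
  // one tree entry, use that entry's hint; otherwise, a bundle of gathered
  // loads is treated as gather/scatter.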
15047 auto GetCastContextHint = [&](Value *V) {
15048 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
15049 return getCastContextHint(TE: *OpTEs.front());
15050 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
15051 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15052 !SrcState.isAltShuffle())
15053 return TTI::CastContextHint::GatherScatter;
15054 return TTI::CastContextHint::None;
15055 };
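  // Returns the cost difference of vectorizing this node: the vector cost
  // (including the common reshuffle cost and any implicit resize cast required
  // by minimum-bitwidth analysis) minus the cost of the scalar instructions
  // this entry replaces.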
15056 auto GetCostDiff =
15057 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
15058 function_ref<InstructionCost(InstructionCost)> VectorCost) {
15059 // Calculate the cost of this instruction.
15060 InstructionCost ScalarCost = 0;
15061 if (isa<CastInst, CallInst>(Val: VL0)) {
          // For some instructions there is no need to calculate the cost for
          // each particular one; we can use the cost of a single instruction
          // multiplied by the total number of scalar instructions.
15065 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15066 } else {
15067 for (unsigned I = 0; I < Sz; ++I) {
15068 if (UsedScalars.test(Idx: I))
15069 continue;
15070 ScalarCost += ScalarEltCost(I);
15071 }
15072 }
15073
15074 InstructionCost VecCost = VectorCost(CommonCost);
        // Check if the current node must be resized when the parent node is
        // not resized.
15077 if (It != MinBWs.end() && !UnaryInstruction::isCast(Opcode: E->getOpcode()) &&
15078 E->Idx != 0 &&
15079 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
15080 const EdgeInfo &EI = E->UserTreeIndex;
15081 if (!EI.UserTE->hasState() ||
15082 EI.UserTE->getOpcode() != Instruction::Select ||
15083 EI.EdgeIdx != 0) {
15084 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
15085 Type *UserScalarTy =
15086 (EI.UserTE->isGather() ||
15087 EI.UserTE->State == TreeEntry::SplitVectorize)
15088 ? EI.UserTE->Scalars.front()->getType()
15089 : EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
15090 if (UserBWIt != MinBWs.end())
15091 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
15092 NumBits: UserBWIt->second.first);
15093 if (ScalarTy != UserScalarTy) {
15094 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
15095 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
15096 unsigned VecOpcode;
15097 auto *UserVecTy = getWidenedType(ScalarTy: UserScalarTy, VF: E->Scalars.size());
15098 if (BWSz > SrcBWSz)
15099 VecOpcode = Instruction::Trunc;
15100 else
15101 VecOpcode =
15102 It->second.second ? Instruction::SExt : Instruction::ZExt;
15103 TTI::CastContextHint CCH = GetCastContextHint(VL0);
15104 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
15105 CostKind);
15106 }
15107 }
15108 }
15109 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
15110 ScalarCost, "Calculated costs for Tree"));
15111 return VecCost - ScalarCost;
15112 };
  // Calculate the cost difference from vectorizing a set of GEPs.
  // A negative value means vectorizing is profitable.
15115 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
15116 assert((E->State == TreeEntry::Vectorize ||
15117 E->State == TreeEntry::StridedVectorize ||
15118 E->State == TreeEntry::CompressVectorize) &&
15119 "Entry state expected to be Vectorize, StridedVectorize or "
15120 "MaskedLoadCompressVectorize here.");
15121 InstructionCost ScalarCost = 0;
15122 InstructionCost VecCost = 0;
15123 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
15124 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
15125 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
15126 "Calculated GEPs cost for Tree"));
15127
15128 return VecCost - ScalarCost;
15129 };
15130
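  // Cost of replacing a cmp+select pair (or the whole bundle, if no
  // instruction is given) with a min/max intrinsic; returns an invalid cost if
  // the conversion is not possible.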
15131 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
15132 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI ? VI : VL);
15133 if (MinMaxID == Intrinsic::not_intrinsic)
15134 return InstructionCost::getInvalid();
15135 Type *CanonicalType = Ty;
15136 if (CanonicalType->isPtrOrPtrVectorTy())
15137 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
15138 C&: CanonicalType->getContext(),
15139 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
15140
15141 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15142 {CanonicalType, CanonicalType});
15143 InstructionCost IntrinsicCost =
15144 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
    // If the selects are the only uses of the compares, the compares will be
    // dead, so subtract their cost.
15147 if (VI && SelectOnly) {
15148 assert((!Ty->isVectorTy() || SLPReVec) &&
15149 "Expected only for scalar type.");
15150 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
15151 IntrinsicCost -= TTI->getCmpSelInstrCost(
15152 Opcode: CI->getOpcode(), ValTy: Ty, CondTy: Builder.getInt1Ty(), VecPred: CI->getPredicate(),
15153 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
15154 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CI);
15155 }
15156 return IntrinsicCost;
15157 };
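  // Cost of folding the given fadd/fsub and its fmul operand into a single
  // fmuladd; invalid if the fusion is not possible.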
15158 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
15159 Instruction *VI) {
15160 InstructionCost Cost = canConvertToFMA(VL: VI, S, DT&: *DT, DL: *DL, TTI, TLI: *TLI);
15161 return Cost;
15162 };
15163 switch (ShuffleOrOp) {
15164 case Instruction::PHI: {
15165 // Count reused scalars.
15166 InstructionCost ScalarCost = 0;
15167 SmallPtrSet<const TreeEntry *, 4> CountedOps;
15168 for (Value *V : UniqueValues) {
15169 auto *PHI = dyn_cast<PHINode>(Val: V);
15170 if (!PHI)
15171 continue;
15172
15173 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
15174 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
15175 Value *Op = PHI->getIncomingValue(i: I);
15176 Operands[I] = Op;
15177 }
15178 if (const TreeEntry *OpTE =
15179 getSameValuesTreeEntry(V: Operands.front(), VL: Operands))
15180 if (CountedOps.insert(Ptr: OpTE).second &&
15181 !OpTE->ReuseShuffleIndices.empty())
15182 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
15183 OpTE->Scalars.size());
15184 }
15185
15186 return CommonCost - ScalarCost;
15187 }
15188 case Instruction::ExtractValue:
15189 case Instruction::ExtractElement: {
15190 APInt DemandedElts;
15191 VectorType *SrcVecTy = nullptr;
15192 auto GetScalarCost = [&](unsigned Idx) {
15193 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
15194 return InstructionCost(TTI::TCC_Free);
15195
15196 auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
15197 if (!SrcVecTy) {
15198 if (ShuffleOrOp == Instruction::ExtractElement) {
15199 auto *EE = cast<ExtractElementInst>(Val: I);
15200 SrcVecTy = EE->getVectorOperandType();
15201 } else {
15202 auto *EV = cast<ExtractValueInst>(Val: I);
15203 Type *AggregateTy = EV->getAggregateOperand()->getType();
15204 unsigned NumElts;
15205 if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
15206 NumElts = ATy->getNumElements();
15207 else
15208 NumElts = AggregateTy->getStructNumElements();
15209 SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
15210 }
15211 }
15212 if (I->hasOneUse()) {
15213 Instruction *Ext = I->user_back();
15214 if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
15215 all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of the
          // combined extractelement/ext pair.
15218 InstructionCost Cost = TTI->getExtractWithExtendCost(
15219 Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I),
15220 CostKind);
          // Subtract the cost of the s|zext, which is accounted for
          // separately.
15222 Cost -= TTI->getCastInstrCost(
15223 Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
15224 CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
15225 return Cost;
15226 }
15227 }
15228 if (DemandedElts.isZero())
15229 DemandedElts = APInt::getZero(numBits: getNumElements(Ty: SrcVecTy));
15230 DemandedElts.setBit(*getExtractIndex(E: I));
15231 return InstructionCost(TTI::TCC_Free);
15232 };
15233 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15234 return CommonCost - (DemandedElts.isZero()
15235 ? TTI::TCC_Free
15236 : TTI.getScalarizationOverhead(
15237 Ty: SrcVecTy, DemandedElts, /*Insert=*/false,
15238 /*Extract=*/true, CostKind));
15239 };
15240 return GetCostDiff(GetScalarCost, GetVectorCost);
15241 }
15242 case Instruction::InsertElement: {
15243 assert(E->ReuseShuffleIndices.empty() &&
15244 "Unique insertelements only are expected.");
15245 auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
15246 unsigned const NumElts = SrcVecTy->getNumElements();
15247 unsigned const NumScalars = VL.size();
15248
15249 unsigned NumOfParts = ::getNumberOfParts(TTI: *TTI, VecTy: SrcVecTy);
15250
15251 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15252 unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
15253 unsigned OffsetEnd = OffsetBeg;
15254 InsertMask[OffsetBeg] = 0;
15255 for (auto [I, V] : enumerate(First: VL.drop_front())) {
15256 unsigned Idx = *getElementIndex(Inst: V);
15257 if (OffsetBeg > Idx)
15258 OffsetBeg = Idx;
15259 else if (OffsetEnd < Idx)
15260 OffsetEnd = Idx;
15261 InsertMask[Idx] = I + 1;
15262 }
15263 unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
15264 if (NumOfParts > 0 && NumOfParts < NumElts)
15265 VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
15266 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
15267 VecScalarsSz;
15268 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
15269 unsigned InsertVecSz = std::min<unsigned>(
15270 a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
15271 b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
15272 bool IsWholeSubvector =
15273 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
15274 // Check if we can safely insert a subvector. If it is not possible, just
15275 // generate a whole-sized vector and shuffle the source vector and the new
15276 // subvector.
15277 if (OffsetBeg + InsertVecSz > VecSz) {
15278 // Align OffsetBeg to generate correct mask.
15279 OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
15280 InsertVecSz = VecSz;
15281 }
15282
15283 APInt DemandedElts = APInt::getZero(numBits: NumElts);
15284 // TODO: Add support for Instruction::InsertValue.
15285 SmallVector<int> Mask;
15286 if (!E->ReorderIndices.empty()) {
15287 inversePermutation(Indices: E->ReorderIndices, Mask);
15288 Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
15289 } else {
15290 Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
15291 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
15292 }
15293 bool IsIdentity = true;
15294 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
15295 Mask.swap(RHS&: PrevMask);
15296 for (unsigned I = 0; I < NumScalars; ++I) {
15297 unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
15298 DemandedElts.setBit(InsertIdx);
15299 IsIdentity &= InsertIdx - OffsetBeg == I;
15300 Mask[InsertIdx - OffsetBeg] = I;
15301 }
15302 assert(Offset < NumElts && "Failed to find vector index offset");
15303
15304 InstructionCost Cost = 0;
15305 Cost -=
15306 getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: SrcVecTy, DemandedElts,
15307 /*Insert*/ true, /*Extract*/ false, CostKind);
15308
15309 // First cost - resize to actual vector size if not identity shuffle or
15310 // need to shift the vector.
15311 // Do not calculate the cost if the actual size is the register size and
15312 // we can merge this shuffle with the following SK_Select.
15313 auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
15314 if (!IsIdentity)
15315 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc,
15316 Tp: InsertVecTy, Mask);
15317 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
15318 return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
15319 }));
    // Second cost - permutation with the subvector, if some elements come from
    // the initial vector, or insertion of a subvector.
15322 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
15323 // subvector of ActualVecTy.
15324 SmallBitVector InMask =
15325 isUndefVector(V: FirstInsert->getOperand(i: 0),
15326 UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
15327 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
15328 if (InsertVecSz != VecSz) {
15329 auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
15330 Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy, Mask: {},
15331 CostKind, Index: OffsetBeg - Offset, SubTp: InsertVecTy);
15332 } else {
15333 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
15334 Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
15335 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
15336 I <= End; ++I)
15337 if (Mask[I] != PoisonMaskElem)
15338 Mask[I] = I + VecSz;
15339 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
15340 Mask[I] =
15341 ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
15342 Cost +=
15343 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
15344 }
15345 }
15346 return Cost;
15347 }
15348 case Instruction::ZExt:
15349 case Instruction::SExt:
15350 case Instruction::FPToUI:
15351 case Instruction::FPToSI:
15352 case Instruction::FPExt:
15353 case Instruction::PtrToInt:
15354 case Instruction::IntToPtr:
15355 case Instruction::SIToFP:
15356 case Instruction::UIToFP:
15357 case Instruction::Trunc:
15358 case Instruction::FPTrunc:
15359 case Instruction::BitCast: {
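    // Determine the vector cast opcode, taking minimum-bitwidth demotion of
    // the source and/or destination values into account.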
15360 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
15361 Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
15362 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
15363 unsigned Opcode = ShuffleOrOp;
15364 unsigned VecOpcode = Opcode;
15365 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15366 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
15367 // Check if the values are candidates to demote.
15368 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy->getScalarType());
15369 if (SrcIt != MinBWs.end()) {
15370 SrcBWSz = SrcIt->second.first;
15371 unsigned SrcScalarTyNumElements = getNumElements(Ty: SrcScalarTy);
15372 SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
15373 SrcVecTy =
15374 getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size() * SrcScalarTyNumElements);
15375 }
15376 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
15377 if (BWSz == SrcBWSz) {
15378 VecOpcode = Instruction::BitCast;
15379 } else if (BWSz < SrcBWSz) {
15380 VecOpcode = Instruction::Trunc;
15381 } else if (It != MinBWs.end()) {
15382 assert(BWSz > SrcBWSz && "Invalid cast!");
15383 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15384 } else if (SrcIt != MinBWs.end()) {
15385 assert(BWSz > SrcBWSz && "Invalid cast!");
15386 VecOpcode =
15387 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15388 }
15389 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15390 !SrcIt->second.second) {
15391 VecOpcode = Instruction::UIToFP;
15392 }
15393 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
15394 assert(Idx == 0 && "Expected 0 index only");
15395 return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
15396 Src: VL0->getOperand(i: 0)->getType(),
15397 CCH: TTI::getCastContextHint(I: VL0), CostKind, I: VL0);
15398 };
15399 auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count the cost here if minimum bitwidth is in effect and the
      // cast is just a bitcast (a no-op here).
15402 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
15403 return CommonCost;
15404 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
15405 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));
15406
15407 bool IsArithmeticExtendedReduction =
15408 E->Idx == 0 && UserIgnoreList &&
15409 all_of(Range: *UserIgnoreList, P: [](Value *V) {
15410 auto *I = cast<Instruction>(Val: V);
15411 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
15412 Instruction::Mul, Instruction::FMul,
15413 Instruction::And, Instruction::Or,
15414 Instruction::Xor},
15415 Element: I->getOpcode());
15416 });
15417 if (IsArithmeticExtendedReduction &&
15418 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
15419 return CommonCost;
15420 return CommonCost +
15421 TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
15422 I: VecOpcode == Opcode ? VI : nullptr);
15423 };
15424 return GetCostDiff(GetScalarCost, GetVectorCost);
15425 }
15426 case Instruction::FCmp:
15427 case Instruction::ICmp:
15428 case Instruction::Select: {
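    // Try to deduce a common compare predicate for the bundle; fall back to
    // BAD_FCMP/BAD_ICMP_PREDICATE when the scalar predicates disagree.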
15429 CmpPredicate VecPred, SwappedVecPred;
15430 auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
15431 if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
15432 match(V: VL0, P: MatchCmp))
15433 SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
15434 else
15435 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
15436 ? CmpInst::BAD_FCMP_PREDICATE
15437 : CmpInst::BAD_ICMP_PREDICATE;
15438 auto GetScalarCost = [&](unsigned Idx) {
15439 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
15440 return InstructionCost(TTI::TCC_Free);
15441
15442 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
15443 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
15444 ? CmpInst::BAD_FCMP_PREDICATE
15445 : CmpInst::BAD_ICMP_PREDICATE;
15446 auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
15447 if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
15448 !match(V: VI, P: MatchCmp)) ||
15449 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
15450 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
15451 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
15452 ? CmpInst::BAD_FCMP_PREDICATE
15453 : CmpInst::BAD_ICMP_PREDICATE;
15454
15455 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
15456 Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
15457 CostKind, Op1Info: getOperandInfo(Ops: VI->getOperand(i: 0)),
15458 Op2Info: getOperandInfo(Ops: VI->getOperand(i: 1)), I: VI);
15459 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
15460 if (IntrinsicCost.isValid())
15461 ScalarCost = IntrinsicCost;
15462
15463 return ScalarCost;
15464 };
15465 auto GetVectorCost = [&](InstructionCost CommonCost) {
15466 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
15467
15468 InstructionCost VecCost =
15469 TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred,
15470 CostKind, Op1Info: getOperandInfo(Ops: E->getOperand(OpIdx: 0)),
15471 Op2Info: getOperandInfo(Ops: E->getOperand(OpIdx: 1)), I: VL0);
15472 if (auto *SI = dyn_cast<SelectInst>(Val: VL0)) {
15473 auto *CondType =
15474 getWidenedType(ScalarTy: SI->getCondition()->getType(), VF: VL.size());
15475 unsigned CondNumElements = CondType->getNumElements();
15476 unsigned VecTyNumElements = getNumElements(Ty: VecTy);
15477 assert(VecTyNumElements >= CondNumElements &&
15478 VecTyNumElements % CondNumElements == 0 &&
15479 "Cannot vectorize Instruction::Select");
15480 if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is a fixed vector type,
          // we need to replicate the condition value.
15483 VecCost += ::getShuffleCost(
15484 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: CondType,
15485 Mask: createReplicatedMask(ReplicationFactor: VecTyNumElements / CondNumElements,
15486 VF: CondNumElements));
15487 }
15488 }
15489 return VecCost + CommonCost;
15490 };
15491 return GetCostDiff(GetScalarCost, GetVectorCost);
15492 }
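  // Combined cmp+select nodes that are vectorized as a single min/max
  // intrinsic.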
15493 case TreeEntry::MinMax: {
15494 auto GetScalarCost = [&](unsigned Idx) {
15495 return GetMinMaxCost(OrigScalarTy);
15496 };
15497 auto GetVectorCost = [&](InstructionCost CommonCost) {
15498 InstructionCost VecCost = GetMinMaxCost(VecTy);
15499 return VecCost + CommonCost;
15500 };
15501 return GetCostDiff(GetScalarCost, GetVectorCost);
15502 }
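  // Combined fmul+fadd/fsub nodes that are vectorized as a single fmuladd
  // intrinsic; the fast-math flags are the intersection of the flags of all
  // scalars and their fmul operands.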
15503 case TreeEntry::FMulAdd: {
15504 auto GetScalarCost = [&](unsigned Idx) {
15505 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
15506 return InstructionCost(TTI::TCC_Free);
15507 return GetFMulAddCost(E->getOperations(),
15508 cast<Instruction>(Val: UniqueValues[Idx]));
15509 };
15510 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15511 FastMathFlags FMF;
15512 FMF.set();
15513 for (Value *V : E->Scalars) {
15514 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: V)) {
15515 FMF &= FPCI->getFastMathFlags();
15516 if (auto *FPCIOp = dyn_cast<FPMathOperator>(Val: FPCI->getOperand(i: 0)))
15517 FMF &= FPCIOp->getFastMathFlags();
15518 }
15519 }
15520 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
15521 {VecTy, VecTy, VecTy}, FMF);
15522 InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
15523 return VecCost + CommonCost;
15524 };
15525 return GetCostDiff(GetScalarCost, GetVectorCost);
15526 }
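  // Combined nodes where the scalar zext/shl chains are replaced by a single
  // vector bitcast of the zext'ed operands; the BSwap flavor additionally
  // needs a scalar bswap of the bitcast result.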
15527 case TreeEntry::ReducedBitcast:
15528 case TreeEntry::ReducedBitcastBSwap: {
15529 auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
15530 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
15531 return InstructionCost(TTI::TCC_Free);
15532 auto *Shl = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
15533 if (!Shl)
15534 return InstructionCost(TTI::TCC_Free);
15535 InstructionCost ScalarCost = TTI.getInstructionCost(U: Shl, CostKind);
15536 auto *ZExt = dyn_cast<Instruction>(Val: Shl->getOperand(i: 0));
15537 if (!ZExt)
15538 return ScalarCost;
15539 ScalarCost += TTI.getInstructionCost(U: ZExt, CostKind);
15540 return ScalarCost;
15541 };
15542 auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
15543 const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
15544 TTI::CastContextHint CastCtx =
15545 getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
15546 Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
15547 auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
15548 InstructionCost BitcastCost = TTI.getCastInstrCost(
15549 Opcode: Instruction::BitCast, Dst: ScalarTy, Src: SrcVecTy, CCH: CastCtx, CostKind);
15550 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
15551 auto *OrigScalarTy = E->getMainOp()->getType();
15552 IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, OrigScalarTy,
15553 {OrigScalarTy});
15554 InstructionCost IntrinsicCost =
15555 TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
15556 BitcastCost += IntrinsicCost;
15557 }
15558 return BitcastCost + CommonCost;
15559 };
15560 return GetCostDiff(GetScalarCost, GetVectorCost);
15561 }
15562 case Instruction::FNeg:
15563 case Instruction::Add:
15564 case Instruction::FAdd:
15565 case Instruction::Sub:
15566 case Instruction::FSub:
15567 case Instruction::Mul:
15568 case Instruction::FMul:
15569 case Instruction::UDiv:
15570 case Instruction::SDiv:
15571 case Instruction::FDiv:
15572 case Instruction::URem:
15573 case Instruction::SRem:
15574 case Instruction::FRem:
15575 case Instruction::Shl:
15576 case Instruction::LShr:
15577 case Instruction::AShr:
15578 case Instruction::And:
15579 case Instruction::Or:
15580 case Instruction::Xor: {
15581 auto GetScalarCost = [&](unsigned Idx) {
15582 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
15583 return InstructionCost(TTI::TCC_Free);
15584
15585 // We cannot retrieve the operand from UniqueValues[Idx] because an
15586 // interchangeable instruction may be used. The order and the actual
15587 // operand might differ from what is retrieved from UniqueValues[Idx].
15588 unsigned Lane = UniqueIndexes[Idx];
15589 Value *Op1 = E->getOperand(OpIdx: 0)[Lane];
15590 Value *Op2;
15591 SmallVector<const Value *, 2> Operands(1, Op1);
15592 if (isa<UnaryOperator>(Val: UniqueValues[Idx])) {
15593 Op2 = Op1;
15594 } else {
15595 Op2 = E->getOperand(OpIdx: 1)[Lane];
15596 Operands.push_back(Elt: Op2);
15597 }
15598 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: Op1);
15599 TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(V: Op2);
15600 InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
15601 Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands);
15602 if (auto *I = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
15603 I && (ShuffleOrOp == Instruction::FAdd ||
15604 ShuffleOrOp == Instruction::FSub)) {
15605 InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
15606 if (IntrinsicCost.isValid())
15607 ScalarCost = IntrinsicCost;
15608 }
15609 return ScalarCost;
15610 };
15611 auto GetVectorCost = [=](InstructionCost CommonCost) {
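      // With minimum bitwidth in effect, an 'and' with constants that have at
      // least as many trailing ones as the demoted bit width is a no-op, so
      // only the common reshuffle cost remains.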
15612 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15613 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
15614 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
15615 if (all_of(Range&: Ops, P: [&](Value *Op) {
15616 auto *CI = dyn_cast<ConstantInt>(Val: Op);
15617 return CI && CI->getValue().countr_one() >= It->second.first;
15618 }))
15619 return CommonCost;
15620 }
15621 }
15622 unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
15623 TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
15624 TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
15625 return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
15626 Opd2Info: Op2Info, Args: {}, CxtI: nullptr, TLibInfo: TLI) +
15627 CommonCost;
15628 };
15629 return GetCostDiff(GetScalarCost, GetVectorCost);
15630 }
15631 case Instruction::GetElementPtr: {
15632 return CommonCost + GetGEPCostDiff(VL, VL0);
15633 }
15634 case Instruction::Load: {
15635 auto GetScalarCost = [&](unsigned Idx) {
15636 auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
15637 return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
15638 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
15639 CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
15640 };
15641 auto *LI0 = cast<LoadInst>(Val: VL0);
15642 auto GetVectorCost = [&](InstructionCost CommonCost) {
15643 InstructionCost VecLdCost;
15644 switch (E->State) {
15645 case TreeEntry::Vectorize:
15646 if (unsigned Factor = E->getInterleaveFactor()) {
15647 VecLdCost = TTI->getInterleavedMemoryOpCost(
15648 Opcode: Instruction::Load, VecTy, Factor, Indices: {}, Alignment: LI0->getAlign(),
15649 AddressSpace: LI0->getPointerAddressSpace(), CostKind);
15650
15651 } else {
15652 VecLdCost = TTI->getMemoryOpCost(
15653 Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
15654 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
15655 }
15656 break;
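      // Strided loads are costed as a call to
      // llvm.experimental.vp.strided.load plus a bitcast when the strided load
      // type differs from the expected vector type.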
15657 case TreeEntry::StridedVectorize: {
15658 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
15659 FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
15660 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
15661 Align CommonAlignment =
15662 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
15663 VecLdCost = TTI->getMemIntrinsicInstrCost(
15664 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
15665 StridedLoadTy, LI0->getPointerOperand(),
15666 /*VariableMask=*/false, CommonAlignment),
15667 CostKind);
15668 if (StridedLoadTy != VecTy)
15669 VecLdCost +=
15670 TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: VecTy, Src: StridedLoadTy,
15671 CCH: getCastContextHint(TE: *E), CostKind);
15672
15673 break;
15674 }
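      // Compressed loads load a wider (possibly masked or interleaved) vector
      // and then shuffle the requested elements out of it.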
15675 case TreeEntry::CompressVectorize: {
15676 bool IsMasked;
15677 unsigned InterleaveFactor;
15678 SmallVector<int> CompressMask;
15679 VectorType *LoadVecTy;
15680 SmallVector<Value *> Scalars(VL);
15681 if (!E->ReorderIndices.empty()) {
15682 SmallVector<int> Mask(E->ReorderIndices.begin(),
15683 E->ReorderIndices.end());
15684 reorderScalars(Scalars, Mask);
15685 }
15686 SmallVector<Value *> PointerOps(Scalars.size());
15687 for (auto [I, V] : enumerate(First&: Scalars))
15688 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
15689 [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
15690 VL: Scalars, PointerOps, Order: E->ReorderIndices, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
15691 TLI: *TLI, AreAllUsersVectorized: [](Value *) { return true; }, IsMasked, InterleaveFactor,
15692 CompressMask, LoadVecTy);
15693 assert(IsVectorized && "Failed to vectorize load");
15694 CompressEntryToData.try_emplace(Key: E, Args&: CompressMask, Args&: LoadVecTy,
15695 Args&: InterleaveFactor, Args&: IsMasked);
15696 Align CommonAlignment = LI0->getAlign();
15697 if (InterleaveFactor) {
15698 VecLdCost = TTI->getInterleavedMemoryOpCost(
15699 Opcode: Instruction::Load, VecTy: LoadVecTy, Factor: InterleaveFactor, Indices: {},
15700 Alignment: CommonAlignment, AddressSpace: LI0->getPointerAddressSpace(), CostKind);
15701 } else if (IsMasked) {
15702 VecLdCost = TTI->getMemIntrinsicInstrCost(
15703 MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
15704 CommonAlignment,
15705 LI0->getPointerAddressSpace()),
15706 CostKind);
15707 // TODO: include this cost into CommonCost.
15708 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
15709 Tp: LoadVecTy, Mask: CompressMask, CostKind);
15710 } else {
15711 VecLdCost = TTI->getMemoryOpCost(
15712 Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
15713 AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
15714 // TODO: include this cost into CommonCost.
15715 VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
15716 Tp: LoadVecTy, Mask: CompressMask, CostKind);
15717 }
15718 break;
15719 }
15720 case TreeEntry::ScatterVectorize: {
15721 Align CommonAlignment =
15722 computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
15723 VecLdCost = TTI->getMemIntrinsicInstrCost(
15724 MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
15725 LI0->getPointerOperand(),
15726 /*VariableMask=*/false, CommonAlignment),
15727 CostKind);
15728 break;
15729 }
15730 case TreeEntry::CombinedVectorize:
15731 case TreeEntry::SplitVectorize:
15732 case TreeEntry::NeedToGather:
15733 llvm_unreachable("Unexpected vectorization state.");
15734 }
15735 return VecLdCost + CommonCost;
15736 };
15737
15738 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates a masked gather load, it is not a terminal node,
    // so the cost of the address operand is estimated separately.
15741 if (E->State == TreeEntry::ScatterVectorize)
15742 return Cost;
15743
    // Estimate the cost of the GEPs, since this tree node is a terminal node.
15745 SmallVector<Value *> PointerOps(VL.size());
15746 for (auto [I, V] : enumerate(First&: VL))
15747 PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
15748 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
15749 }
15750 case Instruction::Store: {
15751 bool IsReorder = !E->ReorderIndices.empty();
15752 auto GetScalarCost = [=](unsigned Idx) {
15753 auto *VI = cast<StoreInst>(Val: VL[Idx]);
15754 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
15755 return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
15756 Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
15757 CostKind, OpdInfo: OpInfo, I: VI);
15758 };
15759 auto *BaseSI =
15760 cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
15761 auto GetVectorCost = [=](InstructionCost CommonCost) {
15762 // We know that we can merge the stores. Calculate the cost.
15763 InstructionCost VecStCost;
15764 if (E->State == TreeEntry::StridedVectorize) {
15765 Align CommonAlignment =
15766 computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
15767 VecStCost = TTI->getMemIntrinsicInstrCost(
15768 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
15769 VecTy, BaseSI->getPointerOperand(),
15770 /*VariableMask=*/false, CommonAlignment),
15771 CostKind);
15772 } else {
15773 assert(E->State == TreeEntry::Vectorize &&
15774 "Expected either strided or consecutive stores.");
15775 if (unsigned Factor = E->getInterleaveFactor()) {
15776 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
15777 "No reused shuffles expected");
15778 CommonCost = 0;
15779 VecStCost = TTI->getInterleavedMemoryOpCost(
15780 Opcode: Instruction::Store, VecTy, Factor, Indices: {}, Alignment: BaseSI->getAlign(),
15781 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind);
15782 } else {
15783 TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
15784 VecStCost = TTI->getMemoryOpCost(
15785 Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
15786 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
15787 }
15788 }
15789 return VecStCost + CommonCost;
15790 };
15791 SmallVector<Value *> PointerOps(VL.size());
15792 for (auto [I, V] : enumerate(First&: VL)) {
15793 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
15794 PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
15795 }
15796
15797 return GetCostDiff(GetScalarCost, GetVectorCost) +
15798 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
15799 }
15800 case Instruction::Call: {
15801 auto GetScalarCost = [&](unsigned Idx) {
15802 auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
15803 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15804 if (ID != Intrinsic::not_intrinsic) {
15805 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
15806 return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
15807 }
15808 return TTI->getCallInstrCost(F: CI->getCalledFunction(),
15809 RetTy: CI->getFunctionType()->getReturnType(),
15810 Tys: CI->getFunctionType()->params(), CostKind);
15811 };
15812 auto GetVectorCost = [=](InstructionCost CommonCost) {
15813 auto *CI = cast<CallInst>(Val: VL0);
15814 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
15815 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
15816 CI, ID, VF: VecTy->getNumElements(),
15817 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
15818 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
15819 return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
15820 };
15821 return GetCostDiff(GetScalarCost, GetVectorCost);
15822 }
15823 case Instruction::ShuffleVector: {
15824 if (!SLPReVec || E->isAltShuffle())
15825 assert(E->isAltShuffle() &&
15826 ((Instruction::isBinaryOp(E->getOpcode()) &&
15827 Instruction::isBinaryOp(E->getAltOpcode())) ||
15828 (Instruction::isCast(E->getOpcode()) &&
15829 Instruction::isCast(E->getAltOpcode())) ||
15830 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
15831 "Invalid Shuffle Vector Operand");
15832 // Try to find the previous shuffle node with the same operands and same
15833 // main/alternate ops.
15834 auto TryFindNodeWithEqualOperands = [=]() {
15835 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
15836 if (TE.get() == E)
15837 break;
15838 if (TE->hasState() && TE->isAltShuffle() &&
15839 ((TE->getOpcode() == E->getOpcode() &&
15840 TE->getAltOpcode() == E->getAltOpcode()) ||
15841 (TE->getOpcode() == E->getAltOpcode() &&
15842 TE->getAltOpcode() == E->getOpcode())) &&
15843 TE->hasEqualOperands(TE: *E))
15844 return true;
15845 }
15846 return false;
15847 };
15848 auto GetScalarCost = [&](unsigned Idx) {
15849 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
15850 return InstructionCost(TTI::TCC_Free);
15851
15852 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
15853 assert(E->getMatchingMainOpOrAltOp(VI) &&
15854 "Unexpected main/alternate opcode");
15855 (void)E;
15856 return TTI->getInstructionCost(U: VI, CostKind);
15857 };
    // Need to clear CommonCost since the final shuffle cost is included in the
    // vector cost.
15860 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
15861 // VecCost is equal to sum of the cost of creating 2 vectors
15862 // and the cost of creating shuffle.
15863 InstructionCost VecCost = 0;
15864 if (TryFindNodeWithEqualOperands()) {
15865 LLVM_DEBUG({
15866 dbgs() << "SLP: diamond match for alternate node found.\n";
15867 E->dump();
15868 });
        // No need to add new vector costs here since we're going to reuse the
        // same main/alternate vector ops, just with different shuffling.
15871 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
15872 VecCost =
15873 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
15874 VecCost +=
15875 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
15876 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
15877 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
15878 VecCost = TTIRef.getCmpSelInstrCost(
15879 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred: CI0->getPredicate(), CostKind,
15880 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
15881 I: VL0);
15882 VecCost += TTIRef.getCmpSelInstrCost(
15883 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
15884 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
15885 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
15886 I: E->getAltOp());
15887 } else {
15888 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
15889 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
15890 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
15891 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
15892 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
15893 unsigned SrcBWSz =
15894 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
15895 if (SrcIt != MinBWs.end()) {
15896 SrcBWSz = SrcIt->second.first;
15897 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
15898 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
15899 }
15900 if (BWSz <= SrcBWSz) {
15901 if (BWSz < SrcBWSz)
15902 VecCost =
15903 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
15904 CCH: TTI::CastContextHint::None, CostKind);
15905 LLVM_DEBUG({
15906 dbgs()
15907 << "SLP: alternate extension, which should be truncated.\n";
15908 E->dump();
15909 });
15910 return VecCost;
15911 }
15912 }
15913 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
15914 CCH: TTI::CastContextHint::None, CostKind);
15915 VecCost +=
15916 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
15917 CCH: TTI::CastContextHint::None, CostKind);
15918 }
15919 SmallVector<int> Mask;
15920 E->buildAltOpShuffleMask(
15921 IsAltOp: [&](Instruction *I) {
15922 assert(E->getMatchingMainOpOrAltOp(I) &&
15923 "Unexpected main/alternate opcode");
15924 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
15925 TLI: *TLI);
15926 },
15927 Mask);
15928 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
15929 Tp: FinalVecTy, Mask, CostKind);
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // on x86. Reordering them into [fsub,fadd] blocks this pattern, so we
      // need to take their order into account when looking for the most used
      // order.
15934 unsigned Opcode0 = E->getOpcode();
15935 unsigned Opcode1 = E->getAltOpcode();
15936 SmallBitVector OpcodeMask(
15937 getAltInstrMask(VL: E->Scalars, ScalarTy, Opcode0, Opcode1));
15938 // If this pattern is supported by the target then we consider the
15939 // order.
15940 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
15941 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
15942 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
15943 return AltVecCost < VecCost ? AltVecCost : VecCost;
15944 }
15945 // TODO: Check the reverse order too.
15946 return VecCost;
15947 };
15948 if (SLPReVec && !E->isAltShuffle())
15949 return GetCostDiff(
15950 GetScalarCost, [&](InstructionCost) -> InstructionCost {
          // If a group uses the mask in order, the shufflevector can be
          // eliminated by instcombine, and the cost is 0.
15953 assert(isa<ShuffleVectorInst>(VL.front()) &&
15954 "Not supported shufflevector usage.");
15955 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
15956 unsigned SVNumElements =
15957 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())
15958 ->getNumElements();
15959 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
15960 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
15961 ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
15962 int NextIndex = 0;
15963 if (!all_of(Range&: Group, P: [&](Value *V) {
15964 assert(isa<ShuffleVectorInst>(V) &&
15965 "Not supported shufflevector usage.");
15966 auto *SV = cast<ShuffleVectorInst>(Val: V);
15967 int Index;
15968 [[maybe_unused]] bool IsExtractSubvectorMask =
15969 SV->isExtractSubvectorMask(Index);
15970 assert(IsExtractSubvectorMask &&
15971 "Not supported shufflevector usage.");
15972 if (NextIndex != Index)
15973 return false;
15974 NextIndex += SV->getShuffleMask().size();
15975 return true;
15976 }))
15977 return ::getShuffleCost(
15978 TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy,
15979 Mask: calculateShufflevectorMask(VL: E->Scalars));
15980 }
15981 return TTI::TCC_Free;
15982 });
15983 return GetCostDiff(GetScalarCost, GetVectorCost);
15984 }
15985 case Instruction::Freeze:
15986 return CommonCost;
15987 default:
15988 llvm_unreachable("Unknown instruction");
15989 }
15990}
15991
15992bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
15993 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable.\n");
15995
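  // A gather node is still acceptable for a tiny tree if it is all-constant or
  // a splat, small enough, forms a fixed-vector shuffle of extracts, or
  // contains loads.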
15996 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
15997 SmallVector<int> Mask;
15998 return TE->isGather() &&
15999 !any_of(Range: TE->Scalars,
16000 P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
16001 (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
16002 TE->Scalars.size() < Limit ||
16003 (((TE->hasState() &&
16004 TE->getOpcode() == Instruction::ExtractElement) ||
16005 all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
16006 isFixedVectorShuffle(VL: TE->Scalars, Mask, AC)) ||
16007 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
16008 !TE->isAltShuffle()) ||
16009 any_of(Range: TE->Scalars, P: IsaPred<LoadInst>));
16010 };
16011
16012 // We only handle trees of heights 1 and 2.
16013 if (VectorizableTree.size() == 1 &&
16014 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
16015 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
16016 VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
16017 (ForReduction &&
16018 AreVectorizableGathers(VectorizableTree[0].get(),
16019 VectorizableTree[0]->Scalars.size()) &&
16020 VectorizableTree[0]->getVectorFactor() > 2)))
16021 return true;
16022
16023 if (VectorizableTree.size() != 2)
16024 return false;
16025
  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // whose second node is a gather, if it has fewer scalar operands than the
  // initial tree element (it may be profitable to shuffle the second gather)
  // or its scalars are extractelements that form a shuffle.
16030 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
16031 AreVectorizableGathers(VectorizableTree[1].get(),
16032 VectorizableTree[0]->Scalars.size()))
16033 return true;
16034
16035 // Gathering cost would be too much for tiny trees.
16036 if (VectorizableTree[0]->isGather() ||
16037 (VectorizableTree[1]->isGather() &&
16038 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
16039 VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
16040 VectorizableTree[0]->State != TreeEntry::CompressVectorize))
16041 return false;
16042
16043 return true;
16044}
16045
16046static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
16047 TargetTransformInfo *TTI,
16048 bool MustMatchOrInst) {
16049 // Look past the root to find a source value. Arbitrarily follow the
16050 // path through operand 0 of any 'or'. Also, peek through optional
16051 // shift-left-by-multiple-of-8-bits.
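  // E.g., for a root like
  //   or(shl(zext(load p1), 8), zext(load p0))
  // following operand 0 through the or/shl chain ends at zext(load p1).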
16052 Value *ZextLoad = Root;
16053 const APInt *ShAmtC;
16054 bool FoundOr = false;
16055 while (!isa<ConstantExpr>(Val: ZextLoad) &&
16056 (match(V: ZextLoad, P: m_Or(L: m_Value(), R: m_Value())) ||
16057 (match(V: ZextLoad, P: m_Shl(L: m_Value(), R: m_APInt(Res&: ShAmtC))) &&
16058 ShAmtC->urem(RHS: 8) == 0))) {
16059 auto *BinOp = cast<BinaryOperator>(Val: ZextLoad);
16060 ZextLoad = BinOp->getOperand(i_nocapture: 0);
16061 if (BinOp->getOpcode() == Instruction::Or)
16062 FoundOr = true;
16063 }
16064 // Check if the input is an extended load of the required or/shift expression.
16065 Value *Load;
16066 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
16067 !match(V: ZextLoad, P: m_ZExt(Op: m_Value(V&: Load))) || !isa<LoadInst>(Val: Load))
16068 return false;
16069
16070 // Require that the total load bit width is a legal integer type.
16071 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
16072 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
16073 Type *SrcTy = Load->getType();
16074 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
16075 if (!TTI->isTypeLegal(Ty: IntegerType::get(C&: Root->getContext(), NumBits: LoadBitWidth)))
16076 return false;
16077
16078 // Everything matched - assume that we can fold the whole sequence using
16079 // load combining.
16080 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
16081 << *(cast<Instruction>(Root)) << "\n");
16082
16083 return true;
16084}
16085
16086bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
16087 if (RdxKind != RecurKind::Or)
16088 return false;
16089
16090 unsigned NumElts = VectorizableTree[0]->Scalars.size();
16091 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
16092 return isLoadCombineCandidateImpl(Root: FirstReduced, NumElts, TTI,
16093 /* MatchOr */ MustMatchOrInst: false);
16094}
16095
16096bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
16097 // Peek through a final sequence of stores and check if all operations are
16098 // likely to be load-combined.
16099 unsigned NumElts = Stores.size();
16100 for (Value *Scalar : Stores) {
16101 Value *X;
16102 if (!match(V: Scalar, P: m_Store(ValueOp: m_Value(V&: X), PointerOp: m_Value())) ||
16103 !isLoadCombineCandidateImpl(Root: X, NumElts, TTI, /* MatchOr */ MustMatchOrInst: true))
16104 return false;
16105 }
16106 return true;
16107}
16108
16109bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
16110 if (!DebugCounter::shouldExecute(Counter&: VectorizedGraphs))
16111 return true;
16112
16113 // Graph is empty - do nothing.
16114 if (VectorizableTree.empty()) {
16115 assert(ExternalUses.empty() && "We shouldn't have any external users");
16116
16117 return true;
16118 }
16119
16120 // No need to vectorize inserts of gathered values.
16121 if (VectorizableTree.size() == 2 &&
16122 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
16123 VectorizableTree[1]->isGather() &&
16124 (VectorizableTree[1]->getVectorFactor() <= 2 ||
16125 !(isSplat(VL: VectorizableTree[1]->Scalars) ||
16126 allConstant(VL: VectorizableTree[1]->Scalars))))
16127 return true;
16128
  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for vectorization and we can skip it, as long as the cost
  // threshold is the default. The cost of vectorized PHI nodes is almost
  // always 0 plus the cost of gathers/buildvectors.
16133 constexpr int Limit = 4;
16134 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
16135 !VectorizableTree.empty() &&
16136 all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
16137 return (TE->isGather() &&
16138 (!TE->hasState() ||
16139 TE->getOpcode() != Instruction::ExtractElement) &&
16140 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
16141 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
16142 }))
16143 return true;
16144
  // Do not vectorize a small tree consisting only of phis, if all vector phis
  // are also gathered.
16147 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16148 VectorizableTree.size() <= Limit &&
16149 all_of(Range: VectorizableTree,
16150 P: [&](const std::unique_ptr<TreeEntry> &TE) {
16151 return (TE->isGather() &&
16152 (!TE->hasState() ||
16153 TE->getOpcode() != Instruction::ExtractElement) &&
16154 count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <=
16155 Limit) ||
16156 (TE->hasState() &&
16157 (TE->getOpcode() == Instruction::InsertElement ||
16158 (TE->getOpcode() == Instruction::PHI &&
16159 all_of(Range&: TE->Scalars, P: [&](Value *V) {
16160 return isa<PoisonValue>(Val: V) || MustGather.contains(Ptr: V);
16161 }))));
16162 }) &&
16163 any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
16164 return TE->State == TreeEntry::Vectorize &&
16165 TE->getOpcode() == Instruction::PHI;
16166 }))
16167 return true;
16168
16169 // If the tree contains only phis, buildvectors, split nodes and
16170 // small nodes with reuses, we can skip it.
16171 SmallVector<const TreeEntry *> StoreLoadNodes;
16172 unsigned NumGathers = 0;
16173 constexpr int LimitTreeSize = 36;
16174 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
16175 all_of(Range: VectorizableTree,
16176 P: [&](const std::unique_ptr<TreeEntry> &TE) {
16177 if (!TE->isGather() && TE->hasState() &&
16178 (TE->getOpcode() == Instruction::Load ||
16179 TE->getOpcode() == Instruction::Store)) {
16180 StoreLoadNodes.push_back(Elt: TE.get());
16181 return true;
16182 }
16183 if (TE->isGather())
16184 ++NumGathers;
16185 return TE->State == TreeEntry::SplitVectorize ||
16186 (TE->Idx == 0 && TE->Scalars.size() == 2 &&
16187 TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
16188 VectorizableTree.size() > LimitTreeSize) ||
16189 (TE->isGather() &&
16190 none_of(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>)) ||
16191 (TE->hasState() &&
16192 (TE->getOpcode() == Instruction::PHI ||
16193 (TE->hasCopyableElements() &&
16194 static_cast<unsigned>(count_if(
16195 Range&: TE->Scalars, P: IsaPred<PHINode, Constant>)) >=
16196 TE->Scalars.size() / 2) ||
16197 ((!TE->ReuseShuffleIndices.empty() ||
16198 !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
16199 TE->Scalars.size() == 2)));
16200 }) &&
16201 (StoreLoadNodes.empty() ||
16202 (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
16203 (NumGathers > 0 || none_of(Range&: StoreLoadNodes, P: [&](const TreeEntry *TE) {
16204 return TE->getOpcode() == Instruction::Store ||
16205 all_of(Range: TE->Scalars, P: [&](Value *V) {
16206 return !isa<LoadInst>(Val: V) ||
16207 areAllUsersVectorized(I: cast<Instruction>(Val: V));
16208 });
16209 })))))
16210 return true;
16211
  // If the tree contains only a buildvector, 2 non-buildvector nodes (whose
  // user is the root tree node) and other buildvectors, we can skip it.
16214 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16215 VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
16216 VectorizableTree.size() >= Limit &&
16217 count_if(Range: ArrayRef(VectorizableTree).drop_front(),
16218 P: [&](const std::unique_ptr<TreeEntry> &TE) {
16219 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
16220 TE->UserTreeIndex.UserTE->Idx == 0;
16221 }) == 2)
16222 return true;
16223
  // If the tree contains only the vectorization of a phi node coming from a
  // buildvector, skip it.
16226 if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
16227 VectorizableTree.size() > 2 &&
16228 VectorizableTree.front()->State == TreeEntry::Vectorize &&
16229 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16230 VectorizableTree[1]->State == TreeEntry::Vectorize &&
16231 VectorizableTree[1]->getOpcode() == Instruction::PHI &&
16232 all_of(
16233 Range: ArrayRef(VectorizableTree).drop_front(N: 2),
16234 P: [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
16235 return true;
16236
16237 // We can vectorize the tree if its size is greater than or equal to the
16238 // minimum size specified by the MinTreeSize command line option.
16239 if (VectorizableTree.size() >= MinTreeSize)
16240 return false;
16241
16242 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
16243 // can vectorize it if we can prove it fully vectorizable.
16244 if (isFullyVectorizableTinyTree(ForReduction))
16245 return false;
16246
  // Check if any of the gather nodes forms an insertelement buildvector
  // somewhere.
16249 bool IsAllowedSingleBVNode =
16250 VectorizableTree.size() > 1 ||
16251 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
16252 !VectorizableTree.front()->isAltShuffle() &&
16253 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
16254 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
16255 allSameBlock(VL: VectorizableTree.front()->Scalars));
16256 if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
16257 return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
16258 return isa<ExtractElementInst, Constant>(Val: V) ||
16259 (IsAllowedSingleBVNode &&
16260 !V->hasNUsesOrMore(N: UsesLimit) &&
16261 any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
16262 });
16263 }))
16264 return false;
16265
16266 if (VectorizableTree.back()->isGather() &&
16267 VectorizableTree.back()->hasState() &&
16268 VectorizableTree.back()->isAltShuffle() &&
16269 VectorizableTree.back()->getVectorFactor() > 2 &&
16270 allSameBlock(VL: VectorizableTree.back()->Scalars) &&
16271 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
16272 TTI->getScalarizationOverhead(
16273 Ty: getWidenedType(ScalarTy: VectorizableTree.back()->Scalars.front()->getType(),
16274 VF: VectorizableTree.back()->getVectorFactor()),
16275 DemandedElts: APInt::getAllOnes(numBits: VectorizableTree.back()->getVectorFactor()),
16276 /*Insert=*/true, /*Extract=*/false,
16277 CostKind: TTI::TCK_RecipThroughput) > -SLPCostThreshold)
16278 return false;
16279
16280 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
16281 // vectorizable.
16282 return true;
16283}
16284
16285bool BoUpSLP::isTreeNotExtendable() const {
16286 if (getCanonicalGraphSize() != getTreeSize()) {
16287 constexpr unsigned SmallTree = 3;
16288 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16289 getCanonicalGraphSize() <= SmallTree &&
16290 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
16291 P: [](const std::unique_ptr<TreeEntry> &TE) {
16292 return TE->isGather() && TE->hasState() &&
16293 TE->getOpcode() == Instruction::Load &&
16294 !allSameBlock(VL: TE->Scalars);
16295 }) == 1)
16296 return true;
16297 return false;
16298 }
16299 bool Res = false;
16300 for (unsigned Idx : seq<unsigned>(Size: getTreeSize())) {
16301 TreeEntry &E = *VectorizableTree[Idx];
16302 if (E.State == TreeEntry::SplitVectorize)
16303 return false;
16304 if (!E.isGather())
16305 continue;
16306 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16307 (!E.hasState() &&
16308 all_of(Range&: E.Scalars, P: IsaPred<ExtractElementInst, LoadInst>)) ||
16309 (isa<ExtractElementInst>(Val: E.Scalars.front()) &&
16310 getSameOpcode(VL: ArrayRef(E.Scalars).drop_front(), TLI: *TLI).valid()))
16311 return false;
16312 if (isSplat(VL: E.Scalars) || allConstant(VL: E.Scalars))
16313 continue;
16314 Res = true;
16315 }
16316 return Res;
16317}
16318
16319InstructionCost BoUpSLP::getSpillCost() {
16320 // Walk from the bottom of the tree to the top, tracking which values are
16321 // live. When we see a call instruction that is not part of our tree,
16322 // query TTI to see if there is a cost to keeping values live over it
16323 // (for example, if spills and fills are required).
16324
16325 const TreeEntry *Root = VectorizableTree.front().get();
16326 if (Root->isGather())
16327 return 0;
16328
16329 InstructionCost Cost = 0;
16330 SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
16331 EntriesToOperands;
16332 SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
16333 SmallPtrSet<const Instruction *, 8> LastInstructions;
16334 SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
16335 for (const auto &TEPtr : VectorizableTree) {
16336 if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
16337 TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
16338 ScalarOrPseudoEntries.insert(Ptr: TEPtr.get());
16339 continue;
16340 }
16341 if (!TEPtr->isGather()) {
16342 Instruction *LastInst = &getLastInstructionInBundle(E: TEPtr.get());
16343 EntriesToLastInstruction.try_emplace(Key: TEPtr.get(), Args&: LastInst);
16344 LastInstructions.insert(Ptr: LastInst);
16345 }
16346 if (TEPtr->UserTreeIndex)
16347 EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(Elt: TEPtr.get());
16348 }
16349
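  // Returns true for intrinsic calls that are expected to be expanded inline
  // rather than lowered to a real call (assume-like intrinsics, or intrinsics
  // whose intrinsic cost is below the matching call cost). Such calls do not
  // force values to be spilled.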
16350 auto NoCallIntrinsic = [this](const Instruction *I) {
16351 const auto *II = dyn_cast<IntrinsicInst>(Val: I);
16352 if (!II)
16353 return false;
16354 if (II->isAssumeLikeIntrinsic())
16355 return true;
16356 IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
16357 InstructionCost IntrCost =
16358 TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
16359 InstructionCost CallCost = TTI->getCallInstrCost(
16360 F: nullptr, RetTy: II->getType(), Tys: ICA.getArgTypes(), CostKind: TTI::TCK_RecipThroughput);
16361 return IntrCost < CallCost;
16362 };
16363
  // Maps the last instruction of an entry to the last instruction of one of
  // its operand entries, plus a flag. If the flag is true, there are no
  // non-vectorized calls between these two instructions.
16367 SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
16368 CheckedInstructions;
16369 unsigned Budget = 0;
16370 const unsigned BudgetLimit =
16371 ScheduleRegionSizeBudget / VectorizableTree.size();
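  // Returns true if no non-vectorized call that would force a spill is found
  // between First and Last (both in the same block), scanning backwards from
  // Last within the instruction budget. Results are cached per last
  // instruction in CheckedInstructions.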
16372 auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
16373 const Instruction *Last) {
16374 assert(First->getParent() == Last->getParent() &&
16375 "Expected instructions in same block.");
16376 if (auto It = CheckedInstructions.find(Val: Last);
16377 It != CheckedInstructions.end()) {
16378 const Instruction *Checked = It->second.getPointer();
16379 if (Checked == First || Checked->comesBefore(Other: First))
16380 return It->second.getInt() != 0;
16381 Last = Checked;
16382 } else if (Last == First || Last->comesBefore(Other: First)) {
16383 return true;
16384 }
16385 BasicBlock::const_reverse_iterator InstIt =
16386 ++First->getIterator().getReverse(),
16387 PrevInstIt =
16388 Last->getIterator().getReverse();
16389 SmallVector<const Instruction *> LastInstsInRange;
16390 while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
16391 // Debug information does not impact spill cost.
16392 // Vectorized calls, represented as vector intrinsics, do not impact spill
16393 // cost.
16394 if (const auto *CB = dyn_cast<CallBase>(Val: &*PrevInstIt);
16395 CB && !NoCallIntrinsic(CB) && !isVectorized(V: CB)) {
16396 for (const Instruction *LastInst : LastInstsInRange)
16397 CheckedInstructions.try_emplace(Key: LastInst, Args: &*PrevInstIt, Args: 0);
16398 return false;
16399 }
16400 if (LastInstructions.contains(Ptr: &*PrevInstIt))
16401 LastInstsInRange.push_back(Elt: &*PrevInstIt);
16402
16403 ++PrevInstIt;
16404 ++Budget;
16405 }
16406 for (const Instruction *LastInst : LastInstsInRange)
16407 CheckedInstructions.try_emplace(
16408 Key: LastInst, Args: PrevInstIt == InstIt ? First : &*PrevInstIt,
16409 Args: Budget <= BudgetLimit ? 1 : 0);
16410 return Budget <= BudgetLimit;
16411 };
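  // Adds the cost of keeping the vectorized value of the operand entry live
  // across a call. For REVEC, where the scalars are themselves vectors, the
  // cost of keeping the original small vectors live is subtracted.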
16412 auto AddCosts = [&](const TreeEntry *Op) {
16413 if (ScalarOrPseudoEntries.contains(Ptr: Op))
16414 return;
16415 Type *ScalarTy = Op->Scalars.front()->getType();
16416 auto It = MinBWs.find(Val: Op);
16417 if (It != MinBWs.end())
16418 ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
16419 auto *VecTy = getWidenedType(ScalarTy, VF: Op->getVectorFactor());
16420 Cost += TTI->getCostOfKeepingLiveOverCall(Tys: VecTy);
16421 if (ScalarTy->isVectorTy()) {
16422 // Handle revec dead vector instructions.
16423 Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(Tys: ScalarTy);
16424 }
16425 };
  // Memoize, per pair of blocks, whether there is (at least one)
  // non-vectorized call between them. This allows skipping the analysis of the
  // same block paths multiple times.
16429 SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
16430 ParentOpParentToPreds;
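  // Returns true if no non-vectorized call is found on the paths from the
  // operand's block to the root block (through Pred for phi operands), within
  // the scan budget. Results are memoized in ParentOpParentToPreds.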
16431 auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
16432 BasicBlock *OpParent) {
16433 auto Key = std::make_pair(x&: Root, y&: OpParent);
16434 if (auto It = ParentOpParentToPreds.find(Val: Key);
16435 It != ParentOpParentToPreds.end())
16436 return It->second;
16437 SmallVector<BasicBlock *> Worklist;
16438 if (Pred)
16439 Worklist.push_back(Elt: Pred);
16440 else
16441 Worklist.append(in_start: pred_begin(BB: Root), in_end: pred_end(BB: Root));
16442 SmallPtrSet<const BasicBlock *, 16> Visited;
16443 SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
16444 ParentsPairsToAdd;
16445 bool Res = false;
16446 llvm::scope_exit Cleanup([&]() {
16447 for (const auto &KeyPair : ParentsPairsToAdd) {
16448 assert(!ParentOpParentToPreds.contains(KeyPair) &&
16449 "Should not have been added before.");
16450 ParentOpParentToPreds.try_emplace(Key: KeyPair, Args&: Res);
16451 }
16452 });
16453 while (!Worklist.empty()) {
16454 BasicBlock *BB = Worklist.pop_back_val();
16455 if (BB == OpParent || !Visited.insert(Ptr: BB).second)
16456 continue;
16457 auto Pair = std::make_pair(x&: BB, y&: OpParent);
16458 if (auto It = ParentOpParentToPreds.find(Val: Pair);
16459 It != ParentOpParentToPreds.end()) {
16460 Res = It->second;
16461 return Res;
16462 }
16463 ParentsPairsToAdd.insert(V: Pair);
16464 unsigned BlockSize = BB->size();
16465 if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
16466 return Res;
16467 Budget += BlockSize;
16468 if (Budget > BudgetLimit)
16469 return Res;
16470 if (!isa<CatchSwitchInst>(Val: BB->getTerminator()) &&
16471 !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
16472 BB->getTerminator()))
16473 return Res;
16474 Worklist.append(in_start: pred_begin(BB), in_end: pred_end(BB));
16475 }
16476 Res = true;
16477 return Res;
16478 };
16479 SmallVector<const TreeEntry *> LiveEntries(1, Root);
16480 auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
16481 assert(ScalarOrPseudoEntries.contains(E) &&
16482 "Expected scalar or pseudo entry.");
16483 const TreeEntry *Entry = E;
16484 while (Entry->UserTreeIndex) {
16485 Entry = Entry->UserTreeIndex.UserTE;
16486 if (!ScalarOrPseudoEntries.contains(Ptr: Entry))
16487 return Entry;
16488 }
16489 return nullptr;
16490 };
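  // Walk the tree from the root towards the leaves. For every vectorized
  // operand, check whether a non-vectorized call may be executed between the
  // operand's last instruction and its user's last instruction; if so, account
  // for the cost of keeping the operand's vector value live over that call.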
16491 while (!LiveEntries.empty()) {
16492 const TreeEntry *Entry = LiveEntries.pop_back_val();
16493 SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Val: Entry);
16494 if (Operands.empty())
16495 continue;
16496 if (ScalarOrPseudoEntries.contains(Ptr: Entry)) {
16497 Entry = FindNonScalarParentEntry(Entry);
16498 if (!Entry) {
16499 for (const TreeEntry *Op : Operands) {
16500 if (!Op->isGather())
16501 LiveEntries.push_back(Elt: Op);
16502 }
16503 continue;
16504 }
16505 }
16506 Instruction *LastInst = EntriesToLastInstruction.at(Val: Entry);
16507 BasicBlock *Parent = LastInst->getParent();
16508 for (const TreeEntry *Op : Operands) {
16509 if (!Op->isGather())
16510 LiveEntries.push_back(Elt: Op);
16511 if (ScalarOrPseudoEntries.contains(Ptr: Op))
16512 continue;
16513 if (Entry->State == TreeEntry::SplitVectorize ||
16514 (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
16515 (Op->isGather() && allConstant(VL: Op->Scalars)))
16516 continue;
16517 Budget = 0;
16518 BasicBlock *Pred = nullptr;
16519 if (auto *Phi = dyn_cast<PHINode>(Val: Entry->getMainOp()))
16520 Pred = Phi->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
16521 BasicBlock *OpParent;
16522 Instruction *OpLastInst;
16523 if (Op->isGather()) {
16524 assert(Entry->getOpcode() == Instruction::PHI &&
16525 "Expected phi node only.");
16526 OpParent = cast<PHINode>(Val: Entry->getMainOp())
16527 ->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
16528 OpLastInst = OpParent->getTerminator();
16529 for (Value *V : Op->Scalars) {
16530 auto *Inst = dyn_cast<Instruction>(Val: V);
16531 if (!Inst)
16532 continue;
16533 if (isVectorized(V)) {
16534 OpParent = Inst->getParent();
16535 OpLastInst = Inst;
16536 break;
16537 }
16538 }
16539 } else {
16540 OpLastInst = EntriesToLastInstruction.at(Val: Op);
16541 OpParent = OpLastInst->getParent();
16542 }
      // Check for call instructions within the same basic block.
16544 if (OpParent == Parent) {
16545 if (Entry->getOpcode() == Instruction::PHI) {
16546 if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
16547 AddCosts(Op);
16548 continue;
16549 }
16550 if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
16551 AddCosts(Op);
16552 continue;
16553 }
      // Check for call instructions in between blocks.
      // 1. Check the entry's block, from the block head down to the entry's
      // last instruction.
16556 if (Entry->getOpcode() != Instruction::PHI &&
16557 !CheckForNonVecCallsInSameBlock(
16558 &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
16559 AddCosts(Op);
16560 continue;
16561 }
16562 // 2. Check op's block from the end.
16563 if (!CheckForNonVecCallsInSameBlock(OpLastInst,
16564 OpParent->getTerminator())) {
16565 AddCosts(Op);
16566 continue;
16567 }
16568 // 3. Check the predecessors of entry's block till op's block.
16569 if (!CheckPredecessors(Parent, Pred, OpParent)) {
16570 AddCosts(Op);
16571 continue;
16572 }
16573 }
16574 }
16575
16576 return Cost;
16577}
16578
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
16581static bool isFirstInsertElement(const InsertElementInst *IE1,
16582 const InsertElementInst *IE2) {
16583 if (IE1 == IE2)
16584 return false;
16585 const auto *I1 = IE1;
16586 const auto *I2 = IE2;
16587 const InsertElementInst *PrevI1;
16588 const InsertElementInst *PrevI2;
16589 unsigned Idx1 = *getElementIndex(Inst: IE1);
16590 unsigned Idx2 = *getElementIndex(Inst: IE2);
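  // Walk both insertelement chains towards their base operands in lock-step.
  // Whichever chain reaches the other's starting instruction first determines
  // which insertelement comes earlier in the buildvector sequence.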
16591 do {
16592 if (I2 == IE1)
16593 return true;
16594 if (I1 == IE2)
16595 return false;
16596 PrevI1 = I1;
16597 PrevI2 = I2;
16598 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
16599 getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
16600 I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
16601 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
16602 getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
16603 I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
16604 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
16605 llvm_unreachable("Two different buildvectors not expected.");
16606}
16607
16608namespace {
/// Returns the incoming Value * if the requested type is Value * too, or a
/// default-constructed value otherwise.
16611struct ValueSelect {
16612 template <typename U>
16613 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16614 return V;
16615 }
16616 template <typename U>
16617 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16618 return U();
16619 }
16620};
16621} // namespace
16622
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// the common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 vectors.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for a single vector with the given mask, if it is not the
/// identity mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
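///
/// For example (illustrative only): with an undef Base, two input vectors of
/// VF 4 and the masks {0, poison, 1, poison} and {poison, 2, poison, 3}, the
/// second mask is folded into the first as {0, 6, 1, 7} and a single
/// two-source action is performed on the pair of input vectors.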
16635template <typename T>
16636static T *performExtractsShuffleAction(
16637 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
16638 function_ref<unsigned(T *)> GetVF,
16639 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
16640 function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
16641 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
16642 SmallVector<int> Mask(ShuffleMask.begin()->second);
16643 auto VMIt = std::next(ShuffleMask.begin());
16644 T *Prev = nullptr;
16645 SmallBitVector UseMask =
16646 buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
16647 SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
16648 if (!IsBaseUndef.all()) {
16649 // Base is not undef, need to combine it with the next subvectors.
16650 std::pair<T *, bool> Res =
16651 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
16652 SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
16653 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
16654 if (Mask[Idx] == PoisonMaskElem)
16655 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
16656 else
16657 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
16658 }
16659 [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
16660 assert((!V || GetVF(V) == Mask.size()) &&
16661 "Expected base vector of VF number of elements.");
16662 Prev = Action(Mask, {nullptr, Res.first});
16663 } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only
    // for a single vector, if the mask is not the identity mask.
16666 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
16667 /*ForSingleMask=*/true);
16668 if (Res.second)
16669 // Identity mask is found.
16670 Prev = Res.first;
16671 else
16672 Prev = Action(Mask, {ShuffleMask.begin()->first});
16673 } else {
    // Base is undef and at least 2 input vectors are shuffled - perform
    // two-vector shuffles step by step, combining the masks between the steps.
16676 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
16677 unsigned Vec2VF = GetVF(VMIt->first);
16678 if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size;
      // we can shuffle them directly.
16681 ArrayRef<int> SecMask = VMIt->second;
16682 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16683 if (SecMask[I] != PoisonMaskElem) {
16684 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16685 Mask[I] = SecMask[I] + Vec1VF;
16686 }
16687 }
16688 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
16689 } else {
16690 // Vectors of different sizes - resize and reshuffle.
16691 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
16692 /*ForSingleMask=*/false);
16693 std::pair<T *, bool> Res2 =
16694 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16695 ArrayRef<int> SecMask = VMIt->second;
16696 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16697 if (Mask[I] != PoisonMaskElem) {
16698 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16699 if (Res1.second)
16700 Mask[I] = I;
16701 } else if (SecMask[I] != PoisonMaskElem) {
16702 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
16703 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
16704 }
16705 }
16706 Prev = Action(Mask, {Res1.first, Res2.first});
16707 }
16708 VMIt = std::next(VMIt);
16709 }
16710 [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
16711 // Perform requested actions for the remaining masks/vectors.
16712 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
16713 // Shuffle other input vectors, if any.
16714 std::pair<T *, bool> Res =
16715 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
16716 ArrayRef<int> SecMask = VMIt->second;
16717 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
16718 if (SecMask[I] != PoisonMaskElem) {
16719 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
16720 "Multiple uses of scalars.");
16721 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
16722 } else if (Mask[I] != PoisonMaskElem) {
16723 Mask[I] = I;
16724 }
16725 }
16726 Prev = Action(Mask, {Prev, Res.first});
16727 }
16728 return Prev;
16729}
16730
16731InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
16732 ArrayRef<Value *> VectorizedVals) {
16733 SmallDenseMap<const TreeEntry *, InstructionCost> NodesCosts;
16734 SmallPtrSet<Value *, 4> CheckedExtracts;
16735 SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
16736 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
16737 << VectorizableTree.size() << ".\n");
16738 InstructionCost Cost = 0;
16739 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16740 TreeEntry &TE = *Ptr;
    // No need to count the cost for combined entries - they are combined, so
    // just skip their cost.
16743 if (TE.State == TreeEntry::CombinedVectorize) {
16744 LLVM_DEBUG(
16745 dbgs() << "SLP: Skipping cost for combined node that starts with "
16746 << *TE.Scalars[0] << ".\n";
16747 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
16748 NodesCosts.try_emplace(Key: &TE);
16749 continue;
16750 }
16751 if (TE.hasState() &&
16752 (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
16753 if (const TreeEntry *E =
16754 getSameValuesTreeEntry(V: TE.getMainOp(), VL: TE.Scalars);
16755 E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be exactly the same as some vectorizable
        // nodes after reordering; treat them as free.
16758 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
16759 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16760 << "SLP: Current total cost = " << Cost << "\n");
16761 NodesCosts.try_emplace(Key: &TE);
16762 continue;
16763 }
16764 }
16765
16766 // Exclude cost of gather loads nodes which are not used. These nodes were
16767 // built as part of the final attempt to vectorize gathered loads.
16768 assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
16769 "Expected gather nodes with users only.");
16770
16771 InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
16772 Cost += C;
16773 NodesCosts.try_emplace(Key: &TE, Args&: C);
16774 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
16775 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
16776 << "SLP: Current total cost = " << Cost << "\n");
16777 // Add gathered loads nodes to the set for later processing.
16778 if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
16779 TE.getOpcode() == Instruction::Load)
16780 GatheredLoadsNodes.insert(X: &TE);
16781 }
16782 // Bail out if the cost threshold is negative and cost already below it.
16783 if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
16784 Cost < -SLPCostThreshold)
16785 return Cost;
  // Is this a narrow, non-profitable tree inside a loop? Skip it, as it may
  // cause regressions.
16787 constexpr unsigned PartLimit = 2;
16788 const unsigned Sz =
16789 getVectorElementSize(V: VectorizableTree.front()->Scalars.front());
16790 const unsigned MinVF = getMinVF(Sz);
16791 if (Cost >= -SLPCostThreshold &&
16792 VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
16793 (!VectorizableTree.front()->hasState() ||
16794 (VectorizableTree.front()->getOpcode() != Instruction::Store &&
16795 LI->getLoopFor(BB: VectorizableTree.front()->getMainOp()->getParent()))))
16796 return Cost;
16797 SmallVector<std::pair<InstructionCost, SmallVector<unsigned>>> SubtreeCosts(
16798 VectorizableTree.size());
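  // Accumulate per-subtree costs: propagate the cost of each node to all of
  // its transitive users, so that SubtreeCosts[I] ends up holding the
  // accumulated cost of the subtree rooted at node I together with the indices
  // of the nodes in that subtree.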
16799 auto UpdateParentNodes =
16800 [&](const TreeEntry *UserTE, const TreeEntry *TE, InstructionCost C,
16801 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
16802 &VisitedUser,
16803 bool AddToList = true) {
16804 while (UserTE &&
16805 VisitedUser.insert(V: std::make_pair(x&: TE, y&: UserTE)).second) {
16806 SubtreeCosts[UserTE->Idx].first += C;
16807 if (AddToList)
16808 SubtreeCosts[UserTE->Idx].second.push_back(Elt: TE->Idx);
16809 UserTE = UserTE->UserTreeIndex.UserTE;
16810 }
16811 };
16812 for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
16813 TreeEntry &TE = *Ptr;
16814 InstructionCost C = NodesCosts.at(Val: &TE);
16815 SubtreeCosts[TE.Idx].first += C;
16816 if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
16817 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
16818 VisitedUser;
16819 UpdateParentNodes(UserTE, &TE, C, VisitedUser);
16820 }
16821 }
16822 SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
16823 for (TreeEntry *TE : GatheredLoadsNodes) {
16824 InstructionCost C = SubtreeCosts[TE->Idx].first;
16825 for (Value *V : TE->Scalars) {
16826 for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(Val: V))
16827 UpdateParentNodes(BVTE, TE, C, Visited, /*AddToList=*/false);
16828 }
16829 }
16830 Visited.clear();
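  // Visit subtrees in order of decreasing accumulated cost and compare each
  // subtree's cost against the cost of simply gathering its root's scalars;
  // subtrees that are not profitable are trimmed (turned into gathers or
  // deleted).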
16831 using CostIndicesTy =
16832 std::pair<TreeEntry *, std::pair<InstructionCost, SmallVector<unsigned>>>;
16833 struct FirstGreater {
16834 bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
16835 return LHS.second.first < RHS.second.first ||
16836 (LHS.second.first == RHS.second.first &&
16837 LHS.first->Idx < RHS.first->Idx);
16838 }
16839 };
16840 PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
16841 Worklist;
16842 for (const auto [Idx, P] : enumerate(First&: SubtreeCosts))
16843 Worklist.emplace(args: VectorizableTree[Idx].get(), args&: P);
16844
16845 // Narrow store trees with non-profitable immediate values - exit.
16846 if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
16847 VectorizableTree.front()->hasState() &&
16848 VectorizableTree.front()->getOpcode() == Instruction::Store &&
16849 (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
16850 return Cost;
16851
16852 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
16853 bool Changed = false;
16854 while (!Worklist.empty() && Worklist.top().second.first > 0) {
16855 TreeEntry *TE = Worklist.top().first;
16856 if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(Ptr: TE) ||
16857 // Exit early if the parent node is split node and any of scalars is
16858 // used in other split nodes.
16859 (TE->UserTreeIndex &&
16860 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
16861 any_of(Range&: TE->Scalars, P: [&](Value *V) {
16862 ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
16863 return Entries.size() > 1;
16864 }))) {
16865 Worklist.pop();
16866 continue;
16867 }
16868
16869 // Calculate the gather cost of the root node.
16870 InstructionCost SubtreeCost = Worklist.top().second.first;
16871 if (SubtreeCost < TE->Scalars.size()) {
16872 Worklist.pop();
16873 continue;
16874 }
16875 if (!TransformedToGatherNodes.empty()) {
16876 for (unsigned Idx : Worklist.top().second.second) {
16877 auto It = TransformedToGatherNodes.find(Val: VectorizableTree[Idx].get());
16878 if (It != TransformedToGatherNodes.end()) {
16879 SubtreeCost -= SubtreeCosts[Idx].first;
16880 SubtreeCost += It->second;
16881 }
16882 }
16883 }
16884 if (SubtreeCost < 0 || SubtreeCost < TE->Scalars.size()) {
16885 Worklist.pop();
16886 continue;
16887 }
16888 const unsigned Sz = TE->Scalars.size();
16889 APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
16890 for (auto [Idx, V] : enumerate(First&: TE->Scalars)) {
16891 if (isConstant(V))
16892 DemandedElts.clearBit(BitPosition: Idx);
16893 }
16894
16895 Type *ScalarTy = getValueType(V: TE->Scalars.front());
16896 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
16897 const unsigned EntryVF = TE->getVectorFactor();
16898 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
16899 InstructionCost GatherCost = ::getScalarizationOverhead(
16900 TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
16901 /*Insert=*/true, /*Extract=*/false, CostKind);
16902 SmallVector<int> Mask;
16903 if (!TE->ReorderIndices.empty() &&
16904 TE->State != TreeEntry::CompressVectorize &&
16905 (TE->State != TreeEntry::StridedVectorize ||
16906 !isReverseOrder(Order: TE->ReorderIndices))) {
16907 SmallVector<int> NewMask;
16908 if (TE->getOpcode() == Instruction::Store) {
16909 // For stores the order is actually a mask.
16910 NewMask.resize(N: TE->ReorderIndices.size());
16911 copy(Range&: TE->ReorderIndices, Out: NewMask.begin());
16912 } else {
16913 inversePermutation(Indices: TE->ReorderIndices, Mask&: NewMask);
16914 }
16915 ::addMask(Mask, SubMask: NewMask);
16916 }
16917 if (!TE->ReuseShuffleIndices.empty())
16918 ::addMask(Mask, SubMask: TE->ReuseShuffleIndices);
16919 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: EntryVF))
16920 GatherCost +=
16921 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
16922 // If all scalars are reused in gather node(s) or other vector nodes, there
16923 // might be extra cost for inserting them.
16924 if (all_of(Range&: TE->Scalars, P: [&](Value *V) {
16925 return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
16926 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
16927 }))
16928 GatherCost *= 2;
16929 // Erase subtree if it is non-profitable.
16930 if (SubtreeCost > GatherCost) {
      // If the remaining tree would be just a buildvector - exit, since it
      // would cause endless attempts to vectorize.
16933 if (VectorizableTree.front()->hasState() &&
16934 VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
16935 TE->Idx == 1)
16936 return InstructionCost::getInvalid();
16937
16938 LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
16939 << TE->Idx << " with cost "
16940 << Worklist.top().second.first << " and gather cost "
16941 << GatherCost << ".\n");
16942 if (TE->UserTreeIndex) {
16943 TransformedToGatherNodes.try_emplace(Key: TE, Args&: GatherCost);
16944 NodesCosts.erase(Val: TE);
16945 } else {
16946 DeletedNodes.insert(Ptr: TE);
16947 TransformedToGatherNodes.erase(Val: TE);
16948 NodesCosts.erase(Val: TE);
16949 }
16950 for (unsigned Idx : Worklist.top().second.second) {
16951 TreeEntry &ChildTE = *VectorizableTree[Idx];
16952 DeletedNodes.insert(Ptr: &ChildTE);
16953 TransformedToGatherNodes.erase(Val: &ChildTE);
16954 NodesCosts.erase(Val: &ChildTE);
16955 }
16956 Changed = true;
16957 }
16958 Worklist.pop();
16959 }
16960 if (!Changed)
16961 return SubtreeCosts.front().first;
16962
16963 SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
16964 InstructionCost LoadsExtractsCost = 0;
  // Check if all loads of the gathered-loads nodes are marked for deletion; in
  // that case the whole gathered-loads subtree must be deleted.
  // Also, try to account for the extracts that might be required if only part
  // of a gathered load is vectorized. Keep partially vectorized nodes if the
  // extracts are cheaper than gathers.
16970 for (TreeEntry *TE : GatheredLoadsNodes) {
16971 if (DeletedNodes.contains(Ptr: TE) || TransformedToGatherNodes.contains(Val: TE))
16972 continue;
16973 GatheredLoadsToDelete.insert(Ptr: TE);
16974 APInt DemandedElts = APInt::getZero(numBits: TE->getVectorFactor());
    // If none of the loads is still needed by a surviving gather node, the
    // whole subtree must be deleted.
16976 SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
16977 for (Value *V : TE->Scalars) {
16978 unsigned Pos = TE->findLaneForValue(V);
16979 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
16980 if (DeletedNodes.contains(Ptr: BVE))
16981 continue;
16982 DemandedElts.setBit(Pos);
16983 ValuesToInsert.try_emplace(Key: BVE).first->second.push_back(Elt: V);
16984 }
16985 }
16986 if (!DemandedElts.isZero()) {
16987 Type *ScalarTy = TE->Scalars.front()->getType();
16988 auto *VecTy = getWidenedType(ScalarTy, VF: TE->getVectorFactor());
16989 InstructionCost ExtractsCost = ::getScalarizationOverhead(
16990 TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
16991 /*Insert=*/false, /*Extract=*/true, CostKind);
16992 InstructionCost BVCost = 0;
16993 for (const auto &[BVE, Values] : ValuesToInsert) {
16994 APInt BVDemandedElts = APInt::getZero(numBits: BVE->getVectorFactor());
16995 SmallVector<Value *> BVValues(BVE->getVectorFactor(),
16996 PoisonValue::get(T: ScalarTy));
16997 for (Value *V : Values) {
16998 unsigned Pos = BVE->findLaneForValue(V);
16999 BVValues[Pos] = V;
17000 BVDemandedElts.setBit(Pos);
17001 }
17002 auto *BVVecTy = getWidenedType(ScalarTy, VF: BVE->getVectorFactor());
17003 BVCost += ::getScalarizationOverhead(
17004 TTI: *TTI, ScalarTy, Ty: BVVecTy, DemandedElts: BVDemandedElts,
17005 /*Insert=*/true, /*Extract=*/false, CostKind,
17006 ForPoisonSrc: BVDemandedElts.isAllOnes(), VL: BVValues);
17007 }
17008 if (ExtractsCost < BVCost) {
17009 LoadsExtractsCost += ExtractsCost;
17010 GatheredLoadsToDelete.erase(Ptr: TE);
17011 continue;
17012 }
17013 LoadsExtractsCost += BVCost;
17014 }
17015 NodesCosts.erase(Val: TE);
17016 }
17017
  // Delete all subtrees rooted at gathered-loads nodes that are marked for
  // deletion.
17019 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17020 if (TE->UserTreeIndex &&
17021 GatheredLoadsToDelete.contains(Ptr: TE->UserTreeIndex.UserTE)) {
17022 DeletedNodes.insert(Ptr: TE.get());
17023 NodesCosts.erase(Val: TE.get());
17024 GatheredLoadsToDelete.insert(Ptr: TE.get());
17025 }
17026 }
17027
17028 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
17029 if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(Val: TE.get())) {
17030 assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
17031 continue;
17032 }
17033 if (DeletedNodes.contains(Ptr: TE.get()))
17034 continue;
17035 if (!NodesCosts.contains(Val: TE.get())) {
17036 InstructionCost C =
17037 getEntryCost(E: TE.get(), VectorizedVals, CheckedExtracts);
17038 NodesCosts.try_emplace(Key: TE.get(), Args&: C);
17039 }
17040 }
17041
17042 LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
17043 InstructionCost NewCost = 0;
17044 for (const auto &P : NodesCosts) {
17045 NewCost += P.second;
17046 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
17047 << shortBundleName(P.first->Scalars, P.first->Idx)
17048 << ".\n"
17049 << "SLP: Current total cost = " << Cost << "\n");
17050 }
17051 if (NewCost + LoadsExtractsCost >= Cost) {
17052 DeletedNodes.clear();
17053 TransformedToGatherNodes.clear();
17054 NewCost = Cost;
17055 }
17056 return NewCost;
17057}
17058
17059namespace {
17060/// Data type for handling buildvector sequences with the reused scalars from
17061/// other tree entries.
17062template <typename T> struct ShuffledInsertData {
17063 /// List of insertelements to be replaced by shuffles.
17064 SmallVector<InsertElementInst *> InsertElements;
17065 /// The parent vectors and shuffle mask for the given list of inserts.
17066 MapVector<T, SmallVector<int>> ValueMasks;
17067};
17068} // namespace
17069
17070InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
17071 ArrayRef<Value *> VectorizedVals,
17072 InstructionCost ReductionCost) {
17073 InstructionCost Cost = TreeCost + ReductionCost;
17074
17075 if (Cost >= -SLPCostThreshold &&
17076 none_of(Range&: ExternalUses, P: [](const ExternalUser &EU) {
17077 return isa_and_nonnull<InsertElementInst>(Val: EU.User);
17078 }))
17079 return Cost;
17080
17081 SmallPtrSet<Value *, 16> ExtractCostCalculated;
17082 InstructionCost ExtractCost = 0;
17083 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
17084 SmallVector<APInt> DemandedElts;
17085 SmallDenseSet<Value *, 4> UsedInserts;
17086 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
17087 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
17088 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
17089 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
  // Keep track of each {Scalar, User, Index} tuple.
17091 // On AArch64, this helps in fusing a mov instruction, associated with
17092 // extractelement, with fmul in the backend so that extractelement is free.
17093 SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
17094 for (ExternalUser &EU : ExternalUses) {
17095 ScalarUserAndIdx.emplace_back(Args&: EU.Scalar, Args&: EU.User, Args&: EU.Lane);
17096 }
17097 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
17098 for (ExternalUser &EU : ExternalUses) {
17099 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
17100 << EU.E.Idx << " in lane " << EU.Lane << "\n");
17101 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
17102 else dbgs() << " User: nullptr\n");
17103 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
17104
17105 // Uses by ephemeral values are free (because the ephemeral value will be
17106 // removed prior to code generation, and so the extraction will be
17107 // removed as well).
17108 if (EphValues.count(Ptr: EU.User))
17109 continue;
17110
    // Check if the scalar for the given user, or for all users, is already
    // accounted for.
17112 if (!CheckedScalarUser.insert(V: std::make_pair(x&: EU.Scalar, y&: EU.User)).second ||
17113 (EU.User &&
17114 CheckedScalarUser.contains(V: std::make_pair(x&: EU.Scalar, y: nullptr))))
17115 continue;
17116
    // Skip uses whose user is in an unreachable block, in an EH pad (rarely
    // executed) or in a block terminated with an unreachable instruction.
17119 if (BasicBlock *UserParent =
17120 EU.User ? cast<Instruction>(Val: EU.User)->getParent() : nullptr;
17121 UserParent &&
17122 (!DT->isReachableFromEntry(A: UserParent) || UserParent->isEHPad() ||
17123 isa_and_present<UnreachableInst>(Val: UserParent->getTerminator())))
17124 continue;
17125
17126 // We only add extract cost once for the same scalar.
17127 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
17128 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
17129 continue;
17130
17131 // No extract cost for vector "scalar" if REVEC is disabled
17132 if (!SLPReVec && isa<FixedVectorType>(Val: EU.Scalar->getType()))
17133 continue;
17134
17135 // If found user is an insertelement, do not calculate extract cost but try
17136 // to detect it as a final shuffled/identity match.
17137 // TODO: what if a user is insertvalue when REVEC is enabled?
17138 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
17139 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
17140 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
17141 if (!UsedInserts.insert(V: VU).second)
17142 continue;
17143 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
17144 if (InsertIdx) {
17145 const TreeEntry *ScalarTE = &EU.E;
17146 auto *It = find_if(
17147 Range&: ShuffledInserts,
17148 P: [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
17149 // Checks if 2 insertelements are from the same buildvector.
17150 InsertElementInst *VecInsert = Data.InsertElements.front();
17151 return areTwoInsertFromSameBuildVector(
17152 VU, V: VecInsert, GetBaseOperand: [this](InsertElementInst *II) -> Value * {
17153 Value *Op0 = II->getOperand(i_nocapture: 0);
17154 if (isVectorized(V: II) && !isVectorized(V: Op0))
17155 return nullptr;
17156 return Op0;
17157 });
17158 });
17159 int VecId = -1;
17160 if (It == ShuffledInserts.end()) {
17161 auto &Data = ShuffledInserts.emplace_back();
17162 Data.InsertElements.emplace_back(Args&: VU);
17163 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
17164 VecId = ShuffledInserts.size() - 1;
17165 auto It = MinBWs.find(Val: ScalarTE);
17166 if (It != MinBWs.end() &&
17167 VectorCasts
17168 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
17169 .second) {
17170 unsigned BWSz = It->second.first;
17171 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
17172 unsigned VecOpcode;
17173 if (DstBWSz < BWSz)
17174 VecOpcode = Instruction::Trunc;
17175 else
17176 VecOpcode =
17177 It->second.second ? Instruction::SExt : Instruction::ZExt;
17178 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17179 InstructionCost C = TTI->getCastInstrCost(
17180 Opcode: VecOpcode, Dst: FTy,
17181 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
17182 VF: FTy->getNumElements()),
17183 CCH: TTI::CastContextHint::None, CostKind);
17184 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17185 << " for extending externally used vector with "
17186 "non-equal minimum bitwidth.\n");
17187 Cost += C;
17188 }
17189 } else {
17190 if (isFirstInsertElement(IE1: VU, IE2: It->InsertElements.front()))
17191 It->InsertElements.front() = VU;
17192 VecId = std::distance(first: ShuffledInserts.begin(), last: It);
17193 }
17194 int InIdx = *InsertIdx;
17195 SmallVectorImpl<int> &Mask =
17196 ShuffledInserts[VecId].ValueMasks[ScalarTE];
17197 if (Mask.empty())
17198 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
17199 Mask[InIdx] = EU.Lane;
17200 DemandedElts[VecId].setBit(InIdx);
17201 continue;
17202 }
17203 }
17204 }
17205
17206 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17207 // If we plan to rewrite the tree in a smaller type, we will need to sign
17208 // extend the extracted value back to the original type. Here, we account
17209 // for the extract and the added cost of the sign extend if needed.
17210 InstructionCost ExtraCost = TTI::TCC_Free;
17211 auto *ScalarTy = EU.Scalar->getType();
17212 const unsigned BundleWidth = EU.E.getVectorFactor();
17213 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
17214 auto *VecTy = getWidenedType(ScalarTy, VF: BundleWidth);
17215 const TreeEntry *Entry = &EU.E;
17216 auto It = MinBWs.find(Val: Entry);
17217 if (It != MinBWs.end()) {
17218 Type *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
17219 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
17220 MinTy = getWidenedType(ScalarTy: MinTy, VF: VecTy->getNumElements());
17221 unsigned Extend = isKnownNonNegative(V: EU.Scalar, SQ: SimplifyQuery(*DL))
17222 ? Instruction::ZExt
17223 : Instruction::SExt;
17224 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
17225 ExtraCost =
17226 getExtractWithExtendCost(TTI: *TTI, Opcode: Extend, Dst: ScalarTy, VecTy, Index: EU.Lane);
17227 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
17228 << ExtraCost << "\n");
17229 } else {
17230 ExtraCost =
17231 getVectorInstrCost(TTI: *TTI, ScalarTy, Opcode: Instruction::ExtractElement, Val: VecTy,
17232 CostKind, Index: EU.Lane, Scalar: EU.Scalar, ScalarUserAndIdx);
17233 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
17234 << *VecTy << ": " << ExtraCost << "\n");
17235 }
17236 // Leave the scalar instructions as is if they are cheaper than extracts.
17237 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
17238 Entry->getOpcode() == Instruction::Load) {
17239 // Checks if the user of the external scalar is phi in loop body.
17240 auto IsPhiInLoop = [&](const ExternalUser &U) {
17241 if (auto *Phi = dyn_cast_if_present<PHINode>(Val: U.User)) {
17242 auto *I = cast<Instruction>(Val: U.Scalar);
17243 const Loop *L = LI->getLoopFor(BB: Phi->getParent());
17244 return L && (Phi->getParent() == I->getParent() ||
17245 L == LI->getLoopFor(BB: I->getParent()));
17246 }
17247 return false;
17248 };
17249 if (!ValueToExtUses) {
17250 ValueToExtUses.emplace();
17251 for (const auto &P : enumerate(First&: ExternalUses)) {
17252 // Ignore phis in loops.
17253 if (IsPhiInLoop(P.value()))
17254 continue;
17255
17256 ValueToExtUses->try_emplace(Key: P.value().Scalar, Args: P.index());
17257 }
17258 }
      // The original instruction can be used if none of its operands are
      // vectorized, or if they are already marked as externally used.
17261 auto *Inst = cast<Instruction>(Val: EU.Scalar);
17262 InstructionCost ScalarCost = TTI->getInstructionCost(U: Inst, CostKind);
17263 auto OperandIsScalar = [&](Value *V) {
17264 if (!isVectorized(V)) {
17265 // Some extractelements might be not vectorized, but
17266 // transformed into shuffle and removed from the function,
17267 // consider it here.
17268 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V))
17269 return !EE->hasOneUse() || !MustGather.contains(Ptr: EE);
17270 return true;
17271 }
17272 return ValueToExtUses->contains(Val: V);
17273 };
17274 bool CanBeUsedAsScalar = all_of(Range: Inst->operands(), P: OperandIsScalar);
17275 bool CanBeUsedAsScalarCast = false;
17276 if (auto *CI = dyn_cast<CastInst>(Val: Inst); CI && !CanBeUsedAsScalar) {
17277 if (auto *Op = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 0));
17278 Op && all_of(Range: Op->operands(), P: OperandIsScalar)) {
17279 InstructionCost OpCost =
17280 (isVectorized(V: Op) && !ValueToExtUses->contains(Val: Op))
17281 ? TTI->getInstructionCost(U: Op, CostKind)
17282 : 0;
17283 if (ScalarCost + OpCost <= ExtraCost) {
17284 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
17285 ScalarCost += OpCost;
17286 }
17287 }
17288 }
17289 if (CanBeUsedAsScalar) {
17290 bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if the user is a phi node from the
        // same block as the root phis currently being vectorized. This keeps
        // better ordering info for the PHIs being vectorized.
17294 bool IsProfitablePHIUser =
17295 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
17296 VectorizableTree.front()->Scalars.size() > 2)) &&
17297 VectorizableTree.front()->hasState() &&
17298 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
17299 !Inst->hasNUsesOrMore(N: UsesLimit) &&
17300 none_of(Range: Inst->users(),
17301 P: [&](User *U) {
17302 auto *PHIUser = dyn_cast<PHINode>(Val: U);
17303 return (!PHIUser ||
17304 PHIUser->getParent() !=
17305 cast<Instruction>(
17306 Val: VectorizableTree.front()->getMainOp())
17307 ->getParent()) &&
17308 !isVectorized(V: U);
17309 }) &&
17310 count_if(Range: Entry->Scalars, P: [&](Value *V) {
17311 return ValueToExtUses->contains(Val: V);
17312 }) <= 2;
17313 if (IsProfitablePHIUser) {
17314 KeepScalar = true;
17315 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
17316 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
17317 (!GatheredLoadsEntriesFirst.has_value() ||
17318 Entry->Idx < *GatheredLoadsEntriesFirst)) {
17319 unsigned ScalarUsesCount = count_if(Range: Entry->Scalars, P: [&](Value *V) {
17320 return ValueToExtUses->contains(Val: V);
17321 });
17322 auto It = ExtractsCount.find(Val: Entry);
17323 if (It != ExtractsCount.end()) {
17324 assert(ScalarUsesCount >= It->getSecond().size() &&
17325 "Expected total number of external uses not less than "
17326 "number of scalar uses.");
17327 ScalarUsesCount -= It->getSecond().size();
17328 }
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2. It may help to
          // do some extra vectorization for now.
17332 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(Value: ScalarUsesCount);
17333 }
17334 if (KeepScalar) {
17335 ExternalUsesAsOriginalScalar.insert(Ptr: EU.Scalar);
17336 for (Value *V : Inst->operands()) {
17337 auto It = ValueToExtUses->find(Val: V);
17338 if (It != ValueToExtUses->end()) {
17339 // Replace all uses to avoid compiler crash.
17340 ExternalUses[It->second].User = nullptr;
17341 }
17342 }
17343 ExtraCost = ScalarCost;
17344 if (!IsPhiInLoop(EU))
17345 ExtractsCount[Entry].insert(V: Inst);
17346 if (CanBeUsedAsScalarCast) {
17347 ScalarOpsFromCasts.insert(Ptr: Inst->getOperand(i: 0));
17348 // Update the users of the operands of the cast operand to avoid
17349 // compiler crash.
17350 if (auto *IOp = dyn_cast<Instruction>(Val: Inst->getOperand(i: 0))) {
17351 for (Value *V : IOp->operands()) {
17352 auto It = ValueToExtUses->find(Val: V);
17353 if (It != ValueToExtUses->end()) {
17354 // Replace all uses to avoid compiler crash.
17355 ExternalUses[It->second].User = nullptr;
17356 }
17357 }
17358 }
17359 }
17360 }
17361 }
17362 }
17363
17364 ExtractCost += ExtraCost;
17365 }
  // Insert external uses for the operands of casts that will be emitted as
  // scalars instead of extractelements.
17368 for (Value *V : ScalarOpsFromCasts) {
17369 ExternalUsesAsOriginalScalar.insert(Ptr: V);
17370 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
17371 const auto *It = find_if_not(Range&: TEs, P: [&](TreeEntry *TE) {
17372 return TransformedToGatherNodes.contains(Val: TE) ||
17373 DeletedNodes.contains(Ptr: TE);
17374 });
17375 if (It != TEs.end()) {
17376 const TreeEntry *UserTE = *It;
17377 ExternalUses.emplace_back(Args&: V, Args: nullptr, Args: *UserTE,
17378 Args: UserTE->findLaneForValue(V));
17379 }
17380 }
17381 }
17382 // Add reduced value cost, if resized.
17383 if (!VectorizedVals.empty()) {
17384 const TreeEntry &Root = *VectorizableTree.front();
17385 auto BWIt = MinBWs.find(Val: &Root);
17386 if (BWIt != MinBWs.end()) {
17387 Type *DstTy = Root.Scalars.front()->getType();
17388 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy->getScalarType());
17389 unsigned SrcSz =
17390 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17391 if (OriginalSz != SrcSz) {
17392 unsigned Opcode = Instruction::Trunc;
17393 if (OriginalSz > SrcSz)
17394 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17395 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
17396 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DstTy)) {
17397 assert(SLPReVec && "Only supported by REVEC.");
17398 SrcTy = getWidenedType(ScalarTy: SrcTy, VF: VecTy->getNumElements());
17399 }
17400 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
17401 CCH: TTI::CastContextHint::None,
17402 CostKind: TTI::TCK_RecipThroughput);
17403 }
17404 }
17405 }
17406
  // A buildvector whose externally used scalars should remain scalar must not
  // be vectorized, otherwise the compiler may hang.
17409 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
17410 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
17411 VectorizableTree[1]->hasState() &&
17412 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17413 all_of(Range&: VectorizableTree[1]->Scalars, P: [&](Value *V) {
17414 return ExternalUsesAsOriginalScalar.contains(Ptr: V);
17415 }))
17416 return InstructionCost::getInvalid();
17417
17418 Cost += ExtractCost;
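  // Estimates the cost of resizing/permuting a vectorized entry to match an
  // insertelement user's shuffle mask, charging an extra single-source shuffle
  // when the mask is not an identity mask.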
17419 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
17420 bool ForSingleMask) {
17421 InstructionCost C = 0;
17422 unsigned VF = Mask.size();
17423 unsigned VecVF = TE->getVectorFactor();
17424 bool HasLargeIndex =
17425 any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); });
17426 if ((VF != VecVF && HasLargeIndex) ||
17427 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) {
17428
17429 if (HasLargeIndex) {
17430 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
17431 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
17432 result: OrigMask.begin());
17433 C = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17434 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
17435 Mask: OrigMask);
17436 LLVM_DEBUG(
17437 dbgs() << "SLP: Adding cost " << C
17438 << " for final shuffle of insertelement external users.\n";
17439 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17440 Cost += C;
17441 return std::make_pair(x&: TE, y: true);
17442 }
17443
17444 if (!ForSingleMask) {
17445 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
17446 for (unsigned I = 0; I < VF; ++I) {
17447 if (Mask[I] != PoisonMaskElem)
17448 ResizeMask[Mask[I]] = Mask[I];
17449 }
17450 if (!ShuffleVectorInst::isIdentityMask(Mask: ResizeMask, NumSrcElts: VF))
17451 C = ::getShuffleCost(
17452 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17453 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), Mask: ResizeMask);
17454 LLVM_DEBUG(
17455 dbgs() << "SLP: Adding cost " << C
17456 << " for final shuffle of insertelement external users.\n";
17457 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17458
17459 Cost += C;
17460 }
17461 }
17462 return std::make_pair(x&: TE, y: false);
17463 };
17464 // Calculate the cost of the reshuffled vectors, if any.
17465 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
17466 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(i_nocapture: 0);
17467 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
17468 unsigned VF = 0;
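    // Estimates the cost of the final single- or two-source shuffle that
    // combines the vectorized tree entries feeding this insertelement
    // sequence; identity single-source masks are free.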
17469 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
17470 ArrayRef<const TreeEntry *> TEs) {
17471 assert((TEs.size() == 1 || TEs.size() == 2) &&
17472 "Expected exactly 1 or 2 tree entries.");
17473 if (TEs.size() == 1) {
17474 if (VF == 0)
17475 VF = TEs.front()->getVectorFactor();
17476 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
17477 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
17478 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
17479 return Data.value() == PoisonMaskElem ||
17480 (Data.index() < VF &&
17481 static_cast<int>(Data.index()) == Data.value());
17482 })) {
17483 InstructionCost C =
17484 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
17485 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17486 << " for final shuffle of insertelement "
17487 "external users.\n";
17488 TEs.front()->dump();
17489 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17490 Cost += C;
17491 }
17492 } else {
17493 if (VF == 0) {
17494 if (TEs.front() &&
17495 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17496 VF = TEs.front()->getVectorFactor();
17497 else
17498 VF = Mask.size();
17499 }
17500 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
17501 InstructionCost C =
17502 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
17503 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17504 << " for final shuffle of vector node and external "
17505 "insertelement users.\n";
17506 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17507 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17508 Cost += C;
17509 }
17510 VF = Mask.size();
17511 return TEs.back();
17512 };
17513 (void)performExtractsShuffleAction<const TreeEntry>(
17514 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
17515 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
17516 Action: EstimateShufflesCost);
17517 InstructionCost InsertCost = TTI->getScalarizationOverhead(
17518 Ty: cast<FixedVectorType>(
17519 Val: ShuffledInserts[I].InsertElements.front()->getType()),
17520 DemandedElts: DemandedElts[I],
17521 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
17522 Cost -= InsertCost;
17523 }
17524
17525 // Add the cost for reduced value resize (if required).
17526 if (ReductionBitWidth != 0) {
17527 assert(UserIgnoreList && "Expected reduction tree.");
17528 const TreeEntry &E = *VectorizableTree.front();
17529 auto It = MinBWs.find(Val: &E);
17530 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
17531 unsigned SrcSize = It->second.first;
17532 unsigned DstSize = ReductionBitWidth;
17533 unsigned Opcode = Instruction::Trunc;
17534 if (SrcSize < DstSize) {
17535 bool IsArithmeticExtendedReduction =
17536 all_of(Range: *UserIgnoreList, P: [](Value *V) {
17537 auto *I = cast<Instruction>(Val: V);
17538 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
17539 Instruction::Mul, Instruction::FMul,
17540 Instruction::And, Instruction::Or,
17541 Instruction::Xor},
17542 Element: I->getOpcode());
17543 });
17544 if (IsArithmeticExtendedReduction)
17545 Opcode =
17546 Instruction::BitCast; // Handle it by getExtendedReductionCost
17547 else
17548 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17549 }
17550 if (Opcode != Instruction::BitCast) {
17551 auto *SrcVecTy =
17552 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
17553 auto *DstVecTy =
17554 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
17555 TTI::CastContextHint CCH = getCastContextHint(TE: E);
17556 InstructionCost CastCost;
17557 switch (E.getOpcode()) {
17558 case Instruction::SExt:
17559 case Instruction::ZExt:
17560 case Instruction::Trunc: {
17561 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
17562 CCH = getCastContextHint(TE: *OpTE);
17563 break;
17564 }
17565 default:
17566 break;
17567 }
17568 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
17569 CostKind: TTI::TCK_RecipThroughput);
17570 Cost += CastCost;
17571 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
17572 << " for final resize for reduction from " << SrcVecTy
17573 << " to " << DstVecTy << "\n";
17574 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17575 }
17576 }
17577 }
17578
17579 std::optional<InstructionCost> SpillCost;
17580 if (Cost < -SLPCostThreshold) {
17581 SpillCost = getSpillCost();
17582 Cost += *SpillCost;
17583 }
17584#ifndef NDEBUG
17585 SmallString<256> Str;
17586 {
17587 raw_svector_ostream OS(Str);
17588 OS << "SLP: Spill Cost = ";
17589 if (SpillCost)
17590 OS << *SpillCost;
17591 else
17592 OS << "<skipped>";
17593 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
17594 << "SLP: Total Cost = " << Cost << ".\n";
17595 }
17596 LLVM_DEBUG(dbgs() << Str);
17597 if (ViewSLPTree)
17598 ViewGraph(this, "SLP" + F->getName(), false, Str);
17599#endif
17600
17601 return Cost;
17602}
17603
/// Tries to find extractelement instructions with constant indices from a
/// fixed vector type and to gather such instructions into a group, which can
/// most likely be modeled as a shuffle of 1 or 2 input vectors. If this
/// attempt is successful, the matched scalars are replaced by poison values in
/// \p VL for future analysis.
17609std::optional<TTI::ShuffleKind>
17610BoUpSLP::tryToGatherSingleRegisterExtractElements(
17611 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
17612 // Scan list of gathered scalars for extractelements that can be represented
17613 // as shuffles.
17614 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
17615 SmallVector<int> UndefVectorExtracts;
17616 for (int I = 0, E = VL.size(); I < E; ++I) {
17617 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
17618 if (!EI) {
17619 if (isa<UndefValue>(Val: VL[I]))
17620 UndefVectorExtracts.push_back(Elt: I);
17621 continue;
17622 }
17623 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
17624 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
17625 continue;
17626 std::optional<unsigned> Idx = getExtractIndex(E: EI);
17627 // Undefined index.
17628 if (!Idx) {
17629 UndefVectorExtracts.push_back(Elt: I);
17630 continue;
17631 }
17632 if (Idx >= VecTy->getNumElements()) {
17633 UndefVectorExtracts.push_back(Elt: I);
17634 continue;
17635 }
17636 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
17637 ExtractMask.reset(Idx: *Idx);
17638 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
17639 UndefVectorExtracts.push_back(Elt: I);
17640 continue;
17641 }
17642 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
17643 }
17644 // Sort the vector operands by the maximum number of uses in extractelements.
17645 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
17646 VectorOpToIdx.takeVector();
17647 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
17648 return P1.second.size() > P2.second.size();
17649 });
17650 // Find the best pair of the vectors or a single vector.
17651 const int UndefSz = UndefVectorExtracts.size();
17652 unsigned SingleMax = 0;
17653 unsigned PairMax = 0;
17654 if (!Vectors.empty()) {
17655 SingleMax = Vectors.front().second.size() + UndefSz;
17656 if (Vectors.size() > 1) {
17657 auto *ItNext = std::next(x: Vectors.begin());
17658 PairMax = SingleMax + ItNext->second.size();
17659 }
17660 }
17661 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
17662 return std::nullopt;
17663 // Check if it is better to perform a shuffle of 2 vectors or just of a
17664 // single vector.
17665 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
17666 SmallVector<Value *> GatheredExtracts(
17667 VL.size(), PoisonValue::get(T: VL.front()->getType()));
17668 if (SingleMax >= PairMax && SingleMax) {
17669 for (int Idx : Vectors.front().second)
17670 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17671 } else if (!Vectors.empty()) {
17672 for (unsigned Idx : {0, 1})
17673 for (int Idx : Vectors[Idx].second)
17674 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17675 }
17676 // Add extracts from undefs too.
17677 for (int Idx : UndefVectorExtracts)
17678 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17679 // Check that the gather of extractelements can be represented as just a
17680 // shuffle of one/two vectors from which the scalars are extracted.
17681 std::optional<TTI::ShuffleKind> Res =
17682 isFixedVectorShuffle(VL: GatheredExtracts, Mask, AC);
17683 if (!Res || all_of(Range&: Mask, P: equal_to(Arg: PoisonMaskElem))) {
17684 // TODO: try to check other subsets if possible.
17685 // Restore the original VL if attempt was not successful.
17686 copy(Range&: SavedVL, Out: VL.begin());
17687 return std::nullopt;
17688 }
17689 // Restore unused scalars from the mask, if some of the extractelements
17690 // were not selected for the shuffle.
17691 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
17692 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
17693 isa<UndefValue>(Val: GatheredExtracts[I])) {
17694 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
17695 continue;
17696 }
17697 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
17698 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
17699 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
17700 is_contained(Range&: UndefVectorExtracts, Element: I))
17701 continue;
17702 }
17703 return Res;
17704}
17705
17706/// Tries to find extractelement instructions with constant indices from a
17707/// fixed vector type and gathers such instructions into a group per vector
17708/// register, each of which is likely to be matched as a shuffle of 1 or 2
17709/// input vectors. If the attempt is successful, the matched scalars are
17710/// replaced by poison values in \p VL for future analysis.
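/// As an illustrative sketch (assuming, e.g., 8 scalars split across 2
/// registers), \p VL is processed in NumParts slices; each slice is handled by
/// tryToGatherSingleRegisterExtractElements and its sub-mask is copied back
/// into \p Mask at the slice offset.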
17711SmallVector<std::optional<TTI::ShuffleKind>>
17712BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
17713 SmallVectorImpl<int> &Mask,
17714 unsigned NumParts) const {
17715 assert(NumParts > 0 && "NumParts expected to be greater than or equal to 1.");
17716 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
17717 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
17718 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
17719 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
17720 // Scan list of gathered scalars for extractelements that can be represented
17721 // as shuffles.
17722 MutableArrayRef<Value *> SubVL = MutableArrayRef(VL).slice(
17723 N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
17724 SmallVector<int> SubMask;
17725 std::optional<TTI::ShuffleKind> Res =
17726 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
17727 ShufflesRes[Part] = Res;
17728 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
17729 }
17730 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
17731 return Res.has_value();
17732 }))
17733 ShufflesRes.clear();
17734 return ShufflesRes;
17735}
17736
17737std::optional<TargetTransformInfo::ShuffleKind>
17738BoUpSLP::isGatherShuffledSingleRegisterEntry(
17739 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
17740 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
17741 Entries.clear();
17742 if (TE->Idx == 0)
17743 return std::nullopt;
17744 // TODO: currently checking only for Scalars in the tree entry, need to count
17745 // reused elements too for better cost estimation.
17746 auto GetUserEntry = [&](const TreeEntry *TE) {
17747 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17748 TE = TE->UserTreeIndex.UserTE;
17749 if (TE == VectorizableTree.front().get())
17750 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
17751 return TE->UserTreeIndex;
17752 };
17753 auto HasGatherUser = [&](const TreeEntry *TE) {
17754 while (TE->Idx != 0 && TE->UserTreeIndex) {
17755 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
17756 return true;
17757 TE = TE->UserTreeIndex.UserTE;
17758 }
17759 return false;
17760 };
17761 const EdgeInfo TEUseEI = GetUserEntry(TE);
17762 if (!TEUseEI)
17763 return std::nullopt;
17764 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
17765 const BasicBlock *TEInsertBlock = nullptr;
17766 // Main node of PHI entries keeps the correct order of operands/incoming
17767 // blocks.
17768 if (auto *PHI = dyn_cast_or_null<PHINode>(
17769 Val: TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
17770 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
17771 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
17772 TEInsertPt = TEInsertBlock->getTerminator();
17773 } else {
17774 TEInsertBlock = TEInsertPt->getParent();
17775 }
17776 if (!DT->isReachableFromEntry(A: TEInsertBlock))
17777 return std::nullopt;
17778 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
17779 assert(NodeUI && "Should only process reachable instructions");
17780 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
17781 auto CheckOrdering = [&](const Instruction *InsertPt) {
17782 // Argument InsertPt is an instruction where vector code for some other
17783 // tree entry (one that shares one or more scalars with TE) is going to be
17784 // generated. This lambda returns true if insertion point of vector code
17785 // for the TE dominates that point (otherwise dependency is the other way
17786 // around). The other node is not limited to be of a gather kind. Gather
17787 // nodes are not scheduled and their vector code is inserted before their
17788 // first user. If user is PHI, that is supposed to be at the end of a
17789 // predecessor block. Otherwise it is the last instruction among scalars of
17790 // the user node. So, instead of checking dependency between instructions
17791 // themselves, we check dependency between their insertion points for vector
17792 // code (since each scalar instruction ends up as a lane of a vector
17793 // instruction).
17794 const BasicBlock *InsertBlock = InsertPt->getParent();
17795 auto *NodeEUI = DT->getNode(BB: InsertBlock);
17796 if (!NodeEUI)
17797 return false;
17798 assert((NodeUI == NodeEUI) ==
17799 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
17800 "Different nodes should have different DFS numbers");
17801 // Check the order of the gather nodes users.
17802 if (TEInsertPt->getParent() != InsertBlock &&
17803 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
17804 return false;
17805 if (TEInsertPt->getParent() == InsertBlock &&
17806 TEInsertPt->comesBefore(Other: InsertPt))
17807 return false;
17808 return true;
17809 };
17810 // Find all tree entries used by the gathered values. If no common entries
17811 // are found, this is not a shuffle.
17812 // Here we build a set of tree nodes for each gathered value and try to
17813 // find the intersection between these sets. If we have at least one common
17814 // tree node for each gathered value, we have just a permutation of a
17815 // single vector. If we have 2 different sets, we are in a situation where
17816 // we have a permutation of 2 input vectors.
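  // For example (hypothetical values), for VL = {a, b, c, d} where a and b are
  // only contained in tree entry E1 and c and d only in E2, UsedTEs ends up as
  // {{E1}, {E2}}, i.e. the gather is a permutation of 2 input vectors.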
17817 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
17818 SmallDenseMap<Value *, int> UsedValuesEntry;
17819 SmallPtrSet<const Value *, 16> VisitedValue;
17820 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
17821 // The node is reused - exit.
17822 if ((TEPtr->getVectorFactor() != VL.size() &&
17823 TEPtr->Scalars.size() != VL.size()) ||
17824 (!TEPtr->isSame(VL) && !TEPtr->isSame(VL: TE->Scalars)))
17825 return false;
17826 UsedTEs.clear();
17827 UsedTEs.emplace_back().insert(Ptr: TEPtr);
17828 for (Value *V : VL) {
17829 if (isConstant(V))
17830 continue;
17831 UsedValuesEntry.try_emplace(Key: V, Args: 0);
17832 }
17833 return true;
17834 };
17835 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
17836 unsigned EdgeIdx) {
17837 const TreeEntry *Ptr1 = User1;
17838 const TreeEntry *Ptr2 = User2;
17839 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
17840 while (Ptr2) {
17841 PtrToIdx.try_emplace(Key: Ptr2, Args&: EdgeIdx);
17842 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
17843 Ptr2 = Ptr2->UserTreeIndex.UserTE;
17844 }
17845 while (Ptr1) {
17846 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
17847 Ptr1 = Ptr1->UserTreeIndex.UserTE;
17848 if (auto It = PtrToIdx.find(Val: Ptr1); It != PtrToIdx.end())
17849 return Idx < It->second;
17850 }
17851 return false;
17852 };
17853 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
17854 Instruction *InsertPt) {
17855 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
17856 !TEUseEI.UserTE->isCopyableElement(
17857 V: const_cast<Instruction *>(TEInsertPt)) &&
17858 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
17859 InsertPt->getNextNode() == TEInsertPt &&
17860 (!E->hasCopyableElements() || !E->isCopyableElement(V: InsertPt) ||
17861 !isUsedOutsideBlock(V: InsertPt));
17862 };
17863 for (Value *V : VL) {
17864 if (isConstant(V) || !VisitedValue.insert(Ptr: V).second)
17865 continue;
17866 // Build a list of tree entries where V is used.
17867 SmallPtrSet<const TreeEntry *, 4> VToTEs;
17868 SmallVector<const TreeEntry *> GatherNodes(
17869 ValueToGatherNodes.lookup(Val: V).takeVector());
17870 if (TransformedToGatherNodes.contains(Val: TE)) {
17871 for (TreeEntry *E : getSplitTreeEntries(V)) {
17872 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
17873 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17874 continue;
17875 GatherNodes.push_back(Elt: E);
17876 }
17877 for (TreeEntry *E : getTreeEntries(V)) {
17878 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
17879 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
17880 continue;
17881 GatherNodes.push_back(Elt: E);
17882 }
17883 }
17884 for (const TreeEntry *TEPtr : GatherNodes) {
17885 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(Ptr: TEPtr))
17886 continue;
17887 assert(any_of(TEPtr->Scalars,
17888 [&](Value *V) { return GatheredScalars.contains(V); }) &&
17889 "Must contain at least single gathered value.");
17890 assert(TEPtr->UserTreeIndex &&
17891 "Expected only single user of a gather node.");
17892 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
17893
17894 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
17895 UseEI.UserTE->hasState())
17896 ? dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp())
17897 : nullptr;
17898 Instruction *InsertPt =
17899 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
17900 : &getLastInstructionInBundle(E: UseEI.UserTE);
17901 if (TEInsertPt == InsertPt) {
17902 // Check nodes, which might be emitted first.
17903 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17904 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
17905 TEUseEI.UserTE->isAltShuffle()) &&
17906 all_of(Range&: TEUseEI.UserTE->Scalars, P: isUsedOutsideBlock)) {
17907 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
17908 (UseEI.UserTE->hasState() &&
17909 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17910 !UseEI.UserTE->isAltShuffle()) ||
17911 !all_of(Range&: UseEI.UserTE->Scalars, P: isUsedOutsideBlock))
17912 continue;
17913 }
17914
17915 // If the schedulable insertion point is used in multiple entries, just
17916 // exit; the ordering is not known at this point and becomes available
17917 // only after real scheduling.
17918 if (!doesNotNeedToBeScheduled(V: InsertPt) &&
17919 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
17920 continue;
17921 // If the users are the PHI nodes with the same incoming blocks - skip.
17922 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
17923 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
17924 UseEI.UserTE->State == TreeEntry::Vectorize &&
17925 UseEI.UserTE->getOpcode() == Instruction::PHI &&
17926 TEUseEI.UserTE != UseEI.UserTE)
17927 continue;
17928 // If 2 gathers are operands of the same entry (regardless of whether
17929 // the user is a PHI or something else), compare operand indices and use
17930 // the earlier one as the base.
17931 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
17932 continue;
17933 // If the user instruction is used for some reason in different
17934 // vectorized nodes - make it depend on index.
17935 if (TEUseEI.UserTE != UseEI.UserTE &&
17936 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
17937 HasGatherUser(TEUseEI.UserTE)))
17938 continue;
17939 // If the user node is the operand of the other user node - skip.
17940 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
17941 continue;
17942 }
17943
17944 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
17945 TEUseEI.UserTE->doesNotNeedToSchedule() !=
17946 UseEI.UserTE->doesNotNeedToSchedule() &&
17947 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))
17948 continue;
17949 // Check if the user node of the TE comes after the user node of TEPtr,
17950 // otherwise TEPtr depends on TE.
17951 if ((TEInsertBlock != InsertPt->getParent() ||
17952 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
17953 (!CheckOrdering(InsertPt) ||
17954 (UseEI.UserTE->hasCopyableElements() &&
17955 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
17956 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))))
17957 continue;
17958 // The node is reused - exit.
17959 if (CheckAndUseSameNode(TEPtr))
17960 break;
17961 // If the parent node is copyable, its last instruction is used outside
17962 // the block, and that instruction immediately follows the last instruction
17963 // of TEPtr, exit to preserve the def-use chain.
17964 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
17965 continue;
17966 VToTEs.insert(Ptr: TEPtr);
17967 }
17968 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
17969 const auto *It = find_if(Range&: VTEs, P: [&](const TreeEntry *MTE) {
17970 return MTE != TE && MTE != TEUseEI.UserTE &&
17971 !DeletedNodes.contains(Ptr: MTE) &&
17972 !TransformedToGatherNodes.contains(Val: MTE);
17973 });
17974 if (It != VTEs.end()) {
17975 const TreeEntry *VTE = *It;
17976 if (none_of(Range: TE->CombinedEntriesWithIndices,
17977 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
17978 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
17979 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
17980 continue;
17981 }
17982 // The node is reused - exit.
17983 if (CheckAndUseSameNode(VTE))
17984 break;
17985 VToTEs.insert(Ptr: VTE);
17986 }
17987 }
17988 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
17989 const auto *It = find_if(Range&: VTEs, P: [&, MainTE = TE](const TreeEntry *TE) {
17990 return TE != MainTE && !DeletedNodes.contains(Ptr: TE) &&
17991 !TransformedToGatherNodes.contains(Val: TE);
17992 });
17993 if (It != VTEs.end()) {
17994 const TreeEntry *VTE = *It;
17995 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(u: 0) &&
17996 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
17997 VTEs = VTEs.drop_front();
17998 // Iterate through all vectorized nodes.
17999 const auto *MIt = find_if(Range&: VTEs, P: [](const TreeEntry *MTE) {
18000 return MTE->State == TreeEntry::Vectorize;
18001 });
18002 if (MIt == VTEs.end())
18003 continue;
18004 VTE = *MIt;
18005 }
18006 if (none_of(Range: TE->CombinedEntriesWithIndices,
18007 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
18008 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
18009 if (&LastBundleInst == TEInsertPt ||
18010 !CheckOrdering(&LastBundleInst) ||
18011 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
18012 continue;
18013 }
18014 // The node is reused - exit.
18015 if (CheckAndUseSameNode(VTE))
18016 break;
18017 VToTEs.insert(Ptr: VTE);
18018 }
18019 }
18020 if (VToTEs.empty())
18021 continue;
18022 if (UsedTEs.empty()) {
18023 // On the first iteration, just insert the list of nodes into the vector.
18024 UsedTEs.push_back(Elt: VToTEs);
18025 UsedValuesEntry.try_emplace(Key: V, Args: 0);
18026 } else {
18027 // Need to check if there are any previously used tree nodes which use V.
18028 // If there are no such nodes, consider that we have one more input
18029 // vector.
18030 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
18031 unsigned Idx = 0;
18032 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
18033 // Do we have a non-empty intersection of previously listed tree entries
18034 // and tree entries using current V?
18035 set_intersect(S1&: VToTEs, S2: Set);
18036 if (!VToTEs.empty()) {
18037 // Yes, write the new subset and continue analysis for the next
18038 // scalar.
18039 Set.swap(RHS&: VToTEs);
18040 break;
18041 }
18042 VToTEs = SavedVToTEs;
18043 ++Idx;
18044 }
18045 // No non-empty intersection found - need to add a second set of possible
18046 // source vectors.
18047 if (Idx == UsedTEs.size()) {
18048 // If the number of input vectors is greater than 2, this is not a
18049 // permutation; fall back to the regular gather.
18050 // TODO: support multiple reshuffled nodes.
18051 if (UsedTEs.size() == 2)
18052 continue;
18053 UsedTEs.push_back(Elt: SavedVToTEs);
18054 Idx = UsedTEs.size() - 1;
18055 }
18056 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
18057 }
18058 }
18059
18060 if (UsedTEs.empty()) {
18061 Entries.clear();
18062 return std::nullopt;
18063 }
18064
18065 unsigned VF = 0;
18066 if (UsedTEs.size() == 1) {
18067 // Keep the order to avoid non-determinism.
18068 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
18069 UsedTEs.front().end());
18070 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18071 return TE1->Idx < TE2->Idx;
18072 });
18073 // Try to find a perfect match in another gather node first.
18074 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
18075 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
18076 });
18077 if (It != FirstEntries.end() &&
18078 ((*It)->getVectorFactor() == VL.size() ||
18079 ((*It)->getVectorFactor() == TE->Scalars.size() &&
18080 TE->ReuseShuffleIndices.size() == VL.size() &&
18081 (*It)->isSame(VL: TE->Scalars)))) {
18082 Entries.push_back(Elt: *It);
18083 if ((*It)->getVectorFactor() == VL.size()) {
18084 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18085 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
18086 } else {
18087 SmallVector<int> CommonMask = TE->getCommonMask();
18088 copy(Range&: CommonMask, Out: Mask.begin());
18089 }
18090 // Clear undef scalars.
18091 for (unsigned I : seq<unsigned>(Size: VL.size()))
18092 if (isa<PoisonValue>(Val: VL[I]))
18093 Mask[Part * VL.size() + I] = PoisonMaskElem;
18094 return TargetTransformInfo::SK_PermuteSingleSrc;
18095 }
18096 // No perfect match, just shuffle, so choose the first tree node from the
18097 // tree.
18098 Entries.push_back(Elt: FirstEntries.front());
18099 // Update mapping between values and corresponding tree entries.
18100 for (auto &P : UsedValuesEntry)
18101 P.second = 0;
18102 VF = FirstEntries.front()->getVectorFactor();
18103 } else {
18104 // Try to find nodes with the same vector factor.
18105 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
18106 // Keep the order of tree nodes to avoid non-determinism.
18107 DenseMap<int, const TreeEntry *> VFToTE;
18108 for (const TreeEntry *TE : UsedTEs.front()) {
18109 unsigned VF = TE->getVectorFactor();
18110 auto It = VFToTE.find(Val: VF);
18111 if (It != VFToTE.end()) {
18112 if (It->second->Idx > TE->Idx)
18113 It->getSecond() = TE;
18114 continue;
18115 }
18116 VFToTE.try_emplace(Key: VF, Args&: TE);
18117 }
18118 // Same, keep the order to avoid non-determinism.
18119 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
18120 UsedTEs.back().end());
18121 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18122 return TE1->Idx < TE2->Idx;
18123 });
18124 for (const TreeEntry *TE : SecondEntries) {
18125 auto It = VFToTE.find(Val: TE->getVectorFactor());
18126 if (It != VFToTE.end()) {
18127 VF = It->first;
18128 Entries.push_back(Elt: It->second);
18129 Entries.push_back(Elt: TE);
18130 break;
18131 }
18132 }
18133 // No 2 source vectors with the same vector factor - just choose 2 with max
18134 // index.
18135 if (Entries.empty()) {
18136 Entries.push_back(Elt: *llvm::max_element(
18137 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18138 return TE1->Idx < TE2->Idx;
18139 }));
18140 Entries.push_back(Elt: SecondEntries.front());
18141 VF = std::max(a: Entries.front()->getVectorFactor(),
18142 b: Entries.back()->getVectorFactor());
18143 } else {
18144 VF = Entries.front()->getVectorFactor();
18145 }
18146 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
18147 for (const TreeEntry *E : Entries)
18148 ValuesToEntries.emplace_back().insert(I: E->Scalars.begin(),
18149 E: E->Scalars.end());
18150 // Update mapping between values and corresponding tree entries.
18151 for (auto &P : UsedValuesEntry) {
18152 for (unsigned Idx : seq<unsigned>(Size: ValuesToEntries.size()))
18153 if (ValuesToEntries[Idx].contains(Ptr: P.first)) {
18154 P.second = Idx;
18155 break;
18156 }
18157 }
18158 }
18159
18160 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
18161 // Checks if the 2 PHIs are compatible, i.e. have a high chance of being
18162 // vectorized together.
18163 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
18164 auto *PHI = cast<PHINode>(Val: V);
18165 auto *PHI1 = cast<PHINode>(Val: V1);
18166 // Check that all incoming values are compatible/from the same parent (if
18167 // they are instructions).
18168 // The incoming values are compatible if they are all constants or
18169 // instructions with the same/alternate opcodes from the same basic block.
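    // E.g. (hypothetical IR), %p = phi [ %a, %bb1 ], [ 1, %bb2 ] and
    // %q = phi [ %b, %bb1 ], [ 2, %bb2 ] are compatible if %a and %b have the
    // same (or alternate) opcode and are defined in the same basic block.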
18170 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
18171 Value *In = PHI->getIncomingValue(i: I);
18172 Value *In1 = PHI1->getIncomingValue(i: I);
18173 if (isConstant(V: In) && isConstant(V: In1))
18174 continue;
18175 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI))
18176 return false;
18177 if (cast<Instruction>(Val: In)->getParent() !=
18178 cast<Instruction>(Val: In1)->getParent())
18179 return false;
18180 }
18181 return true;
18182 };
18183 // Check if the value can be ignored during analysis for shuffled gathers.
18184 // We suppose it is better to ignore instructions which do not form splats,
18185 // are not vectorized/not extractelements (these instructions will be
18186 // handled by extractelements processing) or may form a vector node later.
18187 auto MightBeIgnored = [=](Value *V) {
18188 auto *I = dyn_cast<Instruction>(Val: V);
18189 return I && !IsSplatOrUndefs && !isVectorized(V: I) &&
18190 !isVectorLikeInstWithConstOps(V: I) &&
18191 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
18192 };
18193 // Check that the neighbor instruction may form a full vector node with the
18194 // current instruction V. It is possible if they have the same/alternate
18195 // opcode and the same parent basic block.
18196 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
18197 Value *V1 = VL[Idx];
18198 bool UsedInSameVTE = false;
18199 auto It = UsedValuesEntry.find(Val: V1);
18200 if (It != UsedValuesEntry.end())
18201 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
18202 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
18203 getSameOpcode(VL: {V, V1}, TLI: *TLI) &&
18204 cast<Instruction>(Val: V)->getParent() ==
18205 cast<Instruction>(Val: V1)->getParent() &&
18206 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
18207 };
18208 // Build a shuffle mask for better cost estimation and vector emission.
18209 SmallBitVector UsedIdxs(Entries.size());
18210 SmallVector<std::pair<unsigned, int>> EntryLanes;
18211 for (int I = 0, E = VL.size(); I < E; ++I) {
18212 Value *V = VL[I];
18213 auto It = UsedValuesEntry.find(Val: V);
18214 if (It == UsedValuesEntry.end())
18215 continue;
18216 // Do not try to shuffle scalars if they are constants or instructions
18217 // that can be vectorized as a result of subsequent buildvector
18218 // vectorization.
18219 if (isConstant(V) || (MightBeIgnored(V) &&
18220 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
18221 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
18222 continue;
18223 unsigned Idx = It->second;
18224 EntryLanes.emplace_back(Args&: Idx, Args&: I);
18225 UsedIdxs.set(Idx);
18226 }
18227 // Iterate through all shuffled scalars and select entries, which can be used
18228 // for final shuffle.
18229 SmallVector<const TreeEntry *> TempEntries;
18230 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
18231 if (!UsedIdxs.test(Idx: I))
18232 continue;
18233 // Fix the entry number for the given scalar. If it is the first entry, set
18234 // Pair.first to 0, otherwise to 1 (currently at most 2 nodes are selected).
18235 // These indices are used as the vector offset when calculating the final
18236 // shuffle mask.
18237 for (std::pair<unsigned, int> &Pair : EntryLanes)
18238 if (Pair.first == I)
18239 Pair.first = TempEntries.size();
18240 TempEntries.push_back(Elt: Entries[I]);
18241 }
18242 Entries.swap(RHS&: TempEntries);
18243 if (EntryLanes.size() == Entries.size() &&
18244 !VL.equals(RHS: ArrayRef(TE->Scalars)
18245 .slice(N: Part * VL.size(),
18246 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
18247 // We may have only 1 or 2 entries here. If the number of scalars is equal
18248 // to the number of entries, there is no need to do the analysis, it is not
18249 // very profitable. Since VL is not the same as TE->Scalars, it means we
18250 // already have some shuffles before. Cut off this unprofitable case.
18251 Entries.clear();
18252 return std::nullopt;
18253 }
18254 // Build the final mask, check for the identity shuffle, if possible.
18255 bool IsIdentity = Entries.size() == 1;
18256 // Pair.first is the offset into the vector, while Pair.second is the index
18257 // of the scalar in the list.
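  // E.g. (hypothetical numbers), with VF == 4 an entry lane (1, 2) maps the
  // 3rd scalar of this slice to mask element 4 + lane, i.e. an element taken
  // from the second source vector of the shuffle.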
18258 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
18259 unsigned Idx = Part * VL.size() + Pair.second;
18260 Mask[Idx] =
18261 Pair.first * VF +
18262 (ForOrder ? std::distance(
18263 first: Entries[Pair.first]->Scalars.begin(),
18264 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
18265 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
18266 IsIdentity &= Mask[Idx] == Pair.second;
18267 }
18268 if (ForOrder || IsIdentity || Entries.empty()) {
18269 switch (Entries.size()) {
18270 case 1:
18271 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
18272 return TargetTransformInfo::SK_PermuteSingleSrc;
18273 break;
18274 case 2:
18275 if (EntryLanes.size() > 2 || VL.size() <= 2)
18276 return TargetTransformInfo::SK_PermuteTwoSrc;
18277 break;
18278 default:
18279 break;
18280 }
18281 } else if (!isa<VectorType>(Val: VL.front()->getType()) &&
18282 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
18283 // Estimate the cost to check whether a shuffle is cheaper than a buildvector.
18284 SmallVector<int> SubMask(std::next(x: Mask.begin(), n: Part * VL.size()),
18285 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()));
18286 int MinElement = SubMask.front(), MaxElement = SubMask.front();
18287 for (int Idx : SubMask) {
18288 if (Idx == PoisonMaskElem)
18289 continue;
18290 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
18291 MinElement = Idx;
18292 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
18293 MaxElement = Idx;
18294 }
18295 assert(MaxElement >= 0 && MinElement >= 0 &&
18296 MaxElement % VF >= MinElement % VF &&
18297 "Expected at least single element.");
18298 unsigned NewVF = std::max<unsigned>(
18299 a: VL.size(), b: getFullVectorNumberOfElements(TTI: *TTI, Ty: VL.front()->getType(),
18300 Sz: (MaxElement % VF) -
18301 (MinElement % VF) + 1));
18302 if (NewVF < VF) {
18303 for (int &Idx : SubMask) {
18304 if (Idx == PoisonMaskElem)
18305 continue;
18306 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
18307 (Idx >= static_cast<int>(VF) ? NewVF : 0);
18308 }
18309 } else {
18310 NewVF = VF;
18311 }
18312
18313 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18314 auto *VecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: NewVF);
18315 auto *MaskVecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: SubMask.size());
18316 auto GetShuffleCost = [&,
18317 &TTI = *TTI](ArrayRef<int> Mask,
18318 ArrayRef<const TreeEntry *> Entries,
18319 VectorType *VecTy) -> InstructionCost {
18320 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
18321 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
18322 Mask, Factor: Entries.front()->getInterleaveFactor()))
18323 return TTI::TCC_Free;
18324 return ::getShuffleCost(TTI,
18325 Kind: Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
18326 : TTI::SK_PermuteSingleSrc,
18327 Tp: VecTy, Mask, CostKind);
18328 };
18329 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
18330 InstructionCost FirstShuffleCost = 0;
18331 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18332 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18333 FirstShuffleCost = ShuffleCost;
18334 } else {
18335 // Transform the mask to include only the first entry.
18336 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18337 bool IsIdentity = true;
18338 for (auto [I, Idx] : enumerate(First&: FirstMask)) {
18339 if (Idx >= static_cast<int>(NewVF)) {
18340 Idx = PoisonMaskElem;
18341 } else {
18342 DemandedElts.clearBit(BitPosition: I);
18343 if (Idx != PoisonMaskElem)
18344 IsIdentity &= static_cast<int>(I) == Idx;
18345 }
18346 }
18347 if (!IsIdentity)
18348 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18349 FirstShuffleCost += getScalarizationOverhead(
18350 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18351 /*Extract=*/false, CostKind);
18352 }
18353 InstructionCost SecondShuffleCost = 0;
18354 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18355 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18356 SecondShuffleCost = ShuffleCost;
18357 } else {
18358 // Transform the mask to include only the second entry.
18359 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18360 bool IsIdentity = true;
18361 for (auto [I, Idx] : enumerate(First&: SecondMask)) {
18362 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18363 Idx = PoisonMaskElem;
18364 } else {
18365 DemandedElts.clearBit(BitPosition: I);
18366 if (Idx != PoisonMaskElem) {
18367 Idx -= NewVF;
18368 IsIdentity &= static_cast<int>(I) == Idx;
18369 }
18370 }
18371 }
18372 if (!IsIdentity)
18373 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18374 SecondShuffleCost += getScalarizationOverhead(
18375 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18376 /*Extract=*/false, CostKind);
18377 }
18378 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18379 for (auto [I, Idx] : enumerate(First&: SubMask))
18380 if (Idx == PoisonMaskElem)
18381 DemandedElts.clearBit(BitPosition: I);
18382 InstructionCost BuildVectorCost = getScalarizationOverhead(
18383 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18384 /*Extract=*/false, CostKind);
18385 const TreeEntry *BestEntry = nullptr;
18386 if (FirstShuffleCost < ShuffleCost) {
18387 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18388 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
18389 f: [&](int &Idx) {
18390 if (Idx >= static_cast<int>(VF))
18391 Idx = PoisonMaskElem;
18392 });
18393 BestEntry = Entries.front();
18394 ShuffleCost = FirstShuffleCost;
18395 }
18396 if (SecondShuffleCost < ShuffleCost) {
18397 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18398 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
18399 f: [&](int &Idx) {
18400 if (Idx < static_cast<int>(VF))
18401 Idx = PoisonMaskElem;
18402 else
18403 Idx -= VF;
18404 });
18405 BestEntry = Entries[1];
18406 ShuffleCost = SecondShuffleCost;
18407 }
18408 if (BuildVectorCost >= ShuffleCost) {
18409 if (BestEntry) {
18410 Entries.clear();
18411 Entries.push_back(Elt: BestEntry);
18412 }
18413 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
18414 : TargetTransformInfo::SK_PermuteSingleSrc;
18415 }
18416 }
18417 Entries.clear();
18418 // Clear the corresponding mask elements.
18419 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18420 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
18421 return std::nullopt;
18422}
18423
18424SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
18425BoUpSLP::isGatherShuffledEntry(
18426 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
18427 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
18428 bool ForOrder) {
18429 assert(NumParts > 0 && NumParts < VL.size() &&
18430 "Expected positive number of registers.");
18431 Entries.clear();
18432 // No need to check for the topmost gather node.
18433 if (TE == VectorizableTree.front().get() &&
18434 (!GatheredLoadsEntriesFirst.has_value() ||
18435 none_of(Range: ArrayRef(VectorizableTree).drop_front(),
18436 P: [](const std::unique_ptr<TreeEntry> &TE) {
18437 return !TE->isGather();
18438 })))
18439 return {};
18440 // FIXME: Gathering for non-power-of-2 (non-whole-register) nodes is not
18441 // implemented yet.
18442 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
18443 return {};
18444 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
18445 assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
18446 "Expected only single user of the gather node.");
18447 assert(VL.size() % NumParts == 0 &&
18448 "Number of scalars must be divisible by NumParts.");
18449 if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
18450 TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
18451 (TE->Idx == 0 ||
18452 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
18453 isSplat(VL: TE->Scalars) ||
18454 (TE->hasState() &&
18455 getSameValuesTreeEntry(V: TE->getMainOp(), VL: TE->Scalars))))
18456 return {};
18457 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
18458 SmallVector<std::optional<TTI::ShuffleKind>> Res;
18459 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
18460 ArrayRef<Value *> SubVL =
18461 VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
18462 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
18463 std::optional<TTI::ShuffleKind> SubRes =
18464 isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
18465 ForOrder);
18466 if (!SubRes)
18467 SubEntries.clear();
18468 Res.push_back(Elt: SubRes);
18469 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
18470 SubEntries.front()->getVectorFactor() == VL.size() &&
18471 (SubEntries.front()->isSame(VL: TE->Scalars) ||
18472 SubEntries.front()->isSame(VL))) {
18473 SmallVector<const TreeEntry *> LocalSubEntries;
18474 LocalSubEntries.swap(RHS&: SubEntries);
18475 Entries.clear();
18476 Res.clear();
18477 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
18478 // Clear undef scalars.
18479 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
18480 if (isa<PoisonValue>(Val: VL[I]))
18481 Mask[I] = PoisonMaskElem;
18482 Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
18483 Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
18484 return Res;
18485 }
18486 }
18487 if (all_of(Range&: Res,
18488 P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
18489 Entries.clear();
18490 return {};
18491 }
18492 return Res;
18493}
18494
18495InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
18496 Type *ScalarTy) const {
18497 const unsigned VF = VL.size();
18498 auto *VecTy = getWidenedType(ScalarTy, VF);
18499 // Find the cost of inserting/extracting values from the vector.
18500 // Check if the same elements are inserted several times and count them as
18501 // shuffle candidates.
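  // E.g. (hypothetical input), for VL = {%x, 3, %y, 5} with a non-poison
  // source, the constants 3 and 5 are accounted for by a two-source shuffle
  // with a constant vector, while %x and %y are counted as scalar insertions.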
18502 APInt DemandedElements = APInt::getZero(numBits: VF);
18503 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18504 InstructionCost Cost;
18505 auto EstimateInsertCost = [&](unsigned I, Value *V) {
18506 DemandedElements.setBit(I);
18507 if (V->getType() != ScalarTy)
18508 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
18509 CCH: TTI::CastContextHint::None, CostKind);
18510 };
18511 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
18512 std::iota(first: ConstantShuffleMask.begin(), last: ConstantShuffleMask.end(), value: 0);
18513 for (auto [I, V] : enumerate(First&: VL)) {
18514 // No need to shuffle duplicates for constants.
18515 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V))
18516 continue;
18517
18518 if (isConstant(V)) {
18519 ConstantShuffleMask[I] = I + VF;
18520 continue;
18521 }
18522 EstimateInsertCost(I, V);
18523 }
18524 // FIXME: add a cost for constant vector materialization.
18525 bool IsAnyNonUndefConst =
18526 any_of(Range&: VL, P: [](Value *V) { return !isa<UndefValue>(Val: V) && isConstant(V); });
18527 // 1. Shuffle input source vector and constant vector.
18528 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18529 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: VecTy,
18530 Mask: ConstantShuffleMask);
18531 }
18532
18533 // 2. Insert unique non-constants.
18534 if (!DemandedElements.isZero())
18535 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: DemandedElements,
18536 /*Insert=*/true,
18537 /*Extract=*/false, CostKind,
18538 ForPoisonSrc: ForPoisonSrc && !IsAnyNonUndefConst, VL);
18539 return Cost;
18540}
18541
18542Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18543 auto It = EntryToLastInstruction.find(Val: E);
18544 if (It != EntryToLastInstruction.end())
18545 return *cast<Instruction>(Val&: It->second);
18546 Instruction *Res = nullptr;
18547 // Get the basic block this bundle is in. All instructions in the bundle
18548 // should be in this block (except for extractelement-like instructions
18549 // with constant indices, gathered loads, or copyable elements).
18550 Instruction *Front;
18551 unsigned Opcode;
18552 if (E->hasState()) {
18553 Front = E->getMainOp();
18554 Opcode = E->getOpcode();
18555 } else {
18556 Front = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: IsaPred<Instruction>));
18557 Opcode = Front->getOpcode();
18558 }
18559 auto *BB = Front->getParent();
18560 assert(
18561 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18562 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18563 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18564 all_of(E->Scalars,
18565 [=](Value *V) -> bool {
18566 if (Opcode == Instruction::GetElementPtr &&
18567 !isa<GetElementPtrInst>(V))
18568 return true;
18569 auto *I = dyn_cast<Instruction>(V);
18570 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18571 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18572 })) &&
18573 "Expected gathered loads or GEPs or instructions from same basic "
18574 "block.");
18575
18576 auto FindLastInst = [&]() {
18577 Instruction *LastInst = Front;
18578 for (Value *V : E->Scalars) {
18579 auto *I = dyn_cast<Instruction>(Val: V);
18580 if (!I)
18581 continue;
18582 if (E->isCopyableElement(V: I))
18583 continue;
18584 if (LastInst->getParent() == I->getParent()) {
18585 if (LastInst->comesBefore(Other: I))
18586 LastInst = I;
18587 continue;
18588 }
18589 assert(((Opcode == Instruction::GetElementPtr &&
18590 !isa<GetElementPtrInst>(I)) ||
18591 E->State == TreeEntry::SplitVectorize ||
18592 (isVectorLikeInstWithConstOps(LastInst) &&
18593 isVectorLikeInstWithConstOps(I)) ||
18594 (GatheredLoadsEntriesFirst.has_value() &&
18595 Opcode == Instruction::Load && E->isGather() &&
18596 E->Idx < *GatheredLoadsEntriesFirst)) &&
18597 "Expected vector-like or non-GEP in GEP node insts only.");
18598 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
18599 LastInst = I;
18600 continue;
18601 }
18602 if (!DT->isReachableFromEntry(A: I->getParent()))
18603 continue;
18604 auto *NodeA = DT->getNode(BB: LastInst->getParent());
18605 auto *NodeB = DT->getNode(BB: I->getParent());
18606 assert(NodeA && "Should only process reachable instructions");
18607 assert(NodeB && "Should only process reachable instructions");
18608 assert((NodeA == NodeB) ==
18609 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18610 "Different nodes should have different DFS numbers");
18611 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18612 LastInst = I;
18613 }
18614 BB = LastInst->getParent();
18615 return LastInst;
18616 };
18617
18618 auto FindFirstInst = [&]() {
18619 Instruction *FirstInst = Front;
18620 for (Value *V : E->Scalars) {
18621 auto *I = dyn_cast<Instruction>(Val: V);
18622 if (!I)
18623 continue;
18624 if (E->isCopyableElement(V: I))
18625 continue;
18626 if (FirstInst->getParent() == I->getParent()) {
18627 if (I->comesBefore(Other: FirstInst))
18628 FirstInst = I;
18629 continue;
18630 }
18631 assert(((Opcode == Instruction::GetElementPtr &&
18632 !isa<GetElementPtrInst>(I)) ||
18633 (isVectorLikeInstWithConstOps(FirstInst) &&
18634 isVectorLikeInstWithConstOps(I))) &&
18635 "Expected vector-like or non-GEP in GEP node insts only.");
18636 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
18637 FirstInst = I;
18638 continue;
18639 }
18640 if (!DT->isReachableFromEntry(A: I->getParent()))
18641 continue;
18642 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
18643 auto *NodeB = DT->getNode(BB: I->getParent());
18644 assert(NodeA && "Should only process reachable instructions");
18645 assert(NodeB && "Should only process reachable instructions");
18646 assert((NodeA == NodeB) ==
18647 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18648 "Different nodes should have different DFS numbers");
18649 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18650 FirstInst = I;
18651 }
18652 return FirstInst;
18653 };
18654
18655 if (E->State == TreeEntry::SplitVectorize) {
18656 Res = FindLastInst();
18657 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V: Res); !Entries.empty()) {
18658 for (auto *E : Entries) {
18659 auto *I = dyn_cast_or_null<Instruction>(Val&: E->VectorizedValue);
18660 if (!I)
18661 I = &getLastInstructionInBundle(E);
18662 if (Res->getParent() == I->getParent() && Res->comesBefore(Other: I))
18663 Res = I;
18664 }
18665 }
18666 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18667 return *Res;
18668 }
18669
18670 // Set the insert point for gathered loads to the very first load.
18671 if (GatheredLoadsEntriesFirst.has_value() &&
18672 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18673 Opcode == Instruction::Load) {
18674 Res = FindFirstInst();
18675 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18676 return *Res;
18677 }
18678
18679 // Set the insert point to the beginning of the basic block if the entry
18680 // should not be scheduled.
18681 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18682 if (E->isGather())
18683 return nullptr;
18684 // It was found previously that the instruction does not need to be scheduled.
18685 const auto *It = BlocksSchedules.find(Key: BB);
18686 if (It == BlocksSchedules.end())
18687 return nullptr;
18688 for (Value *V : E->Scalars) {
18689 auto *I = dyn_cast<Instruction>(Val: V);
18690 if (!I || isa<PHINode>(Val: I) ||
18691 (!E->isCopyableElement(V: I) && doesNotNeedToBeScheduled(V: I)))
18692 continue;
18693 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V: I);
18694 if (Bundles.empty())
18695 continue;
18696 const auto *It = find_if(
18697 Range&: Bundles, P: [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18698 if (It != Bundles.end())
18699 return *It;
18700 }
18701 return nullptr;
18702 };
18703 const ScheduleBundle *Bundle = FindScheduleBundle(E);
18704 if (!E->isGather() && !Bundle) {
18705 if ((Opcode == Instruction::GetElementPtr &&
18706 any_of(Range: E->Scalars,
18707 P: [](Value *V) {
18708 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
18709 })) ||
18710 (all_of(Range: E->Scalars,
18711 P: [&](Value *V) {
18712 return isa<PoisonValue>(Val: V) ||
18713 (E->Idx == 0 && isa<InsertElementInst>(Val: V)) ||
18714 E->isCopyableElement(V) ||
18715 (!isVectorLikeInstWithConstOps(V) &&
18716 isUsedOutsideBlock(V));
18717 }) &&
18718 (!E->doesNotNeedToSchedule() ||
18719 any_of(Range: E->Scalars,
18720 P: [&](Value *V) {
18721 if (!isa<Instruction>(Val: V) ||
18722 (E->hasCopyableElements() && E->isCopyableElement(V)))
18723 return false;
18724 return !areAllOperandsNonInsts(V);
18725 }) ||
18726 none_of(Range: E->Scalars, P: [&](Value *V) {
18727 if (!isa<Instruction>(Val: V) ||
18728 (E->hasCopyableElements() && E->isCopyableElement(V)))
18729 return false;
18730 return MustGather.contains(Ptr: V);
18731 }))))
18732 Res = FindLastInst();
18733 else
18734 Res = FindFirstInst();
18735 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18736 return *Res;
18737 }
18738
18739 // Find the last instruction. The common case should be that BB has been
18740 // scheduled, and the last instruction is VL.back(). So we start with
18741 // VL.back() and iterate over schedule data until we reach the end of the
18742 // bundle. The end of the bundle is marked by null ScheduleData.
18743 if (Bundle) {
18744 assert(!E->isGather() && "Gathered instructions should not be scheduled");
18745 Res = Bundle->getBundle().back()->getInst();
18746 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18747 return *Res;
18748 }
18749
18750 // LastInst can still be null at this point if there's either not an entry
18751 // for BB in BlocksSchedules or there's no ScheduleData available for
18752 // VL.back(). This can be the case if buildTreeRec aborts for various
18753 // reasons (e.g., the maximum recursion depth is reached, the maximum region
18754 // size is reached, etc.). ScheduleData is initialized in the scheduling
18755 // "dry-run".
18756 //
18757 // If this happens, we can still find the last instruction by brute force. We
18758 // iterate forwards from Front (inclusive) until we either see all
18759 // instructions in the bundle or reach the end of the block. If Front is the
18760 // last instruction in program order, LastInst will be set to Front, and we
18761 // will visit all the remaining instructions in the block.
18762 //
18763 // One of the reasons we exit early from buildTreeRec is to place an upper
18764 // bound on compile-time. Thus, taking an additional compile-time hit here is
18765 // not ideal. However, this should be exceedingly rare since it requires that
18766 // we both exit early from buildTreeRec and that the bundle be out-of-order
18767 // (causing us to iterate all the way to the end of the block).
18768 if (!Res)
18769 Res = FindLastInst();
18770 assert(Res && "Failed to find last instruction in bundle");
18771 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18772 return *Res;
18773}
18774
18775void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
18776 auto *Front = E->getMainOp();
18777 Instruction *LastInst = &getLastInstructionInBundle(E);
18778 assert(LastInst && "Failed to find last instruction in bundle");
18779 BasicBlock::iterator LastInstIt = LastInst->getIterator();
18780 // If the instruction is a PHI, set the insert point after all the PHIs.
18781 bool IsPHI = isa<PHINode>(Val: LastInst);
18782 if (IsPHI) {
18783 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
18784 if (LastInstIt != LastInst->getParent()->end() &&
18785 LastInstIt->getParent()->isLandingPad())
18786 LastInstIt = std::next(x: LastInstIt);
18787 }
18788 if (IsPHI ||
18789 (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
18790 (E->doesNotNeedToSchedule() ||
18791 (E->hasCopyableElements() && !E->isCopyableElement(V: LastInst) &&
18792 isUsedOutsideBlock(V: LastInst)))) ||
18793 (GatheredLoadsEntriesFirst.has_value() &&
18794 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18795 E->getOpcode() == Instruction::Load)) {
18796 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
18797 } else {
18798 // Set the insertion point after the last instruction in the bundle. Set the
18799 // debug location to Front.
18800 Builder.SetInsertPoint(
18801 TheBB: LastInst->getParent(),
18802 IP: LastInst->getNextNode()->getIterator());
18803 if (Instruction *Res = LastInstructionToPos.lookup(Val: LastInst)) {
18804 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
18805 } else {
18806 Res = Builder.CreateAlignedLoad(Ty: Builder.getPtrTy(),
18807 Ptr: PoisonValue::get(T: Builder.getPtrTy()),
18808 Align: MaybeAlign());
18809 Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
18810 eraseInstruction(I: Res);
18811 LastInstructionToPos.try_emplace(Key: LastInst, Args&: Res);
18812 }
18813 }
18814 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
18815}
18816
18817Value *BoUpSLP::gather(
18818 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
18819 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
18820 // List of instructions/lanes from the current block and/or the blocks which
18821 // are part of the current loop. These instructions will be inserted at the
18822 // end to make it possible to optimize loops and hoist invariant instructions
18823 // out of the loop body with better chances for success.
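  // E.g. (hypothetical scenario), when gathering {%inv, %in_loop} at a point
  // inside a loop, %in_loop is postponed and inserted last, so the inserts of
  // invariant values stay hoistable out of the loop.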
18824 SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
18825 SmallSet<int, 4> PostponedIndices;
18826 Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
18827 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
18828 SmallPtrSet<BasicBlock *, 4> Visited;
18829 while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
18830 InsertBB = InsertBB->getSinglePredecessor();
18831 return InsertBB && InsertBB == InstBB;
18832 };
18833 for (int I = 0, E = VL.size(); I < E; ++I) {
18834 if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
18835 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
18836 isVectorized(V: Inst) ||
18837 (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
18838 PostponedIndices.insert(V: I).second)
18839 PostponedInsts.emplace_back(Args&: Inst, Args&: I);
18840 }
18841
18842 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
18843 Type *Ty) {
18844 Value *Scalar = V;
18845 if (Scalar->getType() != Ty) {
18846 assert(Scalar->getType()->isIntOrIntVectorTy() &&
18847 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
18848 Value *V = Scalar;
18849 if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
18850 isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
18851 Value *Op = CI->getOperand(i_nocapture: 0);
18852 if (auto *IOp = dyn_cast<Instruction>(Val: Op);
18853 !IOp || !(isDeleted(I: IOp) || isVectorized(V: IOp)))
18854 V = Op;
18855 }
18856 Scalar = Builder.CreateIntCast(
18857 V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
18858 }
18859
18860 Instruction *InsElt;
18861 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
18862 assert(SLPReVec && "FixedVectorType is not expected.");
18863 Vec =
18864 createInsertVector(Builder, Vec, V: Scalar, Index: Pos * getNumElements(Ty: VecTy));
18865 auto *II = dyn_cast<Instruction>(Val: Vec);
18866 if (!II)
18867 return Vec;
18868 InsElt = II;
18869 } else {
18870 Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
18871 InsElt = dyn_cast<InsertElementInst>(Val: Vec);
18872 if (!InsElt)
18873 return Vec;
18874 }
18875 GatherShuffleExtractSeq.insert(X: InsElt);
18876 CSEBlocks.insert(V: InsElt->getParent());
18877 // Add to our 'need-to-extract' list.
18878 if (isa<Instruction>(Val: V)) {
18879 ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
18880 const auto *It = find_if(Range&: Entries, P: [&](const TreeEntry *E) {
18881 return !TransformedToGatherNodes.contains(Val: E) &&
18882 !DeletedNodes.contains(Ptr: E);
18883 });
18884 if (It != Entries.end()) {
18885 // Find which lane we need to extract.
18886 User *UserOp = nullptr;
18887 if (Scalar != V) {
18888 if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
18889 UserOp = SI;
18890 } else {
18891 if (V->getType()->isVectorTy()) {
18892 if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: InsElt);
18893 SV && SV->getOperand(i_nocapture: 0) != V && SV->getOperand(i_nocapture: 1) != V) {
18894 // Find the shufflevector caused by the resize.
18895 auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
18896 if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Vec)) {
18897 if (SV->getOperand(i_nocapture: 0) == V)
18898 return SV;
18899 if (SV->getOperand(i_nocapture: 1) == V)
18900 return SV;
18901 }
18902 return nullptr;
18903 };
18904 InsElt = nullptr;
18905 if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 0), V))
18906 InsElt = User;
18907 else if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 1), V))
18908 InsElt = User;
18909 assert(InsElt &&
18910 "Failed to find shufflevector, caused by resize.");
18911 }
18912 }
18913 UserOp = InsElt;
18914 }
18915 if (UserOp) {
18916 unsigned FoundLane = (*It)->findLaneForValue(V);
18917 ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: **It, Args&: FoundLane);
18918 }
18919 }
18920 }
18921 return Vec;
18922 };
18923 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
18924 Value *Vec = PoisonValue::get(T: VecTy);
18925 SmallVector<int> NonConsts;
18926 SmallVector<int> Mask(VL.size());
18927 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
18928 Value *OriginalRoot = Root;
18929 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Val: Root);
18930 SV && isa<PoisonValue>(Val: SV->getOperand(i_nocapture: 1)) &&
18931 SV->getOperand(i_nocapture: 0)->getType() == VecTy) {
18932 Root = SV->getOperand(i_nocapture: 0);
18933 Mask.assign(in_start: SV->getShuffleMask().begin(), in_end: SV->getShuffleMask().end());
18934 }
  // Insert the constant values first.
18936 for (int I = 0, E = VL.size(); I < E; ++I) {
18937 if (PostponedIndices.contains(V: I))
18938 continue;
18939 if (!isConstant(V: VL[I])) {
18940 NonConsts.push_back(Elt: I);
18941 continue;
18942 }
18943 if (isa<PoisonValue>(Val: VL[I]))
18944 continue;
18945 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
18946 Mask[I] = I + E;
18947 }
18948 if (Root) {
18949 if (isa<PoisonValue>(Val: Vec)) {
18950 Vec = OriginalRoot;
18951 } else {
18952 Vec = CreateShuffle(Root, Vec, Mask);
18953 if (auto *OI = dyn_cast<Instruction>(Val: OriginalRoot);
18954 OI && OI->use_empty() &&
18955 none_of(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
18956 return TE->VectorizedValue == OI;
18957 }))
18958 eraseInstruction(I: OI);
18959 }
18960 }
18961 // Insert non-constant values.
18962 for (int I : NonConsts)
18963 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Emit the postponed instructions (those that are, or may be, part of the
  // loop) last, so that the loop-independent part of the gather sequence can
  // still be hoisted out of the loop.
18966 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
18967 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
18968
18969 return Vec;
18970}
18971
/// Merges shuffle masks and emits the final shuffle instruction, if required.
/// It supports shuffling of 2 input vectors. It implements lazy shuffle
/// emission: the actual shuffle instruction is generated only if it is really
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and the
/// amount of further analysis/transformations.
/// The class will also look through the previously emitted shuffle
/// instructions and properly mark indices in the mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and the need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If the 2 operands are of different sizes, the smaller one is resized and
/// the mask recalculated accordingly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and the need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
/// will look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
19004class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
19005 bool IsFinalized = false;
19006 /// Combined mask for all applied operands and masks. It is built during
19007 /// analysis and actual emission of shuffle vector instructions.
19008 SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands; if a 3rd one is about to be added, the first 2 are combined
  /// into a shuffle with the \p CommonMask mask, the first operand is set to
  /// the resulting shuffle and the second operand is set to the newly added
  /// operand. \p CommonMask is transformed accordingly afterwards.
19014 SmallVector<Value *, 2> InVectors;
19015 IRBuilderBase &Builder;
19016 BoUpSLP &R;
19017
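  /// Thin wrapper around IRBuilderBase used by createShuffle(): emits the
  /// actual shufflevector instructions (skipping identity masks for
  /// single-operand shuffles), resizes mismatched operands, and records every
  /// created instruction for later CSE.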
19018 class ShuffleIRBuilder {
19019 IRBuilderBase &Builder;
19020 /// Holds all of the instructions that we gathered.
19021 SetVector<Instruction *> &GatherShuffleExtractSeq;
19022 /// A list of blocks that we are going to CSE.
19023 DenseSet<BasicBlock *> &CSEBlocks;
19024 /// Data layout.
19025 const DataLayout &DL;
19026
19027 public:
19028 ShuffleIRBuilder(IRBuilderBase &Builder,
19029 SetVector<Instruction *> &GatherShuffleExtractSeq,
19030 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
19031 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
19032 CSEBlocks(CSEBlocks), DL(DL) {}
19033 ~ShuffleIRBuilder() = default;
19034 /// Creates shufflevector for the 2 operands with the given mask.
19035 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
19036 if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
19040 if (V1->getType() != V2->getType()) {
19041 if (cast<VectorType>(Val: V2->getType())
19042 ->getElementType()
19043 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
19044 ->getElementType()
19045 ->getIntegerBitWidth())
19046 V2 = Builder.CreateIntCast(
19047 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
19048 else
19049 V1 = Builder.CreateIntCast(
19050 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
19051 }
19052 }
19053 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
19054 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
19055 GatherShuffleExtractSeq.insert(X: I);
19056 CSEBlocks.insert(V: I->getParent());
19057 }
19058 return Vec;
19059 }
    /// Creates a permutation of the single vector operand with the given
    /// mask, if it is not an identity mask.
19062 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
19063 if (Mask.empty())
19064 return V1;
19065 unsigned VF = Mask.size();
19066 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
19067 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
19068 return V1;
19069 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
19070 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
19071 GatherShuffleExtractSeq.insert(X: I);
19072 CSEBlocks.insert(V: I->getParent());
19073 }
19074 return Vec;
19075 }
19076 Value *createIdentity(Value *V) { return V; }
19077 Value *createPoison(Type *Ty, unsigned VF) {
19078 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
19079 }
    /// Resizes the 2 input vectors to match in size, if they are not equal
    /// yet. The smaller vector is resized to the size of the larger one.
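    /// For example (illustrative only), resizing a <2 x ty> operand to match
    /// a <4 x ty> operand emits
    /// \code
    /// %resized = shufflevector <2 x ty> %v, poison, <0, 1, poison, poison>
    /// \endcode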
19082 void resizeToMatch(Value *&V1, Value *&V2) {
19083 if (V1->getType() == V2->getType())
19084 return;
19085 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
19086 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
19087 int VF = std::max(a: V1VF, b: V2VF);
19088 int MinVF = std::min(a: V1VF, b: V2VF);
19089 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
19090 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
19091 value: 0);
19092 Value *&Op = MinVF == V1VF ? V1 : V2;
19093 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
19094 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
19095 GatherShuffleExtractSeq.insert(X: I);
19096 CSEBlocks.insert(V: I->getParent());
19097 }
19098 if (MinVF == V1VF)
19099 V1 = Op;
19100 else
19101 V2 = Op;
19102 }
19103 };
19104
  /// Smart shuffle instruction emission: walks through the shuffle trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
19108 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
19109 assert(V1 && "Expected at least one vector value.");
19110 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
19111 R.CSEBlocks, *R.DL);
19112 return BaseShuffleAnalysis::createShuffle<Value *>(
19113 V1, V2, Mask, Builder&: ShuffleBuilder, ScalarTy);
19114 }
19115
19116 /// Cast value \p V to the vector type with the same number of elements, but
19117 /// the base type \p ScalarTy.
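  /// For example (illustrative only), if \p V is a <4 x i8> value and
  /// \p ScalarTy is i32, a sext or zext to <4 x i32> is emitted, depending on
  /// whether \p V is known to be non-negative (or on \p IsSigned, if given).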
19118 Value *castToScalarTyElem(Value *V,
19119 std::optional<bool> IsSigned = std::nullopt) {
19120 auto *VecTy = cast<VectorType>(Val: V->getType());
19121 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
19122 if (VecTy->getElementType() == ScalarTy->getScalarType())
19123 return V;
19124 return Builder.CreateIntCast(
19125 V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
19126 isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
19127 }
19128
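  /// Returns the vectorized value of entry \p E, cast to the element type of
  /// ScalarTy if the entry was vectorized with a different integer type. The
  /// signedness of the cast is derived from the entry's scalars.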
19129 Value *getVectorizedValue(const TreeEntry &E) {
19130 Value *Vec = E.VectorizedValue;
19131 if (!Vec->getType()->isIntOrIntVectorTy())
19132 return Vec;
19133 return castToScalarTyElem(V: Vec, IsSigned: any_of(Range: E.Scalars, P: [&](Value *V) {
19134 return !isa<PoisonValue>(Val: V) &&
19135 !isKnownNonNegative(
19136 V, SQ: SimplifyQuery(*R.DL));
19137 }));
19138 }
19139
19140public:
19141 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
19142 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
19143
19144 /// Adjusts extractelements after reusing them.
19145 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
19146 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
19147 unsigned NumParts, bool &UseVecBaseAsInput) {
19148 UseVecBaseAsInput = false;
19149 SmallPtrSet<Value *, 4> UniqueBases;
19150 Value *VecBase = nullptr;
19151 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
19152 if (!E->ReorderIndices.empty()) {
19153 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
19154 E->ReorderIndices.end());
19155 reorderScalars(Scalars&: VL, Mask: ReorderMask);
19156 }
19157 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19158 int Idx = Mask[I];
19159 if (Idx == PoisonMaskElem)
19160 continue;
19161 auto *EI = cast<ExtractElementInst>(Val: VL[I]);
19162 VecBase = EI->getVectorOperand();
19163 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecBase); !TEs.empty())
19164 VecBase = TEs.front()->VectorizedValue;
19165 assert(VecBase && "Expected vectorized value.");
19166 UniqueBases.insert(Ptr: VecBase);
      // If the only use is vectorized - the extractelement itself can be
      // deleted.
19169 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(Ptr: EI) ||
19170 (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
19171 !R.isVectorized(V: EI) &&
19172 count_if(Range: E->Scalars, P: [&](Value *V) { return V == EI; }) !=
19173 count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
19174 P: [&](Value *V) { return V == EI; })) ||
19175 (NumParts != 1 && count(Range&: VL, Element: EI) > 1) ||
19176 any_of(Range: EI->users(), P: [&](User *U) {
19177 ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(V: U);
19178 return UTEs.empty() || UTEs.size() > 1 ||
19179 any_of(Range&: UTEs,
19180 P: [&](const TreeEntry *TE) {
19181 return R.DeletedNodes.contains(Ptr: TE) ||
19182 R.TransformedToGatherNodes.contains(Val: TE);
19183 }) ||
19184 (isa<GetElementPtrInst>(Val: U) &&
19185 !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
19186 (!UTEs.empty() &&
19187 count_if(Range&: R.VectorizableTree,
19188 P: [&](const std::unique_ptr<TreeEntry> &TE) {
19189 return TE->UserTreeIndex.UserTE ==
19190 UTEs.front() &&
19191 is_contained(Range&: VL, Element: EI);
19192 }) != 1);
19193 }))
19194 continue;
19195 R.eraseInstruction(I: EI);
19196 }
19197 if (NumParts == 1 || UniqueBases.size() == 1) {
19198 assert(VecBase && "Expected vectorized value.");
19199 return castToScalarTyElem(V: VecBase);
19200 }
19201 UseVecBaseAsInput = true;
19202 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
19203 for (auto [I, Idx] : enumerate(First&: Mask))
19204 if (Idx != PoisonMaskElem)
19205 Idx = I;
19206 };
    // Perform a multi-register vector shuffle, joining the parts into a
    // single long virtual vector register: shuffle each part independently
    // and then insert all these parts into the long virtual register, forming
    // the original vector.
19211 Value *Vec = nullptr;
19212 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19213 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
19214 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
19215 unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
19216 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(N: Part * SliceSize, M: Limit);
19217 MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
19218 constexpr int MaxBases = 2;
19219 SmallVector<Value *, MaxBases> Bases(MaxBases);
19220 auto VLMask = zip(t&: SubVL, u&: SubMask);
19221 const unsigned VF = std::accumulate(
19222 first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
19223 if (std::get<1>(D) == PoisonMaskElem)
19224 return S;
19225 Value *VecOp =
19226 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
19227 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp);
19228 !TEs.empty())
19229 VecOp = TEs.front()->VectorizedValue;
19230 assert(VecOp && "Expected vectorized value.");
19231 const unsigned Size =
19232 cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
19233 return std::max(a: S, b: Size);
19234 });
19235 for (const auto [V, I] : VLMask) {
19236 if (I == PoisonMaskElem)
19237 continue;
19238 Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
19239 if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); !TEs.empty())
19240 VecOp = TEs.front()->VectorizedValue;
19241 assert(VecOp && "Expected vectorized value.");
19242 VecOp = castToScalarTyElem(V: VecOp);
19243 Bases[I / VF] = VecOp;
19244 }
19245 if (!Bases.front())
19246 continue;
19247 Value *SubVec;
19248 if (Bases.back()) {
19249 SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
19250 TransformToIdentity(SubMask);
19251 } else {
19252 SubVec = Bases.front();
19253 }
19254 if (!Vec) {
19255 Vec = SubVec;
19256 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
19257 [&](unsigned P) {
19258 ArrayRef<int> SubMask =
19259 Mask.slice(P * SliceSize,
19260 getNumElems(Mask.size(),
19261 SliceSize, P));
19262 return all_of(SubMask, [](int Idx) {
19263 return Idx == PoisonMaskElem;
19264 });
19265 })) &&
19266 "Expected first part or all previous parts masked.");
19267 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
19268 } else {
19269 unsigned NewVF =
19270 cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
19271 if (Vec->getType() != SubVec->getType()) {
19272 unsigned SubVecVF =
19273 cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
19274 NewVF = std::max(a: NewVF, b: SubVecVF);
19275 }
19276 // Adjust SubMask.
19277 for (int &Idx : SubMask)
19278 if (Idx != PoisonMaskElem)
19279 Idx += NewVF;
19280 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
19281 Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
19282 TransformToIdentity(VecMask);
19283 }
19284 }
19285 copy(Range&: VecMask, Out: Mask.begin());
19286 return Vec;
19287 }
19288 /// Checks if the specified entry \p E needs to be delayed because of its
19289 /// dependency nodes.
19290 std::optional<Value *>
19291 needToDelay(const TreeEntry *E,
19292 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
19293 // No need to delay emission if all deps are ready.
19294 if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
19295 return all_of(
19296 Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
19297 }))
19298 return std::nullopt;
19299 // Postpone gather emission, will be emitted after the end of the
19300 // process to keep correct order.
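    // Return a placeholder (a load of the widened type from a poison pointer)
    // that will be replaced by the real gather once all dependencies have
    // been vectorized.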
19301 auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
19302 return Builder.CreateAlignedLoad(
19303 Ty: ResVecTy,
19304 Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
19305 Align: MaybeAlign());
19306 }
19307 /// Reset the builder to handle perfect diamond match.
19308 void resetForSameNode() {
19309 IsFinalized = false;
19310 CommonMask.clear();
19311 InVectors.clear();
19312 }
  /// Adds 2 input vectors (in the form of tree entries) and the mask for
  /// their shuffling.
19315 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
19316 Value *V1 = getVectorizedValue(E: E1);
19317 Value *V2 = getVectorizedValue(E: E2);
19318 add(V1, V2, Mask);
19319 }
  /// Adds a single input vector (in the form of a tree entry) and the mask
  /// for its shuffling.
19322 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
19323 Value *V1 = getVectorizedValue(E: E1);
19324 add(V1, Mask);
19325 }
19326 /// Adds 2 input vectors and the mask for their shuffling.
19327 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
19328 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
19329 assert(isa<FixedVectorType>(V1->getType()) &&
19330 isa<FixedVectorType>(V2->getType()) &&
19331 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
19332 V1 = castToScalarTyElem(V: V1);
19333 V2 = castToScalarTyElem(V: V2);
19334 if (InVectors.empty()) {
19335 InVectors.push_back(Elt: V1);
19336 InVectors.push_back(Elt: V2);
19337 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
19338 return;
19339 }
19340 Value *Vec = InVectors.front();
19341 if (InVectors.size() == 2) {
19342 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
19343 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
19344 } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
19345 Mask.size()) {
19346 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
19347 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
19348 }
19349 V1 = createShuffle(V1, V2, Mask);
19350 unsigned VF = std::max(a: getVF(V: V1), b: getVF(V: Vec));
19351 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19352 if (Mask[Idx] != PoisonMaskElem)
19353 CommonMask[Idx] = Idx + VF;
19354 InVectors.front() = Vec;
19355 if (InVectors.size() == 2)
19356 InVectors.back() = V1;
19357 else
19358 InVectors.push_back(Elt: V1);
19359 }
  /// Adds one more input vector and the mask for the shuffling.
19361 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
19362 assert(isa<FixedVectorType>(V1->getType()) &&
19363 "castToScalarTyElem expects V1 to be FixedVectorType");
19364 V1 = castToScalarTyElem(V: V1);
19365 if (InVectors.empty()) {
19366 InVectors.push_back(Elt: V1);
19367 CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
19368 return;
19369 }
19370 const auto *It = find(Range&: InVectors, Val: V1);
19371 if (It == InVectors.end()) {
19372 if (InVectors.size() == 2 ||
19373 InVectors.front()->getType() != V1->getType()) {
19374 Value *V = InVectors.front();
19375 if (InVectors.size() == 2) {
19376 V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
19377 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
19378 } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
19379 CommonMask.size()) {
19380 V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
19381 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
19382 }
19383 unsigned VF = std::max(a: CommonMask.size(), b: Mask.size());
19384 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19385 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
19386 CommonMask[Idx] = V->getType() != V1->getType()
19387 ? Idx + VF
19388 : Mask[Idx] + getVF(V: V1);
19389 if (V->getType() != V1->getType())
19390 V1 = createShuffle(V1, V2: nullptr, Mask);
19391 InVectors.front() = V;
19392 if (InVectors.size() == 2)
19393 InVectors.back() = V1;
19394 else
19395 InVectors.push_back(Elt: V1);
19396 return;
19397 }
      // Check whether the second vector is actually required: it is added
      // only if it supplies elements for lanes not already covered by the
      // first one.
19400 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19401 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
19402 InVectors.push_back(Elt: V1);
19403 break;
19404 }
19405 }
19406 unsigned VF = 0;
19407 for (Value *V : InVectors)
19408 VF = std::max(a: VF, b: getVF(V));
19409 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
19410 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
19411 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
19412 }
  /// Adds one more input vector and the order in which its elements should be
  /// taken; the order is inverted into a shuffle mask internally.
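  /// E.g., the order {2, 0, 1} is inverted into the shuffle mask <1, 2, 0>
  /// before being added.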
19414 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
19415 SmallVector<int> NewMask;
19416 inversePermutation(Indices: Order, Mask&: NewMask);
19417 add(V1, Mask: NewMask);
19418 }
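  /// Emits a gather (buildvector) sequence for the scalars \p VL, optionally
  /// blending with \p Root, delegating shuffle creation back to this builder.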
19419 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
19420 Value *Root = nullptr) {
19421 return R.gather(VL, Root, ScalarTy,
19422 CreateShuffle: [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
19423 return createShuffle(V1, V2, Mask);
19424 });
19425 }
19426 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
19427 /// Finalize emission of the shuffles.
  /// \param Action the action (if any) to be performed before finally
  /// applying the \p ExtMask mask.
19430 Value *finalize(
19431 ArrayRef<int> ExtMask,
19432 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
19433 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
19434 function_ref<void(Value *&, SmallVectorImpl<int> &,
19435 function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
19436 Action = {}) {
19437 IsFinalized = true;
19438 if (Action) {
19439 Value *Vec = InVectors.front();
19440 if (InVectors.size() == 2) {
19441 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
19442 InVectors.pop_back();
19443 } else {
19444 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
19445 }
19446 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
19447 assert(VF > 0 &&
19448 "Expected vector length for the final value before action.");
19449 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
19450 if (VecVF < VF) {
19451 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
19452 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
19453 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
19454 }
19455 Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
19456 return createShuffle(V1, V2, Mask);
19457 });
19458 InVectors.front() = Vec;
19459 }
19460 if (!SubVectors.empty()) {
19461 Value *Vec = InVectors.front();
19462 if (InVectors.size() == 2) {
19463 Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
19464 InVectors.pop_back();
19465 } else {
19466 Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
19467 }
19468 transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
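      // Inserts each vectorized subvector entry into Vec at its insertion
      // index and rewrites the corresponding CommonMask lanes as an identity
      // mapping into the updated vector.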
19469 auto CreateSubVectors = [&](Value *Vec,
19470 SmallVectorImpl<int> &CommonMask) {
19471 for (auto [E, Idx] : SubVectors) {
19472 Value *V = getVectorizedValue(E: *E);
19473 unsigned InsertionIndex = Idx * getNumElements(Ty: ScalarTy);
          // Use the scalar version of ScalarTy to correctly handle shuffles
          // for revectorization. The revectorization mode operates on
          // vectors, but here we need to operate on scalars, because the
          // masks were already transformed for the vector elements and we do
          // not need to apply that transformation again.
19479 Type *OrigScalarTy = ScalarTy;
19480 ScalarTy = ScalarTy->getScalarType();
19481 Vec = createInsertVector(
19482 Builder, Vec, V, Index: InsertionIndex,
19483 Generator: std::bind(f: &ShuffleInstructionBuilder::createShuffle, args: this, args: _1, args: _2,
19484 args: _3));
19485 ScalarTy = OrigScalarTy;
19486 if (!CommonMask.empty()) {
19487 std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
19488 last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
19489 value: Idx);
19490 }
19491 }
19492 return Vec;
19493 };
19494 if (SubVectorsMask.empty()) {
19495 Vec = CreateSubVectors(Vec, CommonMask);
19496 } else {
19497 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
19498 copy(Range&: SubVectorsMask, Out: SVMask.begin());
19499 for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
19500 if (I2 != PoisonMaskElem) {
19501 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
19502 I1 = I2 + CommonMask.size();
19503 }
19504 }
19505 Value *InsertVec =
19506 CreateSubVectors(PoisonValue::get(T: Vec->getType()), CommonMask);
19507 Vec = createShuffle(V1: InsertVec, V2: Vec, Mask: SVMask);
19508 transformMaskAfterShuffle(CommonMask, Mask: SVMask);
19509 }
19510 InVectors.front() = Vec;
19511 }
19512
19513 if (!ExtMask.empty()) {
19514 if (CommonMask.empty()) {
19515 CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
19516 } else {
19517 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
19518 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
19519 if (ExtMask[I] == PoisonMaskElem)
19520 continue;
19521 NewMask[I] = CommonMask[ExtMask[I]];
19522 }
19523 CommonMask.swap(RHS&: NewMask);
19524 }
19525 }
19526 if (CommonMask.empty()) {
19527 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
19528 return InVectors.front();
19529 }
19530 if (InVectors.size() == 2)
19531 return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
19532 return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
19533 }
19534
19535 ~ShuffleInstructionBuilder() {
19536 assert((IsFinalized || CommonMask.empty()) &&
19537 "Shuffle construction must be finalized.");
19538 }
19539};
19540
19541Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
19542 return vectorizeTree(E: getOperandEntry(E, Idx: NodeIdx));
19543}
19544
19545template <typename BVTy, typename ResTy, typename... Args>
19546ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
19547 Args &...Params) {
19548 assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
19549 "Expected gather node.");
19550 unsigned VF = E->getVectorFactor();
19551
19552 bool NeedFreeze = false;
19553 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Do not process a split vectorize node that is marked to be a
  // gather/buildvector.
19555 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
19556 E->CombinedEntriesWithIndices.size());
19557 if (E->State == TreeEntry::SplitVectorize &&
19558 TransformedToGatherNodes.contains(Val: E)) {
19559 SubVectors.clear();
19560 } else {
19561 // Clear values, to be replaced by insertvector instructions.
19562 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
19563 for_each(MutableArrayRef(GatheredScalars)
19564 .slice(N: Idx, M: VectorizableTree[EIdx]->getVectorFactor()),
19565 [&](Value *&V) { V = PoisonValue::get(T: V->getType()); });
19566 transform(
19567 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
19568 return std::make_pair(VectorizableTree[P.first].get(), P.second);
19569 });
19570 }
19571 // Build a mask out of the reorder indices and reorder scalars per this
19572 // mask.
19573 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
19574 E->ReorderIndices.end());
19575 if (!ReorderMask.empty())
19576 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
19577 SmallVector<int> SubVectorsMask;
19578 inversePermutation(Indices: E->ReorderIndices, Mask&: SubVectorsMask);
19579 // Transform non-clustered elements in the mask to poison (-1).
19580 // "Clustered" operations will be reordered using this mask later.
19581 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
19582 for (unsigned I : seq<unsigned>(Size: GatheredScalars.size()))
19583 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
19584 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
19585 } else {
19586 SubVectorsMask.clear();
19587 }
19588 SmallVector<Value *> StoredGS(GatheredScalars);
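  // Checks whether a splat gather that still contains non-poison undef lanes
  // can simply reuse the incoming vector: if that is safe (checked via the
  // sibling operand of the user node when the value may be poisonous), the
  // affected slice of Mask is rewritten in-place, either as an identity
  // mapping or as a broadcast of the first used element.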
19589 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
19590 unsigned I, unsigned SliceSize,
19591 bool IsNotPoisonous) {
19592 if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
19593 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
19594 }))
19595 return false;
19596 TreeEntry *UserTE = E->UserTreeIndex.UserTE;
19597 unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
19598 if (UserTE->getNumOperands() != 2)
19599 return false;
19600 if (!IsNotPoisonous) {
19601 auto *It = find_if(ArrayRef(VectorizableTree).drop_front(N: UserTE->Idx + 1),
19602 [=](const std::unique_ptr<TreeEntry> &TE) {
19603 return TE->UserTreeIndex.UserTE == UserTE &&
19604 TE->UserTreeIndex.EdgeIdx != EdgeIdx;
19605 });
19606 if (It == VectorizableTree.end())
19607 return false;
19608 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
19609 if (!(*It)->ReorderIndices.empty()) {
19610 inversePermutation((*It)->ReorderIndices, ReorderMask);
19611 reorderScalars(Scalars&: GS, Mask: ReorderMask);
19612 }
19613 if (!all_of(zip(t&: GatheredScalars, u&: GS), [&](const auto &P) {
19614 Value *V0 = std::get<0>(P);
19615 Value *V1 = std::get<1>(P);
19616 return !isa<UndefValue>(Val: V0) || isa<PoisonValue>(Val: V0) ||
19617 (isa<UndefValue>(Val: V0) && !isa<PoisonValue>(Val: V0) &&
19618 is_contained(Range: E->Scalars, Element: V1));
19619 }))
19620 return false;
19621 }
19622 int Idx;
19623 if ((Mask.size() < InputVF &&
19624 ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
19625 Idx == 0) ||
19626 (Mask.size() == InputVF &&
19627 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
19628 std::iota(
19629 first: std::next(x: Mask.begin(), n: I * SliceSize),
19630 last: std::next(x: Mask.begin(),
19631 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
19632 value: 0);
19633 } else {
19634 unsigned IVal =
19635 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
19636 std::fill(
19637 first: std::next(x: Mask.begin(), n: I * SliceSize),
19638 last: std::next(x: Mask.begin(),
19639 n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
19640 value: IVal);
19641 }
19642 return true;
19643 };
19644 BVTy ShuffleBuilder(ScalarTy, Params...);
19645 ResTy Res = ResTy();
19646 SmallVector<int> Mask;
19647 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
19648 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
19649 Value *ExtractVecBase = nullptr;
19650 bool UseVecBaseAsInput = false;
19651 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
19652 SmallVector<SmallVector<const TreeEntry *>> Entries;
19653 Type *OrigScalarTy = GatheredScalars.front()->getType();
19654 auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
19655 unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: GatheredScalars.size());
19656 if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
19657 // Check for gathered extracts.
19658 bool Resized = false;
19659 ExtractShuffles =
19660 tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
19661 if (!ExtractShuffles.empty()) {
19662 SmallVector<const TreeEntry *> ExtractEntries;
19663 for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
19664 if (I == PoisonMaskElem)
19665 continue;
19666 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
19667 V: cast<ExtractElementInst>(Val: StoredGS[Idx])->getVectorOperand());
19668 !TEs.empty())
19669 ExtractEntries.append(in_start: TEs.begin(), in_end: TEs.end());
19670 }
19671 if (std::optional<ResTy> Delayed =
19672 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
19673 // Delay emission of gathers which are not ready yet.
19674 PostponedGathers.insert(X: E);
19675 // Postpone gather emission, will be emitted after the end of the
19676 // process to keep correct order.
19677 return *Delayed;
19678 }
19679 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
19680 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
19681 ExtractVecBase = VecBase;
19682 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
19683 if (VF == VecBaseTy->getNumElements() &&
19684 GatheredScalars.size() != VF) {
19685 Resized = true;
19686 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
19687 Elt: PoisonValue::get(T: OrigScalarTy));
19688 NumParts =
19689 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF), Limit: VF);
19690 }
19691 }
19692 }
19693 // Gather extracts after we check for full matched gathers only.
19694 if (!ExtractShuffles.empty() || !E->hasState() ||
19695 E->getOpcode() != Instruction::Load ||
19696 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
19697 any_of(Range: E->Scalars, P: IsaPred<LoadInst>)) &&
19698 any_of(E->Scalars,
19699 [this](Value *V) {
19700 return isa<LoadInst>(Val: V) && isVectorized(V);
19701 })) ||
19702 (E->hasState() && E->isAltShuffle()) ||
19703 all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
19704 isSplat(VL: E->Scalars) ||
19705 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
19706 GatherShuffles =
19707 isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
19708 }
19709 if (!GatherShuffles.empty()) {
19710 if (std::optional<ResTy> Delayed =
19711 ShuffleBuilder.needToDelay(E, Entries)) {
19712 // Delay emission of gathers which are not ready yet.
19713 PostponedGathers.insert(X: E);
19714 // Postpone gather emission, will be emitted after the end of the
19715 // process to keep correct order.
19716 return *Delayed;
19717 }
19718 if (GatherShuffles.size() == 1 &&
19719 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
19720 Entries.front().front()->isSame(VL: E->Scalars)) {
19721 // Perfect match in the graph, will reuse the previously vectorized
19722 // node. Cost is 0.
19723 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
19724 << shortBundleName(E->Scalars, E->Idx) << ".\n");
19725 // Restore the mask for previous partially matched values.
19726 Mask.resize(N: E->Scalars.size());
19727 const TreeEntry *FrontTE = Entries.front().front();
19728 if (FrontTE->ReorderIndices.empty() &&
19729 ((FrontTE->ReuseShuffleIndices.empty() &&
19730 E->Scalars.size() == FrontTE->Scalars.size()) ||
19731 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
19732 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
19733 } else {
19734 for (auto [I, V] : enumerate(First: E->Scalars)) {
19735 if (isa<PoisonValue>(Val: V)) {
19736 Mask[I] = PoisonMaskElem;
19737 continue;
19738 }
19739 Mask[I] = FrontTE->findLaneForValue(V);
19740 }
19741 }
19742 // Reset the builder(s) to correctly handle perfect diamond matched
19743 // nodes.
19744 ShuffleBuilder.resetForSameNode();
19745 ShuffleBuilder.add(*FrontTE, Mask);
19746 // Full matched entry found, no need to insert subvectors.
19747 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
19748 return Res;
19749 }
19750 if (!Resized) {
19751 if (GatheredScalars.size() != VF &&
19752 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
19753 return any_of(TEs, [&](const TreeEntry *TE) {
19754 return TE->getVectorFactor() == VF;
19755 });
19756 }))
19757 GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
19758 Elt: PoisonValue::get(T: OrigScalarTy));
19759 }
19760 // Remove shuffled elements from list of gathers.
19761 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
19762 if (Mask[I] != PoisonMaskElem)
19763 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
19764 }
19765 }
19766 }
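  // Packs the scalars of a buildvector: deduplicates repeated non-constant
  // values, turns splats into a single broadcast lane, and tries to replace
  // non-poison undefs either with a known-non-poisonous scalar or with poison
  // plus a later freeze, filling ReuseMask accordingly.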
19767 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
19768 SmallVectorImpl<int> &ReuseMask,
19769 bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
19772 bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
19773 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
19774 Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
19775 SmallVector<int> UndefPos;
19776 DenseMap<Value *, unsigned> UniquePositions;
19777 // Gather unique non-const values and all constant values.
19778 // For repeated values, just shuffle them.
19779 int NumNonConsts = 0;
19780 int SinglePos = 0;
19781 for (auto [I, V] : enumerate(First&: Scalars)) {
19782 if (isa<UndefValue>(Val: V)) {
19783 if (!isa<PoisonValue>(Val: V)) {
19784 ReuseMask[I] = I;
19785 UndefPos.push_back(Elt: I);
19786 }
19787 continue;
19788 }
19789 if (isConstant(V)) {
19790 ReuseMask[I] = I;
19791 continue;
19792 }
19793 ++NumNonConsts;
19794 SinglePos = I;
19795 Value *OrigV = V;
19796 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
19797 if (IsSplat) {
19798 Scalars.front() = OrigV;
19799 ReuseMask[I] = 0;
19800 } else {
19801 const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
19802 Scalars[Res.first->second] = OrigV;
19803 ReuseMask[I] = Res.first->second;
19804 }
19805 }
19806 if (NumNonConsts == 1) {
19807 // Restore single insert element.
19808 if (IsSplat) {
19809 ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
19810 std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
19811 if (!UndefPos.empty() && UndefPos.front() == 0)
19812 Scalars.front() = UndefValue::get(T: OrigScalarTy);
19813 }
19814 ReuseMask[SinglePos] = SinglePos;
19815 } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with a simple broadcast. We can
      // do this if the broadcasted value is guaranteed to be non-poisonous,
      // or by freezing the incoming scalar value first.
19819 auto *It = find_if(Scalars, [this, E](Value *V) {
19820 return !isa<UndefValue>(Val: V) &&
19821 (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
19822 (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                  // Check whether the value is already used in the same
                  // operation in one of the nodes.
19825 return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
19826 is_contained(Range&: E->UserTreeIndex.UserTE->Scalars,
19827 Element: U.getUser());
19828 })));
19829 });
19830 if (It != Scalars.end()) {
19831 // Replace undefs by the non-poisoned scalars and emit broadcast.
19832 int Pos = std::distance(Scalars.begin(), It);
19833 for (int I : UndefPos) {
19834 // Set the undef position to the non-poisoned scalar.
19835 ReuseMask[I] = Pos;
19836 // Replace the undef by the poison, in the mask it is replaced by
19837 // non-poisoned scalar already.
19838 if (I != Pos)
19839 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
19840 }
19841 } else {
19842 // Replace undefs by the poisons, emit broadcast and then emit
19843 // freeze.
19844 for (int I : UndefPos) {
19845 ReuseMask[I] = PoisonMaskElem;
19846 if (isa<UndefValue>(Val: Scalars[I]))
19847 Scalars[I] = PoisonValue::get(T: OrigScalarTy);
19848 }
19849 NeedFreeze = true;
19850 }
19851 }
19852 };
19853 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
19854 bool IsNonPoisoned = true;
19855 bool IsUsedInExpr = true;
19856 Value *Vec1 = nullptr;
19857 if (!ExtractShuffles.empty()) {
      // A gather of extractelements can be represented as just a shuffle of
      // one or two vectors from which the scalars are extracted.
      // Find the input vectors.
19861 Value *Vec2 = nullptr;
19862 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19863 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
19864 ExtractMask[I] = PoisonMaskElem;
19865 }
19866 if (UseVecBaseAsInput) {
19867 Vec1 = ExtractVecBase;
19868 } else {
19869 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
19870 if (ExtractMask[I] == PoisonMaskElem)
19871 continue;
19872 if (isa<UndefValue>(Val: StoredGS[I]))
19873 continue;
19874 auto *EI = cast<ExtractElementInst>(Val: StoredGS[I]);
19875 Value *VecOp = EI->getVectorOperand();
19876 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: VecOp);
19877 !TEs.empty() && TEs.front()->VectorizedValue)
19878 VecOp = TEs.front()->VectorizedValue;
19879 if (!Vec1) {
19880 Vec1 = VecOp;
19881 } else if (Vec1 != VecOp) {
19882 assert((!Vec2 || Vec2 == VecOp) &&
19883 "Expected only 1 or 2 vectors shuffle.");
19884 Vec2 = VecOp;
19885 }
19886 }
19887 }
19888 if (Vec2) {
19889 IsUsedInExpr = false;
19890 IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1, AC) &&
19891 isGuaranteedNotToBePoison(V: Vec2, AC);
19892 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
19893 } else if (Vec1) {
19894 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(V: Vec1, AC);
19895 IsUsedInExpr &= FindReusedSplat(
19896 ExtractMask,
19897 cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
19898 ExtractMask.size(), IsNotPoisonedVec);
19899 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
19900 IsNonPoisoned &= IsNotPoisonedVec;
19901 } else {
19902 IsUsedInExpr = false;
19903 ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
19904 /*ForExtracts=*/true);
19905 }
19906 }
19907 if (!GatherShuffles.empty()) {
19908 unsigned SliceSize =
19909 getPartNumElems(Size: E->Scalars.size(),
19910 NumParts: ::getNumberOfParts(TTI: *TTI, VecTy, Limit: E->Scalars.size()));
19911 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
19912 for (const auto [I, TEs] : enumerate(First&: Entries)) {
19913 if (TEs.empty()) {
19914 assert(!GatherShuffles[I] &&
19915 "No shuffles with empty entries list expected.");
19916 continue;
19917 }
19918 assert((TEs.size() == 1 || TEs.size() == 2) &&
19919 "Expected shuffle of 1 or 2 entries.");
19920 unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
19921 auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
19922 VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
19923 copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
19924 if (TEs.size() == 1) {
19925 bool IsNotPoisonedVec =
19926 TEs.front()->VectorizedValue
19927 ? isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC)
19928 : true;
19929 IsUsedInExpr &=
19930 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
19931 SliceSize, IsNotPoisonedVec);
19932 ShuffleBuilder.add(*TEs.front(), VecMask);
19933 IsNonPoisoned &= IsNotPoisonedVec;
19934 } else {
19935 IsUsedInExpr = false;
19936 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
19937 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
19938 IsNonPoisoned &=
19939 isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) &&
19940 isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue, AC);
19941 }
19942 }
19943 }
    // Try to figure out the best way to combine the values: build a shuffle
    // and insert elements, or just build several shuffles.
    // Insert non-constant scalars.
19947 SmallVector<Value *> NonConstants(GatheredScalars);
19948 int EMSz = ExtractMask.size();
19949 int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if we currently
    // have a single permutation and more than 1 scalar constant.
19952 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
19953 bool IsIdentityShuffle =
19954 ((UseVecBaseAsInput ||
19955 all_of(ExtractShuffles,
19956 [](const std::optional<TTI::ShuffleKind> &SK) {
19957 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
19958 TTI::SK_PermuteSingleSrc;
19959 })) &&
19960 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
19961 ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
19962 (!GatherShuffles.empty() &&
19963 all_of(GatherShuffles,
19964 [](const std::optional<TTI::ShuffleKind> &SK) {
19965 return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
19966 TTI::SK_PermuteSingleSrc;
19967 }) &&
19968 none_of(Mask, [&](int I) { return I >= MSz; }) &&
19969 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
19970 bool EnoughConstsForShuffle =
19971 IsSingleShuffle &&
19972 (none_of(GatheredScalars,
19973 [](Value *V) {
19974 return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
19975 }) ||
19976 any_of(GatheredScalars,
19977 [](Value *V) {
19978 return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
19979 })) &&
19980 (!IsIdentityShuffle ||
19981 (GatheredScalars.size() == 2 &&
19982 any_of(GatheredScalars,
19983 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
19984 count_if(GatheredScalars, [](Value *V) {
19985 return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
19986 }) > 1);
    // The NonConstants array contains just the non-constant values, while
    // GatheredScalars contains only the constants used to build the final
    // vector and then shuffle.
19989 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
19990 if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
19991 NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
19992 else
19993 GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
19994 }
19995 // Generate constants for final shuffle and build a mask for them.
19996 if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
19997 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
19998 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
19999 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
20000 ShuffleBuilder.add(BV, BVMask);
20001 }
20002 if (all_of(NonConstants, [=](Value *V) {
20003 return isa<PoisonValue>(Val: V) ||
20004 (IsSingleShuffle && ((IsIdentityShuffle &&
20005 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
20006 }))
20007 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20008 SubVectorsMask);
20009 else
20010 Res = ShuffleBuilder.finalize(
20011 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
20012 [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
20013 bool IsSplat = isSplat(VL: NonConstants);
20014 SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
20015 TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
20016 auto CheckIfSplatIsProfitable = [&]() {
20017 // Estimate the cost of splatting + shuffle and compare with
20018 // insert + shuffle.
20019 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20020 Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
20021 if (isa<ExtractElementInst>(Val: V) || isVectorized(V))
20022 return false;
20023 InstructionCost SplatCost = TTI->getVectorInstrCost(
20024 Opcode: Instruction::InsertElement, Val: VecTy, CostKind, /*Index=*/0,
20025 Op0: PoisonValue::get(T: VecTy), Op1: V);
20026 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20027 for (auto [Idx, I] : enumerate(First&: BVMask))
20028 if (I != PoisonMaskElem)
20029 NewMask[Idx] = Mask.size();
20030 SplatCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy,
20031 Mask: NewMask, CostKind);
20032 InstructionCost BVCost = TTI->getVectorInstrCost(
20033 Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
20034 Index: *find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem)), Op0: Vec, Op1: V);
20035 // Shuffle required?
20036 if (count(Range&: BVMask, Element: PoisonMaskElem) <
20037 static_cast<int>(BVMask.size() - 1)) {
20038 SmallVector<int> NewMask(Mask.begin(), Mask.end());
20039 for (auto [Idx, I] : enumerate(First&: BVMask))
20040 if (I != PoisonMaskElem)
20041 NewMask[Idx] = I;
20042 BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
20043 Tp: VecTy, Mask: NewMask, CostKind);
20044 }
20045 return SplatCost <= BVCost;
20046 };
20047 if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
20048 for (auto [Idx, I] : enumerate(First&: BVMask))
20049 if (I != PoisonMaskElem)
20050 Mask[Idx] = I;
20051 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
20052 } else {
20053 Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
20054 SmallVector<Value *> Values(NonConstants.size(),
20055 PoisonValue::get(T: ScalarTy));
20056 Values[0] = V;
20057 Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
20058 SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
20059 transform(BVMask, SplatMask.begin(), [](int I) {
20060 return I == PoisonMaskElem ? PoisonMaskElem : 0;
20061 });
20062 if (!ShuffleVectorInst::isIdentityMask(Mask: SplatMask, NumSrcElts: VF))
20063 BV = CreateShuffle(BV, nullptr, SplatMask);
20064 for (auto [Idx, I] : enumerate(First&: BVMask))
20065 if (I != PoisonMaskElem)
20066 Mask[Idx] = BVMask.size() + Idx;
20067 Vec = CreateShuffle(Vec, BV, Mask);
20068 for (auto [Idx, I] : enumerate(First&: Mask))
20069 if (I != PoisonMaskElem)
20070 Mask[Idx] = Idx;
20071 }
20072 });
20073 } else if (!allConstant(VL: GatheredScalars)) {
20074 // Gather unique scalars and all constants.
20075 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
20076 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
20077 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
20078 ShuffleBuilder.add(BV, ReuseMask);
20079 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20080 SubVectorsMask);
20081 } else {
20082 // Gather all constants.
20083 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
20084 for (auto [I, V] : enumerate(First&: GatheredScalars)) {
20085 if (!isa<PoisonValue>(Val: V))
20086 Mask[I] = I;
20087 }
20088 Value *BV = ShuffleBuilder.gather(GatheredScalars);
20089 ShuffleBuilder.add(BV, Mask);
20090 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
20091 SubVectorsMask);
20092 }
20093
20094 if (NeedFreeze)
20095 Res = ShuffleBuilder.createFreeze(Res);
20096 return Res;
20097}
20098
20099Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  // Do not do this for a split vectorize node that is marked to be a
  // gather/buildvector.
20101 if (E->State != TreeEntry::SplitVectorize ||
20102 !TransformedToGatherNodes.contains(Val: E)) {
20103 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
20104 (void)vectorizeTree(E: VectorizableTree[EIdx].get());
20105 }
20106 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
20107 Params&: Builder, Params&: *this);
20108}
20109
/// \returns \p Inst after propagating metadata from \p VL, considering only
/// the instructions in \p VL.
20112static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
20113 SmallVector<Value *> Insts;
20114 for (Value *V : VL)
20115 if (isa<Instruction>(Val: V))
20116 Insts.push_back(Elt: V);
20117 return llvm::propagateMetadata(I: Inst, VL: Insts);
20118}
20119
20120static DebugLoc getDebugLocFromPHI(PHINode &PN) {
20121 if (DebugLoc DL = PN.getDebugLoc())
20122 return DL;
20123 return DebugLoc::getUnknown();
20124}
20125
20126Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
20127 IRBuilderBase::InsertPointGuard Guard(Builder);
20128
20129 Value *V = E->Scalars.front();
20130 Type *ScalarTy = V->getType();
20131 if (!isa<CmpInst>(Val: V))
20132 ScalarTy = getValueType(V);
20133 auto It = MinBWs.find(Val: E);
20134 if (It != MinBWs.end()) {
20135 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
20136 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
20137 if (VecTy)
20138 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
20139 }
20140 if (E->VectorizedValue)
20141 return E->VectorizedValue;
20142 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
20143 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
20144 // Set insert point for non-reduction initial nodes.
20145 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
20146 setInsertPointAfterBundle(E);
20147 Value *Vec = createBuildVector(E, ScalarTy);
20148 E->VectorizedValue = Vec;
20149 return Vec;
20150 }
20151 if (E->State == TreeEntry::SplitVectorize) {
20152 assert(E->CombinedEntriesWithIndices.size() == 2 &&
20153 "Expected exactly 2 combined entries.");
20154 setInsertPointAfterBundle(E);
20155 TreeEntry &OpTE1 =
20156 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
20157 assert(OpTE1.isSame(
20158 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
20159 "Expected same first part of scalars.");
20160 Value *Op1 = vectorizeTree(E: &OpTE1);
20161 TreeEntry &OpTE2 =
20162 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
20163 assert(
20164 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
20165 "Expected same second part of scalars.");
20166 Value *Op2 = vectorizeTree(E: &OpTE2);
20167 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
20168 bool IsSigned = false;
20169 auto It = MinBWs.find(Val: OpE);
20170 if (It != MinBWs.end())
20171 IsSigned = It->second.second;
20172 else
20173 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
20174 if (isa<PoisonValue>(Val: V))
20175 return false;
20176 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20177 });
20178 return IsSigned;
20179 };
20180 if (cast<VectorType>(Val: Op1->getType())->getElementType() !=
20181 ScalarTy->getScalarType()) {
20182 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20183 Op1 = Builder.CreateIntCast(
20184 V: Op1,
20185 DestTy: getWidenedType(
20186 ScalarTy,
20187 VF: cast<FixedVectorType>(Val: Op1->getType())->getNumElements()),
20188 isSigned: GetOperandSignedness(&OpTE1));
20189 }
20190 if (cast<VectorType>(Val: Op2->getType())->getElementType() !=
20191 ScalarTy->getScalarType()) {
20192 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20193 Op2 = Builder.CreateIntCast(
20194 V: Op2,
20195 DestTy: getWidenedType(
20196 ScalarTy,
20197 VF: cast<FixedVectorType>(Val: Op2->getType())->getNumElements()),
20198 isSigned: GetOperandSignedness(&OpTE2));
20199 }
20200 if (E->ReorderIndices.empty()) {
20201 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
20202 std::iota(
20203 first: Mask.begin(),
20204 last: std::next(x: Mask.begin(), n: E->CombinedEntriesWithIndices.back().second),
20205 value: 0);
20206 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
20207 if (ScalarTyNumElements != 1) {
20208 assert(SLPReVec && "Only supported by REVEC.");
20209 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask);
20210 }
20211 Value *Vec = Builder.CreateShuffleVector(V: Op1, Mask);
20212 Vec = createInsertVector(Builder, Vec, V: Op2,
20213 Index: E->CombinedEntriesWithIndices.back().second *
20214 ScalarTyNumElements);
20215 E->VectorizedValue = Vec;
20216 return Vec;
20217 }
20218 unsigned CommonVF =
20219 std::max(a: OpTE1.getVectorFactor(), b: OpTE2.getVectorFactor());
20220 if (getNumElements(Ty: Op1->getType()) != CommonVF) {
20221 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20222 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE1.getVectorFactor()),
20223 value: 0);
20224 Op1 = Builder.CreateShuffleVector(V: Op1, Mask);
20225 }
20226 if (getNumElements(Ty: Op2->getType()) != CommonVF) {
20227 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20228 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: OpTE2.getVectorFactor()),
20229 value: 0);
20230 Op2 = Builder.CreateShuffleVector(V: Op2, Mask);
20231 }
20232 Value *Vec = Builder.CreateShuffleVector(V1: Op1, V2: Op2, Mask: E->getSplitMask());
20233 E->VectorizedValue = Vec;
20234 return Vec;
20235 }
20236
20237 bool IsReverseOrder =
20238 !E->ReorderIndices.empty() && isReverseOrder(Order: E->ReorderIndices);
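  // Applies the final reordering / reuse shuffle (and the insertion of any
  // combined subvector entries) to the freshly vectorized value of the entry.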
20239 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
20240 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
20241 if (E->getOpcode() == Instruction::Store &&
20242 E->State == TreeEntry::Vectorize) {
20243 ArrayRef<int> Mask =
20244 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
20245 E->ReorderIndices.size());
20246 ShuffleBuilder.add(V1: V, Mask);
20247 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
20248 E->State == TreeEntry::CompressVectorize) {
20249 ShuffleBuilder.addOrdered(V1: V, Order: {});
20250 } else {
20251 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
20252 }
20253 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
20254 E->CombinedEntriesWithIndices.size());
20255 transform(
20256 Range: E->CombinedEntriesWithIndices, d_first: SubVectors.begin(), F: [&](const auto &P) {
20257 return std::make_pair(VectorizableTree[P.first].get(), P.second);
20258 });
20259 assert(
20260 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
20261 "Expected either combined subnodes or reordering");
20262 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices, SubVectors, SubVectorsMask: {});
20263 };
20264
20265 assert(!E->isGather() && "Unhandled state");
20266 unsigned ShuffleOrOp =
20267 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
20268 if (!E->isAltShuffle()) {
20269 switch (E->CombinedOp) {
20270 case TreeEntry::ReducedBitcast:
20271 case TreeEntry::ReducedBitcastBSwap:
20272 ShuffleOrOp = E->CombinedOp;
20273 break;
20274 default:
20275 break;
20276 }
20277 }
20278 Instruction *VL0 = E->getMainOp();
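// Returns whether the Idx-th operand has to be treated as signed when it is
// cast/widened: either from its MinBWs entry or, failing that, from a value
// analysis of its scalars.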
20279 auto GetOperandSignedness = [&](unsigned Idx) {
20280 const TreeEntry *OpE = getOperandEntry(E, Idx);
20281 bool IsSigned = false;
20282 auto It = MinBWs.find(Val: OpE);
20283 if (It != MinBWs.end())
20284 IsSigned = It->second.second;
20285 else
20286 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
20287 if (isa<PoisonValue>(Val: R))
20288 return false;
20289 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20290 });
20291 return IsSigned;
20292 };
20293 switch (ShuffleOrOp) {
20294 case Instruction::PHI: {
20295 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
20296 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
20297 "PHI reordering is free.");
20298 auto *PH = cast<PHINode>(Val: VL0);
20299 Builder.SetInsertPoint(TheBB: PH->getParent(),
20300 IP: PH->getParent()->getFirstNonPHIIt());
20301 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20302 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
20303 Value *V = NewPhi;
20304
20305 // Adjust insertion point once all PHIs have been generated.
20306 Builder.SetInsertPoint(TheBB: PH->getParent(),
20307 IP: PH->getParent()->getFirstInsertionPt());
20308 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20309
20310 V = FinalShuffle(V, E);
20311
20312 E->VectorizedValue = V;
20313 // If the phi node is fully emitted, exit.
20314 if (NewPhi->getNumIncomingValues() != 0)
20315 return NewPhi;
20316
20317 // PHINodes may have multiple entries from the same block. We want to
20318 // visit every block once.
20319 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
20320 for (unsigned I : seq<unsigned>(Size: PH->getNumIncomingValues())) {
20321 BasicBlock *IBB = PH->getIncomingBlock(i: I);
20322
20323 // Stop emission if all incoming values are generated.
20324 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
20325 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
20326 return NewPhi;
20327 }
20328
20329 auto Res = VisitedBBs.try_emplace(Key: IBB, Args&: I);
20330 if (!Res.second) {
20331 TreeEntry *OpTE = getOperandEntry(E, Idx: I);
20332 if (OpTE->isGather() || DeletedNodes.contains(Ptr: OpTE) ||
20333 TransformedToGatherNodes.contains(Val: OpTE)) {
20334 Value *VecOp = NewPhi->getIncomingValue(i: Res.first->getSecond());
20335 NewPhi->addIncoming(V: VecOp, BB: IBB);
20336 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
20337 OpTE->VectorizedValue = VecOp;
20338 continue;
20339 }
20340 }
20341
20342 Builder.SetInsertPoint(IBB->getTerminator());
20343 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20344 Value *Vec = vectorizeOperand(E, NodeIdx: I);
20345 if (VecTy != Vec->getType()) {
20346 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
20347 MinBWs.contains(getOperandEntry(E, I))) &&
20348 "Expected item in MinBWs.");
20349 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
20350 }
20351 NewPhi->addIncoming(V: Vec, BB: IBB);
20352 }
20353
20354 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
20355 "Invalid number of incoming values");
20356 assert(E->VectorizedValue && "Expected vectorized value.");
20357 return E->VectorizedValue;
20358 }
20359
20360 case Instruction::ExtractElement: {
20361 Value *V = E->getSingleOperand(OpIdx: 0);
20362 setInsertPointAfterBundle(E);
20363 V = FinalShuffle(V, E);
20364 E->VectorizedValue = V;
20365 return V;
20366 }
20367 case Instruction::ExtractValue: {
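// The scalars are extractvalues fed by a single load; reload the value
// directly as a vector of type VecTy from the same pointer.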
20368 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
20369 Builder.SetInsertPoint(LI);
20370 Value *Ptr = LI->getPointerOperand();
20371 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
20372 Value *NewV = ::propagateMetadata(Inst: V, VL: E->Scalars);
20373 NewV = FinalShuffle(NewV, E);
20374 E->VectorizedValue = NewV;
20375 return NewV;
20376 }
20377 case Instruction::InsertElement: {
20378 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20379 if (const TreeEntry *OpE = getOperandEntry(E, Idx: 1);
20380 OpE && !OpE->isGather() && OpE->hasState() &&
20381 !OpE->hasCopyableElements())
20382 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
20383 else
20384 setInsertPointAfterBundle(E);
20385 Value *V = vectorizeOperand(E, NodeIdx: 1);
20386 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
20387 Type *ScalarTy = Op.front()->getType();
20388 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
20389 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20390 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
20391 assert(Res.first > 0 && "Expected item in MinBWs.");
20392 V = Builder.CreateIntCast(
20393 V,
20394 DestTy: getWidenedType(
20395 ScalarTy,
20396 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
20397 isSigned: Res.second);
20398 }
20399
20400 // Create InsertVector shuffle if necessary
20401 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
20402 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
20403 }));
20404 const unsigned NumElts =
20405 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
20406 const unsigned NumScalars = E->Scalars.size();
20407
20408 unsigned Offset = *getElementIndex(Inst: VL0);
20409 assert(Offset < NumElts && "Failed to find vector index offset");
20410
20411 // Create shuffle to resize vector
20412 SmallVector<int> Mask;
20413 if (!E->ReorderIndices.empty()) {
20414 inversePermutation(Indices: E->ReorderIndices, Mask);
20415 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
20416 } else {
20417 Mask.assign(NumElts, Elt: PoisonMaskElem);
20418 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
20419 }
20420 // Check whether the inserts form an identity starting at Offset and build the mask.
20421 bool IsIdentity = true;
20422 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
20423 Mask.swap(RHS&: PrevMask);
20424 for (unsigned I = 0; I < NumScalars; ++I) {
20425 Value *Scalar = E->Scalars[PrevMask[I]];
20426 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
20427 IsIdentity &= InsertIdx - Offset == I;
20428 Mask[InsertIdx - Offset] = I;
20429 }
20430 if (!IsIdentity || NumElts != NumScalars) {
20431 Value *V2 = nullptr;
20432 bool IsVNonPoisonous =
20433 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
20434 SmallVector<int> InsertMask(Mask);
20435 if (NumElts != NumScalars && Offset == 0) {
20436 // Follow all insert element instructions from the current buildvector
20437 // sequence.
20438 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
20439 do {
20440 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
20441 if (!InsertIdx)
20442 break;
20443 if (InsertMask[*InsertIdx] == PoisonMaskElem)
20444 InsertMask[*InsertIdx] = *InsertIdx;
20445 if (!Ins->hasOneUse())
20446 break;
20447 Ins = dyn_cast_or_null<InsertElementInst>(
20448 Val: Ins->getUniqueUndroppableUser());
20449 } while (Ins);
20450 SmallBitVector UseMask =
20451 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
20452 SmallBitVector IsFirstPoison =
20453 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20454 SmallBitVector IsFirstUndef =
20455 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
20456 if (!IsFirstPoison.all()) {
20457 unsigned Idx = 0;
20458 for (unsigned I = 0; I < NumElts; I++) {
20459 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
20460 IsFirstUndef.test(Idx: I)) {
20461 if (IsVNonPoisonous) {
20462 InsertMask[I] = I < NumScalars ? I : 0;
20463 continue;
20464 }
20465 if (!V2)
20466 V2 = UndefValue::get(T: V->getType());
20467 if (Idx >= NumScalars)
20468 Idx = NumScalars - 1;
20469 InsertMask[I] = NumScalars + Idx;
20470 ++Idx;
20471 } else if (InsertMask[I] != PoisonMaskElem &&
20472 Mask[I] == PoisonMaskElem) {
20473 InsertMask[I] = PoisonMaskElem;
20474 }
20475 }
20476 } else {
20477 InsertMask = Mask;
20478 }
20479 }
20480 if (!V2)
20481 V2 = PoisonValue::get(T: V->getType());
20482 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
20483 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20484 GatherShuffleExtractSeq.insert(X: I);
20485 CSEBlocks.insert(V: I->getParent());
20486 }
20487 }
20488
20489 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
20490 for (unsigned I = 0; I < NumElts; I++) {
20491 if (Mask[I] != PoisonMaskElem)
20492 InsertMask[Offset + I] = I;
20493 }
20494 SmallBitVector UseMask =
20495 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
20496 SmallBitVector IsFirstUndef =
20497 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
20498 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20499 NumElts != NumScalars) {
20500 if (IsFirstUndef.all()) {
20501 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
20502 SmallBitVector IsFirstPoison =
20503 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20504 if (!IsFirstPoison.all()) {
20505 for (unsigned I = 0; I < NumElts; I++) {
20506 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
20507 InsertMask[I] = I + NumElts;
20508 }
20509 }
20510 V = Builder.CreateShuffleVector(
20511 V1: V,
20512 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
20513 : FirstInsert->getOperand(i: 0),
20514 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
20515 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20516 GatherShuffleExtractSeq.insert(X: I);
20517 CSEBlocks.insert(V: I->getParent());
20518 }
20519 }
20520 } else {
20521 SmallBitVector IsFirstPoison =
20522 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20523 for (unsigned I = 0; I < NumElts; I++) {
20524 if (InsertMask[I] == PoisonMaskElem)
20525 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
20526 else
20527 InsertMask[I] += NumElts;
20528 }
20529 V = Builder.CreateShuffleVector(
20530 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
20531 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
20532 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20533 GatherShuffleExtractSeq.insert(X: I);
20534 CSEBlocks.insert(V: I->getParent());
20535 }
20536 }
20537 }
20538
20539 ++NumVectorInstructions;
20540 E->VectorizedValue = V;
20541 return V;
20542 }
20543 case Instruction::ZExt:
20544 case Instruction::SExt:
20545 case Instruction::FPToUI:
20546 case Instruction::FPToSI:
20547 case Instruction::FPExt:
20548 case Instruction::PtrToInt:
20549 case Instruction::IntToPtr:
20550 case Instruction::SIToFP:
20551 case Instruction::UIToFP:
20552 case Instruction::Trunc:
20553 case Instruction::FPTrunc:
20554 case Instruction::BitCast: {
20555 setInsertPointAfterBundle(E);
20556
20557 Value *InVec = vectorizeOperand(E, NodeIdx: 0);
20558
20559 auto *CI = cast<CastInst>(Val: VL0);
20560 Instruction::CastOps VecOpcode = CI->getOpcode();
20561 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
20562 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
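// If either the result or the source was demoted via MinBWs, the original
// cast opcode may no longer match the narrowed types; recompute it below.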
20563 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
20564 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20565 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType()->getScalarType())) {
20566 // Check if the values are candidates to demote.
20567 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
20568 if (SrcIt != MinBWs.end())
20569 SrcBWSz = SrcIt->second.first;
20570 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
20571 if (BWSz == SrcBWSz) {
20572 VecOpcode = Instruction::BitCast;
20573 } else if (BWSz < SrcBWSz) {
20574 VecOpcode = Instruction::Trunc;
20575 } else if (It != MinBWs.end()) {
20576 assert(BWSz > SrcBWSz && "Invalid cast!");
20577 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20578 } else if (SrcIt != MinBWs.end()) {
20579 assert(BWSz > SrcBWSz && "Invalid cast!");
20580 VecOpcode =
20581 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20582 }
20583 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20584 !SrcIt->second.second) {
20585 VecOpcode = Instruction::UIToFP;
20586 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
20587 ScalarTy->isFPOrFPVectorTy()) {
20588 Type *OrigSrcScalarTy = CI->getSrcTy();
20589 auto *OrigSrcVectorTy =
20590 getWidenedType(ScalarTy: OrigSrcScalarTy, VF: E->Scalars.size());
20591 InVec =
20592 Builder.CreateIntCast(V: InVec, DestTy: OrigSrcVectorTy, isSigned: SrcIt->second.second);
20593 }
20594 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20595 ? InVec
20596 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
20597 V = FinalShuffle(V, E);
20598
20599 E->VectorizedValue = V;
20600 ++NumVectorInstructions;
20601 return V;
20602 }
20603 case Instruction::FCmp:
20604 case Instruction::ICmp: {
20605 setInsertPointAfterBundle(E);
20606
20607 Value *L = vectorizeOperand(E, NodeIdx: 0);
20608 Value *R = vectorizeOperand(E, NodeIdx: 1);
20609 if (L->getType() != R->getType()) {
20610 assert((getOperandEntry(E, 0)->isGather() ||
20611 getOperandEntry(E, 1)->isGather() ||
20612 MinBWs.contains(getOperandEntry(E, 0)) ||
20613 MinBWs.contains(getOperandEntry(E, 1))) &&
20614 "Expected item in MinBWs.");
20615 if (cast<VectorType>(Val: L->getType())
20616 ->getElementType()
20617 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
20618 ->getElementType()
20619 ->getIntegerBitWidth()) {
20620 Type *CastTy = R->getType();
20621 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
20622 } else {
20623 Type *CastTy = L->getType();
20624 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
20625 }
20626 }
20627
20628 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
20629 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
20630 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
20631 if (auto *ICmp = dyn_cast<ICmpInst>(Val: V); ICmp && It == MinBWs.end())
20632 ICmp->setSameSign(/*B=*/false);
20633 // Do not cast for cmps.
20634 VecTy = cast<FixedVectorType>(Val: V->getType());
20635 V = FinalShuffle(V, E);
20636
20637 E->VectorizedValue = V;
20638 ++NumVectorInstructions;
20639 return V;
20640 }
20641 case Instruction::Select: {
20642 setInsertPointAfterBundle(E);
20643
20644 Value *Cond = vectorizeOperand(E, NodeIdx: 0);
20645 Value *True = vectorizeOperand(E, NodeIdx: 1);
20646 Value *False = vectorizeOperand(E, NodeIdx: 2);
20647 if (True->getType() != VecTy || False->getType() != VecTy) {
20648 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
20649 getOperandEntry(E, 2)->isGather() ||
20650 MinBWs.contains(getOperandEntry(E, 1)) ||
20651 MinBWs.contains(getOperandEntry(E, 2))) &&
20652 "Expected item in MinBWs.");
20653 if (True->getType() != VecTy)
20654 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
20655 if (False->getType() != VecTy)
20656 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
20657 }
20658
20659 unsigned CondNumElements = getNumElements(Ty: Cond->getType());
20660 unsigned TrueNumElements = getNumElements(Ty: True->getType());
20661 assert(TrueNumElements >= CondNumElements &&
20662 TrueNumElements % CondNumElements == 0 &&
20663 "Cannot vectorize Instruction::Select");
20664 assert(TrueNumElements == getNumElements(False->getType()) &&
20665 "Cannot vectorize Instruction::Select");
20666 if (CondNumElements != TrueNumElements) {
20667 // When the condition has fewer elements than the true/false values (e.g.
20668 // with REVEC), replicate the condition value to match.
20669 Cond = Builder.CreateShuffleVector(
20670 V: Cond, Mask: createReplicatedMask(ReplicationFactor: TrueNumElements / CondNumElements,
20671 VF: CondNumElements));
20672 }
20673 assert(getNumElements(Cond->getType()) == TrueNumElements &&
20674 "Cannot vectorize Instruction::Select");
20675 Value *V =
20676 Builder.CreateSelectWithUnknownProfile(C: Cond, True, False, DEBUG_TYPE);
20677 V = FinalShuffle(V, E);
20678
20679 E->VectorizedValue = V;
20680 ++NumVectorInstructions;
20681 return V;
20682 }
20683 case Instruction::FNeg: {
20684 setInsertPointAfterBundle(E);
20685
20686 Value *Op = vectorizeOperand(E, NodeIdx: 0);
20687
20688 Value *V = Builder.CreateUnOp(
20689 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
20690 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
20691 if (auto *I = dyn_cast<Instruction>(Val: V))
20692 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
20693
20694 V = FinalShuffle(V, E);
20695
20696 E->VectorizedValue = V;
20697 ++NumVectorInstructions;
20698
20699 return V;
20700 }
20701 case Instruction::Freeze: {
20702 setInsertPointAfterBundle(E);
20703
20704 Value *Op = vectorizeOperand(E, NodeIdx: 0);
20705
20706 if (Op->getType() != VecTy) {
20707 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20708 MinBWs.contains(getOperandEntry(E, 0))) &&
20709 "Expected item in MinBWs.");
20710 Op = Builder.CreateIntCast(V: Op, DestTy: VecTy, isSigned: GetOperandSignedness(0));
20711 }
20712 Value *V = Builder.CreateFreeze(V: Op);
20713 V = FinalShuffle(V, E);
20714
20715 E->VectorizedValue = V;
20716 ++NumVectorInstructions;
20717
20718 return V;
20719 }
20720 case Instruction::Add:
20721 case Instruction::FAdd:
20722 case Instruction::Sub:
20723 case Instruction::FSub:
20724 case Instruction::Mul:
20725 case Instruction::FMul:
20726 case Instruction::UDiv:
20727 case Instruction::SDiv:
20728 case Instruction::FDiv:
20729 case Instruction::URem:
20730 case Instruction::SRem:
20731 case Instruction::FRem:
20732 case Instruction::Shl:
20733 case Instruction::LShr:
20734 case Instruction::AShr:
20735 case Instruction::And:
20736 case Instruction::Or:
20737 case Instruction::Xor: {
20738 setInsertPointAfterBundle(E);
20739
20740 Value *LHS = vectorizeOperand(E, NodeIdx: 0);
20741 Value *RHS = vectorizeOperand(E, NodeIdx: 1);
20742 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
20743 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
20744 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
20745 if (all_of(Range&: Ops, P: [&](Value *Op) {
20746 auto *CI = dyn_cast<ConstantInt>(Val: Op);
20747 return CI && CI->getValue().countr_one() >= It->second.first;
20748 })) {
20749 Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
20750 E->VectorizedValue = V;
20751 ++NumVectorInstructions;
20752 return V;
20753 }
20754 }
20755 }
20756 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
20757 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
20758 getOperandEntry(E, 1)->isGather() ||
20759 MinBWs.contains(getOperandEntry(E, 0)) ||
20760 MinBWs.contains(getOperandEntry(E, 1))) &&
20761 "Expected item in MinBWs.");
20762 if (LHS->getType() != VecTy)
20763 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
20764 if (RHS->getType() != VecTy)
20765 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
20766 }
20767
20768 Value *V = Builder.CreateBinOp(
20769 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
20770 RHS);
20771 propagateIRFlags(I: V, VL: E->Scalars, OpValue: nullptr, IncludeWrapFlags: It == MinBWs.end());
20772 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20773 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
20774 // Drop nuw flags for abs(sub(commutative), true).
20775 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
20776 any_of(Range&: E->Scalars, P: [E](Value *V) {
20777 return isa<PoisonValue>(Val: V) ||
20778 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
20779 isCommutative(I: cast<Instruction>(Val: V));
20780 }))
20781 I->setHasNoUnsignedWrap(/*b=*/false);
20782 }
20783
20784 V = FinalShuffle(V, E);
20785
20786 E->VectorizedValue = V;
20787 ++NumVectorInstructions;
20788
20789 return V;
20790 }
20791 case Instruction::Load: {
20792 // Loads are inserted at the head of the tree because we don't want to
20793 // sink them all the way down past store instructions.
20794 setInsertPointAfterBundle(E);
20795
20796 LoadInst *LI = cast<LoadInst>(Val: VL0);
20797 Instruction *NewLI;
20798 FixedVectorType *StridedLoadTy = nullptr;
20799 Value *PO = LI->getPointerOperand();
20800 if (E->State == TreeEntry::Vectorize) {
20801 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
20802 } else if (E->State == TreeEntry::CompressVectorize) {
20803 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
20804 CompressEntryToData.at(Val: E);
20805 Align CommonAlignment = LI->getAlign();
20806 if (IsMasked) {
20807 unsigned VF = getNumElements(Ty: LoadVecTy);
20808 SmallVector<Constant *> MaskValues(
20809 VF / getNumElements(Ty: LI->getType()),
20810 ConstantInt::getFalse(Context&: VecTy->getContext()));
20811 for (int I : CompressMask)
20812 MaskValues[I] = ConstantInt::getTrue(Context&: VecTy->getContext());
20813 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
20814 assert(SLPReVec && "Only supported by REVEC.");
20815 MaskValues = replicateMask(Val: MaskValues, VF: VecTy->getNumElements());
20816 }
20817 Constant *MaskValue = ConstantVector::get(V: MaskValues);
20818 NewLI = Builder.CreateMaskedLoad(Ty: LoadVecTy, Ptr: PO, Alignment: CommonAlignment,
20819 Mask: MaskValue);
20820 } else {
20821 NewLI = Builder.CreateAlignedLoad(Ty: LoadVecTy, Ptr: PO, Align: CommonAlignment);
20822 }
20823 NewLI = ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
20824 // TODO: include this cost into CommonCost.
20825 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
20826 assert(SLPReVec && "FixedVectorType is not expected.");
20827 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(),
20828 Mask&: CompressMask);
20829 }
20830 NewLI =
20831 cast<Instruction>(Val: Builder.CreateShuffleVector(V: NewLI, Mask: CompressMask));
20832 } else if (E->State == TreeEntry::StridedVectorize) {
20833 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
20834 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
20835 PO = IsReverseOrder ? PtrN : Ptr0;
20836 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
20837 Value *StrideVal;
20838 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
20839 StridedLoadTy = SPtrInfo.Ty;
20840 assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
20841 unsigned StridedLoadEC =
20842 StridedLoadTy->getElementCount().getKnownMinValue();
20843
20844 Value *Stride = SPtrInfo.StrideVal;
20845 if (!Stride) {
20846 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
20847 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
20848 SCEVExpander Expander(*SE, "strided-load-vec");
20849 Stride = Expander.expandCodeFor(SH: StrideSCEV, Ty: StrideSCEV->getType(),
20850 I: &*Builder.GetInsertPoint());
20851 }
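// Scale the stride by the element size in bytes and negate it when the
// scalars are accessed in reverse order.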
20852 Value *NewStride =
20853 Builder.CreateIntCast(V: Stride, DestTy: StrideTy, /*isSigned=*/true);
20854 StrideVal = Builder.CreateMul(
20855 LHS: NewStride, RHS: ConstantInt::getSigned(
20856 Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) *
20857 static_cast<int>(
20858 DL->getTypeAllocSize(Ty: ScalarTy))));
20859 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
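// For example, a reverse-order strided load of 4 x i32 is emitted roughly as:
//   call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
//       ptr %ptrN, i64 -4, <4 x i1> splat (i1 true), i32 4)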
20860 auto *Inst = Builder.CreateIntrinsic(
20861 ID: Intrinsic::experimental_vp_strided_load,
20862 Types: {StridedLoadTy, PO->getType(), StrideTy},
20863 Args: {PO, StrideVal,
20864 Builder.getAllOnesMask(NumElts: ElementCount::getFixed(MinVal: StridedLoadEC)),
20865 Builder.getInt32(C: StridedLoadEC)});
20866 Inst->addParamAttr(
20867 /*ArgNo=*/0,
20868 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
20869 NewLI = Inst;
20870 } else {
20871 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
20872 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0);
20873 if (isa<FixedVectorType>(Val: ScalarTy)) {
20874 assert(SLPReVec && "FixedVectorType is not expected.");
20875 // CreateMaskedGather expects VecTy and VecPtr to have the same size. We
20876 // need to expand VecPtr if ScalarTy is a vector type.
20877 unsigned ScalarTyNumElements =
20878 cast<FixedVectorType>(Val: ScalarTy)->getNumElements();
20879 unsigned VecTyNumElements =
20880 cast<FixedVectorType>(Val: VecTy)->getNumElements();
20881 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
20882 "Cannot expand getelementptr.");
20883 unsigned VF = VecTyNumElements / ScalarTyNumElements;
20884 SmallVector<Constant *> Indices(VecTyNumElements);
20885 transform(Range: seq(Size: VecTyNumElements), d_first: Indices.begin(), F: [=](unsigned I) {
20886 return Builder.getInt64(C: I % ScalarTyNumElements);
20887 });
20888 VecPtr = Builder.CreateGEP(
20889 Ty: VecTy->getElementType(),
20890 Ptr: Builder.CreateShuffleVector(
20891 V: VecPtr, Mask: createReplicatedMask(ReplicationFactor: ScalarTyNumElements, VF)),
20892 IdxList: ConstantVector::get(V: Indices));
20893 }
20894 // Use the minimum alignment of the gathered loads.
20895 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
20896 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
20897 }
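// Metadata for compressed loads has already been propagated above.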
20898 Value *V = E->State == TreeEntry::CompressVectorize
20899 ? NewLI
20900 : ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
20901
20902 if (StridedLoadTy != VecTy)
20903 V = Builder.CreateBitOrPointerCast(V, DestTy: VecTy);
20904 V = FinalShuffle(V, E);
20905 E->VectorizedValue = V;
20906 ++NumVectorInstructions;
20907 return V;
20908 }
20909 case Instruction::Store: {
20910 auto *SI = cast<StoreInst>(Val: VL0);
20911
20912 setInsertPointAfterBundle(E);
20913
20914 Value *VecValue = vectorizeOperand(E, NodeIdx: 0);
20915 if (VecValue->getType() != VecTy)
20916 VecValue =
20917 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
20918 VecValue = FinalShuffle(VecValue, E);
20919
20920 Value *Ptr = SI->getPointerOperand();
20921 Instruction *ST;
20922 if (E->State == TreeEntry::Vectorize) {
20923 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
20924 } else {
20925 assert(E->State == TreeEntry::StridedVectorize &&
20926 "Expected either strided or consecutive stores.");
20927 if (!E->ReorderIndices.empty()) {
20928 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
20929 Ptr = SI->getPointerOperand();
20930 }
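// Emit a strided store with a negative one-element stride, so the scalars
// are stored to decreasing addresses starting from Ptr.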
20931 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
20932 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
20933 auto *Inst = Builder.CreateIntrinsic(
20934 ID: Intrinsic::experimental_vp_strided_store,
20935 Types: {VecTy, Ptr->getType(), StrideTy},
20936 Args: {VecValue, Ptr,
20937 ConstantInt::getSigned(
20938 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
20939 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
20940 Builder.getInt32(C: E->Scalars.size())});
20941 Inst->addParamAttr(
20942 /*ArgNo=*/1,
20943 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
20944 ST = Inst;
20945 }
20946
20947 Value *V = ::propagateMetadata(Inst: ST, VL: E->Scalars);
20948
20949 E->VectorizedValue = V;
20950 ++NumVectorInstructions;
20951 return V;
20952 }
20953 case Instruction::GetElementPtr: {
20954 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
20955 setInsertPointAfterBundle(E);
20956
20957 Value *Op0 = vectorizeOperand(E, NodeIdx: 0);
20958
20959 SmallVector<Value *> OpVecs;
20960 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
20961 Value *OpVec = vectorizeOperand(E, NodeIdx: J);
20962 OpVecs.push_back(Elt: OpVec);
20963 }
20964
20965 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
20966 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
20967 SmallVector<Value *> GEPs;
20968 for (Value *V : E->Scalars) {
20969 if (isa<GetElementPtrInst>(Val: V))
20970 GEPs.push_back(Elt: V);
20971 }
20972 V = ::propagateMetadata(Inst: I, VL: GEPs);
20973 }
20974
20975 V = FinalShuffle(V, E);
20976
20977 E->VectorizedValue = V;
20978 ++NumVectorInstructions;
20979
20980 return V;
20981 }
20982 case Instruction::Call: {
20983 CallInst *CI = cast<CallInst>(Val: VL0);
20984 setInsertPointAfterBundle(E);
20985
20986 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
20987
20988 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
20989 CI, ID, VF: VecTy->getNumElements(),
20990 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
20991 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
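// Prefer the intrinsic form when its cost does not exceed the cost of the
// vectorized library call.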
20992 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
20993 VecCallCosts.first <= VecCallCosts.second;
20994
20995 Value *ScalarArg = nullptr;
20996 SmallVector<Value *> OpVecs;
20997 SmallVector<Type *, 2> TysForDecl;
20998 // Add return type if intrinsic is overloaded on it.
20999 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1, TTI))
21000 TysForDecl.push_back(Elt: VecTy);
21001 auto *CEI = cast<CallInst>(Val: VL0);
21002 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
21003 // Some intrinsics have scalar arguments. Such arguments should not be
21004 // vectorized.
21005 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) {
21006 ScalarArg = CEI->getArgOperand(i: I);
21007 // If the bitwidth of the abs intrinsic is reduced, its second argument
21008 // must be set to false (do not return poison if the value is the signed minimum).
21009 if (ID == Intrinsic::abs && It != MinBWs.end() &&
21010 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
21011 ScalarArg = Builder.getFalse();
21012 OpVecs.push_back(Elt: ScalarArg);
21013 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
21014 TysForDecl.push_back(Elt: ScalarArg->getType());
21015 continue;
21016 }
21017
21018 Value *OpVec = vectorizeOperand(E, NodeIdx: I);
21019 ScalarArg = CEI->getArgOperand(i: I);
21020 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
21021 ScalarArg->getType()->getScalarType() &&
21022 It == MinBWs.end()) {
21023 auto *CastTy =
21024 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
21025 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
21026 } else if (It != MinBWs.end()) {
21027 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
21028 }
21029 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
21030 OpVecs.push_back(Elt: OpVec);
21031 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
21032 TysForDecl.push_back(Elt: OpVec->getType());
21033 }
21034
21035 Function *CF;
21036 if (!UseIntrinsic) {
21037 VFShape Shape =
21038 VFShape::get(FTy: CI->getFunctionType(),
21039 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
21040 HasGlobalPred: false /*HasGlobalPred*/);
21041 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
21042 } else {
21043 CF = Intrinsic::getOrInsertDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
21044 }
21045
21046 SmallVector<OperandBundleDef, 1> OpBundles;
21047 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
21048 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
21049
21050 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21051 V = FinalShuffle(V, E);
21052
21053 E->VectorizedValue = V;
21054 ++NumVectorInstructions;
21055 return V;
21056 }
21057 case Instruction::ShuffleVector: {
21058 Value *V;
21059 if (SLPReVec && !E->isAltShuffle()) {
21060 setInsertPointAfterBundle(E);
21061 Value *Src = vectorizeOperand(E, NodeIdx: 0);
21062 SmallVector<int> ThisMask(calculateShufflevectorMask(VL: E->Scalars));
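// If the source is itself a shufflevector, fold the two shuffles into one by
// composing the masks.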
21063 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Val: Src)) {
21064 SmallVector<int> NewMask(ThisMask.size());
21065 transform(Range&: ThisMask, d_first: NewMask.begin(), F: [&SVSrc](int Mask) {
21066 return SVSrc->getShuffleMask()[Mask];
21067 });
21068 V = Builder.CreateShuffleVector(V1: SVSrc->getOperand(i_nocapture: 0),
21069 V2: SVSrc->getOperand(i_nocapture: 1), Mask: NewMask);
21070 } else {
21071 V = Builder.CreateShuffleVector(V: Src, Mask: ThisMask);
21072 }
21073 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21074 if (auto *I = dyn_cast<Instruction>(Val: V))
21075 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21076 V = FinalShuffle(V, E);
21077 } else {
21078 assert(E->isAltShuffle() &&
21079 ((Instruction::isBinaryOp(E->getOpcode()) &&
21080 Instruction::isBinaryOp(E->getAltOpcode())) ||
21081 (Instruction::isCast(E->getOpcode()) &&
21082 Instruction::isCast(E->getAltOpcode())) ||
21083 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
21084 "Invalid Shuffle Vector Operand");
21085
21086 Value *LHS = nullptr, *RHS = nullptr;
21087 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
21088 setInsertPointAfterBundle(E);
21089 LHS = vectorizeOperand(E, NodeIdx: 0);
21090 RHS = vectorizeOperand(E, NodeIdx: 1);
21091 } else {
21092 setInsertPointAfterBundle(E);
21093 LHS = vectorizeOperand(E, NodeIdx: 0);
21094 }
21095 if (LHS && RHS &&
21096 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
21097 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
21098 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
21099 assert((It != MinBWs.end() ||
21100 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
21101 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
21102 MinBWs.contains(getOperandEntry(E, 0)) ||
21103 MinBWs.contains(getOperandEntry(E, 1))) &&
21104 "Expected item in MinBWs.");
21105 Type *CastTy = VecTy;
21106 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
21107 if (cast<VectorType>(Val: LHS->getType())
21108 ->getElementType()
21109 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
21110 ->getElementType()
21111 ->getIntegerBitWidth())
21112 CastTy = RHS->getType();
21113 else
21114 CastTy = LHS->getType();
21115 }
21116 if (LHS->getType() != CastTy)
21117 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
21118 if (RHS->getType() != CastTy)
21119 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
21120 }
21121
21122 Value *V0, *V1;
21123 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
21124 V0 = Builder.CreateBinOp(
21125 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
21126 V1 = Builder.CreateBinOp(
21127 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
21128 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
21129 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
21130 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
21131 CmpInst::Predicate AltPred = AltCI->getPredicate();
21132 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
21133 } else {
21134 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
21135 unsigned SrcBWSz = DL->getTypeSizeInBits(
21136 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
21137 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
21138 if (BWSz <= SrcBWSz) {
21139 if (BWSz < SrcBWSz)
21140 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
21141 assert(LHS->getType() == VecTy &&
21142 "Expected same type as operand.");
21143 if (auto *I = dyn_cast<Instruction>(Val: LHS))
21144 LHS = ::propagateMetadata(Inst: I, VL: E->Scalars);
21145 LHS = FinalShuffle(LHS, E);
21146 E->VectorizedValue = LHS;
21147 ++NumVectorInstructions;
21148 return LHS;
21149 }
21150 }
21151 V0 = Builder.CreateCast(
21152 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
21153 V1 = Builder.CreateCast(
21154 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
21155 }
21156 // Add V0 and V1 to later analysis to try to find and remove matching
21157 // instruction, if any.
21158 for (Value *V : {V0, V1}) {
21159 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21160 GatherShuffleExtractSeq.insert(X: I);
21161 CSEBlocks.insert(V: I->getParent());
21162 }
21163 }
21164
21165 // Create shuffle to take alternate operations from the vector.
21166 // Also, gather up main and alt scalar ops to propagate IR flags to
21167 // each vector operation.
21168 ValueList OpScalars, AltScalars;
21169 SmallVector<int> Mask;
21170 E->buildAltOpShuffleMask(
21171 IsAltOp: [E, this](Instruction *I) {
21172 assert(E->getMatchingMainOpOrAltOp(I) &&
21173 "Unexpected main/alternate opcode");
21174 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
21175 TLI: *TLI);
21176 },
21177 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
21178
21179 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
21180 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
21181 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
21182 // Drop nuw flags for abs(sub(commutative), true).
21183 if (auto *I = dyn_cast<Instruction>(Val: Vec);
21184 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
21185 any_of(Range&: E->Scalars, P: [E](Value *V) {
21186 if (isa<PoisonValue>(Val: V))
21187 return false;
21188 if (E->hasCopyableElements() && E->isCopyableElement(V))
21189 return false;
21190 auto *IV = cast<Instruction>(Val: V);
21191 return IV->getOpcode() == Instruction::Sub && isCommutative(I: IV);
21192 }))
21193 I->setHasNoUnsignedWrap(/*b=*/false);
21194 };
21195 DropNuwFlag(V0, E->getOpcode());
21196 DropNuwFlag(V1, E->getAltOpcode());
21197
21198 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
21199 assert(SLPReVec && "FixedVectorType is not expected.");
21200 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), Mask);
21201 }
21202 V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
21203 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21204 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21205 GatherShuffleExtractSeq.insert(X: I);
21206 CSEBlocks.insert(V: I->getParent());
21207 }
21208 }
21209
21210 E->VectorizedValue = V;
21211 ++NumVectorInstructions;
21212
21213 return V;
21214 }
21215 case TreeEntry::ReducedBitcast:
21216 case TreeEntry::ReducedBitcastBSwap: {
21217 assert(UserIgnoreList && "Expected reduction operations only.");
21218 setInsertPointAfterBundle(E);
21219 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
21220 ZExt->VectorizedValue = PoisonValue::get(T: getWidenedType(
21221 ScalarTy: ZExt->getMainOp()->getType(), VF: ZExt->getVectorFactor()));
21222 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
21223 Const->VectorizedValue = PoisonValue::get(T: getWidenedType(
21224 ScalarTy: Const->Scalars.front()->getType(), VF: Const->getVectorFactor()));
21225 Value *Op = vectorizeOperand(E: ZExt, NodeIdx: 0);
21226 // Reset the scalar type to the extension's source type to avoid casting to the extended type.
21227 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
21228 Op = FinalShuffle(Op, E);
21229 auto *V = Builder.CreateBitCast(
21230 V: Op, DestTy: IntegerType::get(
21231 C&: Op->getContext(),
21232 NumBits: DL->getTypeSizeInBits(Ty: ZExt->getMainOp()->getType())));
21233 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap)
21234 V = Builder.CreateUnaryIntrinsic(ID: Intrinsic::bswap, V);
21235 E->VectorizedValue = V;
21236 ++NumVectorInstructions;
21237 return V;
21238 }
21239 default:
21240 llvm_unreachable("unknown inst");
21241 }
21242 return nullptr;
21243}
21244
21245Value *BoUpSLP::vectorizeTree() {
21246 ExtraValueToDebugLocsMap ExternallyUsedValues;
21247 return vectorizeTree(ExternallyUsedValues);
21248}
21249
21250Value *BoUpSLP::vectorizeTree(
21251 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
21252 Instruction *ReductionRoot,
21253 ArrayRef<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales) {
21254 // Clear the Entry-to-LastInstruction table. It can be invalidated by
21255 // scheduling, so it needs to be rebuilt.
21256 EntryToLastInstruction.clear();
21257 // All blocks must be scheduled before any instructions are inserted.
21258 for (auto &BSIter : BlocksSchedules)
21259 scheduleBlock(R: *this, BS: BSIter.second.get());
21260 // Cache last instructions for the nodes to avoid side effects, which may
21261 // appear during vectorization, like extra uses, etc.
21262 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21263 if (TE->isGather() || DeletedNodes.contains(Ptr: TE.get()) ||
21264 (TE->State == TreeEntry::CombinedVectorize &&
21265 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
21266 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap)))
21267 continue;
21268 (void)getLastInstructionInBundle(E: TE.get());
21269 }
21270
21271 if (ReductionRoot)
21272 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
21273 IP: ReductionRoot->getIterator());
21274 else
21275 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
21276
21277 // Vectorize gather operands of the nodes with the external uses only.
21278 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
21279 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21280 if (DeletedNodes.contains(Ptr: TE.get()))
21281 continue;
21282 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
21283 TE->UserTreeIndex.UserTE->hasState() &&
21284 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
21285 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
21286 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
21287 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
21288 all_of(Range&: TE->UserTreeIndex.UserTE->Scalars,
21289 P: [](Value *V) { return isUsedOutsideBlock(V); })) {
21290 Instruction &LastInst =
21291 getLastInstructionInBundle(E: TE->UserTreeIndex.UserTE);
21292 GatherEntries.emplace_back(Args: TE.get(), Args: &LastInst);
21293 }
21294 }
21295 for (auto &Entry : GatherEntries) {
21296 IRBuilderBase::InsertPointGuard Guard(Builder);
21297 Builder.SetInsertPoint(Entry.second);
21298 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
21299 (void)vectorizeTree(E: Entry.first);
21300 }
21301 // Emit gathered loads first to generate better code for the users of
21302 // those gathered loads.
21303 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21304 if (DeletedNodes.contains(Ptr: TE.get()))
21305 continue;
21306 if (GatheredLoadsEntriesFirst.has_value() &&
21307 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
21308 (!TE->isGather() || TE->UserTreeIndex)) {
21309 assert((TE->UserTreeIndex ||
21310 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
21311 "Expected gathered load node.");
21312 (void)vectorizeTree(E: TE.get());
21313 }
21314 }
21315 (void)vectorizeTree(E: VectorizableTree[0].get());
21316 // Run through the list of postponed gathers and emit them, replacing the temp
21317 // emitted allocas with actual vector instructions.
21318 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
21319 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
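// PostponedValues maps an emitted vector to the postponed entries whose
// VectorizedValue currently points at it, so that if that vector is itself
// replaced later, those entries are updated as well.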
21320 for (const TreeEntry *E : PostponedNodes) {
21321 auto *TE = const_cast<TreeEntry *>(E);
21322 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
21323 TE->VectorizedValue = nullptr;
21324 auto *UserI = cast<Instruction>(Val&: TE->UserTreeIndex.UserTE->VectorizedValue);
21325 // If the user is a PHI node, its vector code has to be inserted right
21326 // before the block terminator. Since the node was delayed, there were some
21327 // unresolved dependencies at the moment the stub instruction was emitted.
21328 // If any of these dependencies turns out to be an operand of another PHI
21329 // coming from this same block, the position of the stub instruction becomes
21330 // invalid, because the source vector that is supposed to feed this gather
21331 // node was inserted at the end of the block [after the stub instruction].
21332 // So we need to adjust the insertion point to the end of the block again.
21333 if (isa<PHINode>(Val: UserI) ||
21334 (TE->UserTreeIndex.UserTE->hasState() &&
21335 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
21336 // Insert before all users.
21337 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
21338 for (User *U : PrevVec->users()) {
21339 if (U == UserI)
21340 continue;
21341 auto *UI = dyn_cast<Instruction>(Val: U);
21342 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
21343 continue;
21344 if (UI->comesBefore(Other: InsertPt))
21345 InsertPt = UI;
21346 }
21347 Builder.SetInsertPoint(InsertPt);
21348 } else {
21349 Builder.SetInsertPoint(PrevVec);
21350 }
21351 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
21352 Value *Vec = vectorizeTree(E: TE);
21353 if (auto *VecI = dyn_cast<Instruction>(Val: Vec);
21354 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
21355 Builder.GetInsertPoint()->comesBefore(Other: VecI))
21356 VecI->moveBeforePreserving(BB&: *Builder.GetInsertBlock(),
21357 I: Builder.GetInsertPoint());
21358 if (Vec->getType() != PrevVec->getType()) {
21359 assert(Vec->getType()->isIntOrIntVectorTy() &&
21360 PrevVec->getType()->isIntOrIntVectorTy() &&
21361 "Expected integer vector types only.");
21362 std::optional<bool> IsSigned;
21363 for (Value *V : TE->Scalars) {
21364 if (isVectorized(V)) {
21365 for (const TreeEntry *MNTE : getTreeEntries(V)) {
21366 auto It = MinBWs.find(Val: MNTE);
21367 if (It != MinBWs.end()) {
21368 IsSigned = IsSigned.value_or(u: false) || It->second.second;
21369 if (*IsSigned)
21370 break;
21371 }
21372 }
21373 if (IsSigned.value_or(u: false))
21374 break;
21375 // Scan through gather nodes.
21376 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
21377 auto It = MinBWs.find(Val: BVE);
21378 if (It != MinBWs.end()) {
21379 IsSigned = IsSigned.value_or(u: false) || It->second.second;
21380 if (*IsSigned)
21381 break;
21382 }
21383 }
21384 if (IsSigned.value_or(u: false))
21385 break;
21386 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
21387 IsSigned =
21388 IsSigned.value_or(u: false) ||
21389 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
21390 continue;
21391 }
21392 if (IsSigned.value_or(u: false))
21393 break;
21394 }
21395 }
21396 if (!IsSigned) {
21397 // Final attempt - check user node.
21398 auto It = MinBWs.find(Val: TE->UserTreeIndex.UserTE);
21399 if (It != MinBWs.end())
21400 IsSigned = It->second.second;
21401 }
21402 assert(IsSigned &&
21403 "Expected user node or perfect diamond match in MinBWs.");
21404 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
21405 }
21406 PrevVec->replaceAllUsesWith(V: Vec);
21407 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
21408 // Replace the stub vector node, if it was already used for one of the
21409 // buildvector nodes.
21410 auto It = PostponedValues.find(Val: PrevVec);
21411 if (It != PostponedValues.end()) {
21412 for (TreeEntry *VTE : It->getSecond())
21413 VTE->VectorizedValue = Vec;
21414 }
21415 eraseInstruction(I: PrevVec);
21416 }
21417
21418 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
21419 << " values.\n");
21420
21421 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
21422 // Maps vector instruction to original insertelement instruction
21423 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
21424 // Maps extract Scalar to the corresponding extractelement instruction in the
21425 // basic block. Only one extractelement per block should be emitted.
21426 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
21427 ScalarToEEs;
21428 SmallDenseSet<Value *, 4> UsedInserts;
21429 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
21430 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
21431 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
21432 // Extract all of the elements with the external uses.
21433 for (const auto &ExternalUse : ExternalUses) {
21434 Value *Scalar = ExternalUse.Scalar;
21435 llvm::User *User = ExternalUse.User;
21436
21437 // Skip users that we have already RAUWed. This happens when one
21438 // instruction has multiple uses of the same value.
21439 if (User && !is_contained(Range: Scalar->users(), Element: User))
21440 continue;
21441 const TreeEntry *E = &ExternalUse.E;
21442 assert(E && "Invalid scalar");
21443 assert(!E->isGather() && "Extracting from a gather list");
21444 // Non-instruction pointers are not deleted, just skip them.
21445 if (E->getOpcode() == Instruction::GetElementPtr &&
21446 !isa<GetElementPtrInst>(Val: Scalar))
21447 continue;
21448
21449 Value *Vec = E->VectorizedValue;
21450 assert(Vec && "Can't find vectorizable value");
21451
21452 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
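// Emits (or reuses) an extractelement of Scalar from Vec and, if the tree
// was demoted via MinBWs, widens the result back to the original scalar
// type.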
21453 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21454 if (Scalar->getType() != Vec->getType()) {
21455 Value *Ex = nullptr;
21456 Value *ExV = nullptr;
21457 auto *Inst = dyn_cast<Instruction>(Val: Scalar);
21458 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Ptr: Inst);
21459 auto It = ScalarToEEs.find(Val: Scalar);
21460 if (It != ScalarToEEs.end()) {
21461 // No need to emit multiple extracts, just reuse the one already
21462 // emitted in the current block, moving it if necessary.
21463 auto EEIt = It->second.find(Val: ReplaceInst ? Inst->getParent()
21464 : Builder.GetInsertBlock());
21465 if (EEIt != It->second.end()) {
21466 Value *PrevV = EEIt->second.first;
21467 if (auto *I = dyn_cast<Instruction>(Val: PrevV);
21468 I && !ReplaceInst &&
21469 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21470 Builder.GetInsertPoint()->comesBefore(Other: I)) {
21471 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
21472 I: Builder.GetInsertPoint());
21473 if (auto *CI = dyn_cast<Instruction>(Val: EEIt->second.second))
21474 CI->moveAfter(MovePos: I);
21475 }
21476 Ex = PrevV;
21477 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21478 }
21479 }
21480 if (!Ex) {
21481 // "Reuse" the existing extract to improve final codegen.
21482 if (ReplaceInst) {
21483 // Leave the instruction as is, if it is a cheaper extract and all
21484 // of its operands are scalar.
21485 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) {
21486 IgnoredExtracts.insert(V: EE);
21487 Ex = EE;
21488 } else {
21489 auto *CloneInst = Inst->clone();
21490 CloneInst->insertBefore(InsertPos: Inst->getIterator());
21491 if (Inst->hasName())
21492 CloneInst->takeName(V: Inst);
21493 Ex = CloneInst;
21494 }
21495 } else if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar);
21496 ES && isa<Instruction>(Val: Vec)) {
21497 Value *V = ES->getVectorOperand();
21498 auto *IVec = cast<Instruction>(Val: Vec);
21499 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
21500 V = ETEs.front()->VectorizedValue;
21501 if (auto *IV = dyn_cast<Instruction>(Val: V);
21502 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21503 IV->comesBefore(Other: IVec))
21504 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
21505 else
21506 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
21507 } else if (auto *VecTy =
21508 dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
21509 assert(SLPReVec && "FixedVectorType is not expected.");
21510 unsigned VecTyNumElements = VecTy->getNumElements();
21511 // When REVEC is enabled, we need to extract a vector.
21512 // Note: The element size of Scalar may be different from the
21513 // element size of Vec.
21514 Ex = createExtractVector(Builder, Vec, SubVecVF: VecTyNumElements,
21515 Index: ExternalUse.Lane * VecTyNumElements);
21516 } else {
21517 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
21518 }
21519 // If necessary, sign-extend or zero-extend ScalarRoot
21520 // to the larger type.
21521 ExV = Ex;
21522 if (Scalar->getType() != Ex->getType())
21523 ExV = Builder.CreateIntCast(
21524 V: Ex, DestTy: Scalar->getType(),
21525 isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
21526 auto *I = dyn_cast<Instruction>(Val: Ex);
21527 ScalarToEEs[Scalar].try_emplace(Key: I ? I->getParent()
21528 : &F->getEntryBlock(),
21529 Args: std::make_pair(x&: Ex, y&: ExV));
21530 }
21531 // The then-branch of the previous if may produce constants, since
21532 // operand 0 might be a constant.
21533 if (auto *ExI = dyn_cast<Instruction>(Val: Ex);
21534 ExI && !isa<PHINode>(Val: ExI) && !mayHaveNonDefUseDependency(I: *ExI)) {
21535 GatherShuffleExtractSeq.insert(X: ExI);
21536 CSEBlocks.insert(V: ExI->getParent());
21537 }
21538 return ExV;
21539 }
21540 assert(isa<FixedVectorType>(Scalar->getType()) &&
21541 isa<InsertElementInst>(Scalar) &&
21542 "In-tree scalar of vector type is not insertelement?");
21543 auto *IE = cast<InsertElementInst>(Val: Scalar);
21544 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
21545 return Vec;
21546 };
21547 // If User == nullptr, the Scalar remains scalar in the vectorized
21548 // instructions or is used as an extra argument. Generate an ExtractElement
21549 // instruction and update the record for this scalar in ExternallyUsedValues.
21550 if (!User) {
21551 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
21552 continue;
21553 assert(
21554 (ExternallyUsedValues.count(Scalar) ||
21555 ExternalUsesWithNonUsers.count(Scalar) ||
21556 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21557 any_of(
21558 Scalar->users(),
21559 [&, TTI = TTI](llvm::User *U) {
21560 if (ExternalUsesAsOriginalScalar.contains(U))
21561 return true;
21562 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21563 return !UseEntries.empty() &&
21564 (E->State == TreeEntry::Vectorize ||
21565 E->State == TreeEntry::StridedVectorize ||
21566 E->State == TreeEntry::CompressVectorize) &&
21567 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21568 return (UseEntry->State == TreeEntry::Vectorize ||
21569 UseEntry->State ==
21570 TreeEntry::StridedVectorize ||
21571 UseEntry->State ==
21572 TreeEntry::CompressVectorize) &&
21573 doesInTreeUserNeedToExtract(
21574 Scalar, getRootEntryInstruction(*UseEntry),
21575 TLI, TTI);
21576 });
21577 })) &&
21578 "Scalar with nullptr User must be registered in "
21579 "ExternallyUsedValues map or remain as scalar in vectorized "
21580 "instructions");
21581 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
21582 if (auto *PHI = dyn_cast<PHINode>(Val: VecI)) {
21583 if (PHI->getParent()->isLandingPad())
21584 Builder.SetInsertPoint(
21585 TheBB: PHI->getParent(),
21586 IP: std::next(
21587 x: PHI->getParent()->getLandingPadInst()->getIterator()));
21588 else
21589 Builder.SetInsertPoint(TheBB: PHI->getParent(),
21590 IP: PHI->getParent()->getFirstNonPHIIt());
21591 } else {
21592 Builder.SetInsertPoint(TheBB: VecI->getParent(),
21593 IP: std::next(x: VecI->getIterator()));
21594 }
21595 } else {
21596 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
21597 }
21598 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21599 // Required to update internally referenced instructions.
21600 if (Scalar != NewInst) {
21601 assert((!isa<ExtractElementInst>(Scalar) ||
21602 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
21603 "Extractelements should not be replaced.");
21604 Scalar->replaceAllUsesWith(V: NewInst);
21605 }
21606 continue;
21607 }
21608
21609 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
21610 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
21611 // Skip if the scalar is another vector op or Vec is not an instruction.
21612 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
21613 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
21614 if (!UsedInserts.insert(V: VU).second)
21615 continue;
21616 // Need to use original vector, if the root is truncated.
21617 auto BWIt = MinBWs.find(Val: E);
21618 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
21619 auto *ScalarTy = FTy->getElementType();
21620 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
21621 auto VecIt = VectorCasts.find(Val: Key);
21622 if (VecIt == VectorCasts.end()) {
21623 IRBuilderBase::InsertPointGuard Guard(Builder);
21624 if (auto *IVec = dyn_cast<PHINode>(Val: Vec)) {
21625 if (IVec->getParent()->isLandingPad())
21626 Builder.SetInsertPoint(TheBB: IVec->getParent(),
21627 IP: std::next(x: IVec->getParent()
21628 ->getLandingPadInst()
21629 ->getIterator()));
21630 else
21631 Builder.SetInsertPoint(
21632 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
21633 } else if (auto *IVec = dyn_cast<Instruction>(Val: Vec)) {
21634 Builder.SetInsertPoint(IVec->getNextNode());
21635 }
21636 Vec = Builder.CreateIntCast(
21637 V: Vec,
21638 DestTy: getWidenedType(
21639 ScalarTy,
21640 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
21641 isSigned: BWIt->second.second);
21642 VectorCasts.try_emplace(Key, Args&: Vec);
21643 } else {
21644 Vec = VecIt->second;
21645 }
21646 }
21647
21648 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
21649 if (InsertIdx) {
21650 auto *It = find_if(
21651 Range&: ShuffledInserts, P: [VU](const ShuffledInsertData<Value *> &Data) {
21652 // Checks if 2 insertelements are from the same buildvector.
21653 InsertElementInst *VecInsert = Data.InsertElements.front();
21654 return areTwoInsertFromSameBuildVector(
21655 VU, V: VecInsert,
21656 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
21657 });
21658 unsigned Idx = *InsertIdx;
21659 if (It == ShuffledInserts.end()) {
21660 (void)ShuffledInserts.emplace_back();
21661 It = std::next(x: ShuffledInserts.begin(),
21662 n: ShuffledInserts.size() - 1);
21663 }
21664 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
21665 if (Mask.empty())
21666 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
21667 Mask[Idx] = ExternalUse.Lane;
21668 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
21669 continue;
21670 }
21671 }
21672 }
21673 }
21674
21675 // Generate extracts for out-of-tree users.
21676 // Find the insertion point for the extractelement lane.
21677 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
21678 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
21679 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
21680 if (PH->getIncomingValue(i: I) == Scalar) {
21681 Instruction *IncomingTerminator =
21682 PH->getIncomingBlock(i: I)->getTerminator();
21683 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
21684 Builder.SetInsertPoint(TheBB: VecI->getParent(),
21685 IP: std::next(x: VecI->getIterator()));
21686 } else {
21687 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
21688 }
21689 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21690 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
21691 }
21692 }
21693 } else {
21694 Builder.SetInsertPoint(cast<Instruction>(Val: User));
21695 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21696 User->replaceUsesOfWith(From: Scalar, To: NewInst);
21697 }
21698 } else {
21699 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
21700 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21701 User->replaceUsesOfWith(From: Scalar, To: NewInst);
21702 }
21703
21704 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
21705 }
21706
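  // Creates a one- or two-source shuffle: splits the combined mask into
  // per-operand masks (indices < VF select from V1, the rest from V2) and
  // emits the shuffle via ShuffleInstructionBuilder.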
21707 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
21708 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
21709 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
21710 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
21711 for (int I = 0, E = Mask.size(); I < E; ++I) {
21712 if (Mask[I] < VF)
21713 CombinedMask1[I] = Mask[I];
21714 else
21715 CombinedMask2[I] = Mask[I] - VF;
21716 }
21717 ShuffleInstructionBuilder ShuffleBuilder(
21718 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
21719 ShuffleBuilder.add(V1, Mask: CombinedMask1);
21720 if (V2)
21721 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
21722 return ShuffleBuilder.finalize(ExtMask: {}, SubVectors: {}, SubVectorsMask: {});
21723 };
21724
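  // Resizes Vec to the vector factor expected by Mask. Returns the (possibly
  // shuffled) vector and a flag indicating whether the original mask has
  // already been fully applied.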
21725 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
21726 bool ForSingleMask) {
21727 unsigned VF = Mask.size();
21728 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
21729 if (VF != VecVF) {
21730 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
21731 Vec = CreateShuffle(Vec, nullptr, Mask);
21732 return std::make_pair(x&: Vec, y: true);
21733 }
21734 if (!ForSingleMask) {
21735 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
21736 for (unsigned I = 0; I < VF; ++I) {
21737 if (Mask[I] != PoisonMaskElem)
21738 ResizeMask[Mask[I]] = Mask[I];
21739 }
21740 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
21741 }
21742 }
21743
21744 return std::make_pair(x&: Vec, y: false);
21745 };
21746   // Perform shuffling of the vectorized tree entries for better handling of
21747   // external extracts.
21748 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
21749 // Find the first and the last instruction in the list of insertelements.
21750 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
21751 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
21752 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
21753 Builder.SetInsertPoint(LastInsert);
21754 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
21755 Value *NewInst = performExtractsShuffleAction<Value>(
21756 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
21757 Base: FirstInsert->getOperand(i_nocapture: 0),
21758 GetVF: [](Value *Vec) {
21759 return cast<VectorType>(Val: Vec->getType())
21760 ->getElementCount()
21761 .getKnownMinValue();
21762 },
21763 ResizeAction: ResizeToVF,
21764 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
21765 ArrayRef<Value *> Vals) {
21766 assert((Vals.size() == 1 || Vals.size() == 2) &&
21767 "Expected exactly 1 or 2 input values.");
21768 if (Vals.size() == 1) {
21769 // Do not create shuffle if the mask is a simple identity
21770 // non-resizing mask.
21771 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
21772 ->getNumElements() ||
21773 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
21774 return CreateShuffle(Vals.front(), nullptr, Mask);
21775 return Vals.front();
21776 }
21777 return CreateShuffle(Vals.front() ? Vals.front()
21778 : FirstInsert->getOperand(i_nocapture: 0),
21779 Vals.back(), Mask);
21780 });
21781 auto It = ShuffledInserts[I].InsertElements.rbegin();
21782 // Rebuild buildvector chain.
21783 InsertElementInst *II = nullptr;
21784 if (It != ShuffledInserts[I].InsertElements.rend())
21785 II = *It;
21786 SmallVector<Instruction *> Inserts;
21787 while (It != ShuffledInserts[I].InsertElements.rend()) {
21788 assert(II && "Must be an insertelement instruction.");
21789 if (*It == II)
21790 ++It;
21791 else
21792 Inserts.push_back(Elt: cast<Instruction>(Val: II));
21793 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
21794 }
21795 for (Instruction *II : reverse(C&: Inserts)) {
21796 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
21797 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
21798 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
21799 II->moveAfter(MovePos: NewI);
21800 NewInst = II;
21801 }
21802 LastInsert->replaceAllUsesWith(V: NewInst);
21803 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
21804 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
21805 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
21806 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
21807 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
21808 eraseInstruction(I: IE);
21809 }
21810 CSEBlocks.insert(V: LastInsert->getParent());
21811 }
21812
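  // Collect the scalar instructions that were vectorized so they can be erased
  // after the debug-info and reduction bookkeeping below.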
21813 SmallVector<Instruction *> RemovedInsts;
21814 // For each vectorized value:
21815 for (auto &TEPtr : VectorizableTree) {
21816 TreeEntry *Entry = TEPtr.get();
21817
21818 // No need to handle users of gathered values.
21819 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
21820 DeletedNodes.contains(Ptr: Entry) ||
21821 TransformedToGatherNodes.contains(Val: Entry))
21822 continue;
21823
21824 if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
21825 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap) {
21826 // Skip constant node
21827 if (!Entry->hasState()) {
21828 assert(allConstant(Entry->Scalars) && "Expected constants only.");
21829 continue;
21830 }
21831 for (Value *Scalar : Entry->Scalars) {
21832 auto *I = dyn_cast<Instruction>(Val: Scalar);
21833
21834 if (!I || Entry->isCopyableElement(V: I))
21835 continue;
21836 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *I << ".\n");
21837 RemovedInsts.push_back(Elt: I);
21838 }
21839 continue;
21840 }
21841
21842 assert(Entry->VectorizedValue && "Can't find vectorizable value");
21843
21844 // For each lane:
21845 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
21846 Value *Scalar = Entry->Scalars[Lane];
21847
21848 if (Entry->getOpcode() == Instruction::GetElementPtr &&
21849 !isa<GetElementPtrInst>(Val: Scalar))
21850 continue;
21851 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Scalar);
21852 EE && IgnoredExtracts.contains(V: EE))
21853 continue;
21854 if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
21855 continue;
21856#ifndef NDEBUG
21857 Type *Ty = Scalar->getType();
21858 if (!Ty->isVoidTy()) {
21859 for (User *U : Scalar->users()) {
21860 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
21861
21862 // It is legal to delete users in the ignorelist.
21863 assert((isVectorized(U) ||
21864 (UserIgnoreList && UserIgnoreList->contains(U)) ||
21865 (isa_and_nonnull<Instruction>(U) &&
21866 isDeleted(cast<Instruction>(U)))) &&
21867 "Deleting out-of-tree value");
21868 }
21869 }
21870#endif
21871 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
21872 auto *I = cast<Instruction>(Val: Scalar);
21873 RemovedInsts.push_back(Elt: I);
21874 }
21875 }
21876
21877 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
21878 // new vector instruction.
21879 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
21880 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
21881
21882 // Clear up reduction references, if any.
21883 if (UserIgnoreList) {
21884 for (Instruction *I : RemovedInsts) {
21885 const TreeEntry *IE = getTreeEntries(V: I).front();
21886 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(V: I);
21887 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
21888 IE = SplitEntries.front();
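      // Only scalars of the root entry (or scalars that feed the gathered or
      // split root) get their remaining uses replaced with poison; skip the
      // rest.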
21889 if (IE->Idx != 0 &&
21890 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
21891 (ValueToGatherNodes.lookup(Val: I).contains(
21892 key: VectorizableTree.front().get()) ||
21893 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
21894 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
21895 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
21896 IE->UserTreeIndex &&
21897 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
21898 !(GatheredLoadsEntriesFirst.has_value() &&
21899 IE->Idx >= *GatheredLoadsEntriesFirst &&
21900 VectorizableTree.front()->isGather() &&
21901 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
21902 !(!VectorizableTree.front()->isGather() &&
21903 VectorizableTree.front()->isCopyableElement(V: I)))
21904 continue;
21905 SmallVector<SelectInst *> LogicalOpSelects;
21906 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
21907       // Do not replace the condition operand of a logical op implemented as a select.
21908 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
21909 (match(V: U.getUser(), P: m_LogicalAnd()) ||
21910 match(V: U.getUser(), P: m_LogicalOr())) &&
21911 U.getOperandNo() == 0;
21912 if (IsPoisoningLogicalOp) {
21913 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
21914 return false;
21915 }
21916 return UserIgnoreList->contains(V: U.getUser());
21917 });
21918 // Replace conditions of the poisoning logical ops with the non-poison
21919 // constant value.
21920 for (SelectInst *SI : LogicalOpSelects)
21921 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
21922 }
21923 }
21924   // Retain the to-be-deleted instructions for some debug-info bookkeeping and
21925   // alias cache correctness.
21926   // NOTE: removeInstructionsAndOperands only marks the instructions for
21927   // deletion - instructions are not deleted until later.
21928 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts), VectorValuesAndScales);
21929
21930 Builder.ClearInsertionPoint();
21931 InstrElementSize.clear();
21932
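  // If the root entry was narrowed by the minimum-bitwidth analysis and its
  // element type differs from the reduction bit width, cast the final vector
  // back to the reduction bit width before returning it.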
21933 const TreeEntry &RootTE = *VectorizableTree.front();
21934 Value *Vec = RootTE.VectorizedValue;
21935 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
21936 It != MinBWs.end() &&
21937 ReductionBitWidth != It->second.first) {
21938 IRBuilder<>::InsertPointGuard Guard(Builder);
21939 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
21940 IP: ReductionRoot->getIterator());
21941 Vec = Builder.CreateIntCast(
21942 V: Vec,
21943 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
21944 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
21945 isSigned: It->second.second);
21946 }
21947 return Vec;
21948}
21949
21950void BoUpSLP::optimizeGatherSequence() {
21951   LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
21952                     << " gather sequence instructions.\n");
21953 // LICM InsertElementInst sequences.
21954 for (Instruction *I : GatherShuffleExtractSeq) {
21955 if (isDeleted(I))
21956 continue;
21957
21958 // Check if this block is inside a loop.
21959 Loop *L = LI->getLoopFor(BB: I->getParent());
21960 if (!L)
21961 continue;
21962
21963 // Check if it has a preheader.
21964 BasicBlock *PreHeader = L->getLoopPreheader();
21965 if (!PreHeader)
21966 continue;
21967
21968     // If the vector or the element that we insert into it are
21969     // instructions that are defined inside the loop then we can't
21970     // hoist this instruction.
21971 if (any_of(Range: I->operands(), P: [L](Value *V) {
21972 auto *OpI = dyn_cast<Instruction>(Val: V);
21973 return OpI && L->contains(Inst: OpI);
21974 }))
21975 continue;
21976
21977 // We can hoist this instruction. Move it to the pre-header.
21978 I->moveBefore(InsertPos: PreHeader->getTerminator()->getIterator());
21979 CSEBlocks.insert(V: PreHeader);
21980 }
21981
21982 // Make a list of all reachable blocks in our CSE queue.
21983 SmallVector<const DomTreeNode *, 8> CSEWorkList;
21984 CSEWorkList.reserve(N: CSEBlocks.size());
21985 for (BasicBlock *BB : CSEBlocks)
21986 if (DomTreeNode *N = DT->getNode(BB)) {
21987 assert(DT->isReachableFromEntry(N));
21988 CSEWorkList.push_back(Elt: N);
21989 }
21990
21991 // Sort blocks by domination. This ensures we visit a block after all blocks
21992 // dominating it are visited.
21993 llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
21994 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
21995 "Different nodes should have different DFS numbers");
21996 return A->getDFSNumIn() < B->getDFSNumIn();
21997 });
21998
21999   // Less defined shuffles can be replaced by more defined copies.
22000   // Of two shuffles, one is less defined if it has the same vector operands
22001   // and its mask indices are either the same as in the other one or undefs.
22002   // E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than
22003   // shuffle %0, poison, <0, 0, 0, 0>.
22004 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
22005 Instruction *I2,
22006 SmallVectorImpl<int> &NewMask) {
22007 if (I1->getType() != I2->getType())
22008 return false;
22009 auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
22010 auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
22011 if (!SI1 || !SI2)
22012 return I1->isIdenticalTo(I: I2);
22013 if (SI1->isIdenticalTo(I: SI2))
22014 return true;
22015 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
22016 if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
22017 return false;
22018 // Check if the second instruction is more defined than the first one.
22019 NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
22020 ArrayRef<int> SM1 = SI1->getShuffleMask();
22021 // Count trailing undefs in the mask to check the final number of used
22022 // registers.
22023 unsigned LastUndefsCnt = 0;
22024 for (int I = 0, E = NewMask.size(); I < E; ++I) {
22025 if (SM1[I] == PoisonMaskElem)
22026 ++LastUndefsCnt;
22027 else
22028 LastUndefsCnt = 0;
22029 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
22030 NewMask[I] != SM1[I])
22031 return false;
22032 if (NewMask[I] == PoisonMaskElem)
22033 NewMask[I] = SM1[I];
22034 }
22035 // Check if the last undefs actually change the final number of used vector
22036 // registers.
22037 return SM1.size() - LastUndefsCnt > 1 &&
22038 ::getNumberOfParts(TTI: *TTI, VecTy: SI1->getType()) ==
22039 ::getNumberOfParts(
22040 TTI: *TTI, VecTy: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
22041 VF: SM1.size() - LastUndefsCnt));
22042 };
22043 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
22044 // instructions. TODO: We can further optimize this scan if we split the
22045 // instructions into different buckets based on the insert lane.
22046 SmallVector<Instruction *, 16> Visited;
22047 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
22048 assert(*I &&
22049 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
22050 "Worklist not sorted properly!");
22051 BasicBlock *BB = (*I)->getBlock();
22052 // For all instructions in blocks containing gather sequences:
22053 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
22054 if (isDeleted(I: &In))
22055 continue;
22056 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
22057 !GatherShuffleExtractSeq.contains(key: &In))
22058 continue;
22059
22060 // Check if we can replace this instruction with any of the
22061 // visited instructions.
22062 bool Replaced = false;
22063 for (Instruction *&V : Visited) {
22064 SmallVector<int> NewMask;
22065 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
22066 DT->dominates(A: V->getParent(), B: In.getParent())) {
22067 In.replaceAllUsesWith(V);
22068 eraseInstruction(I: &In);
22069 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
22070 if (!NewMask.empty())
22071 SI->setShuffleMask(NewMask);
22072 Replaced = true;
22073 break;
22074 }
22075 if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
22076 GatherShuffleExtractSeq.contains(key: V) &&
22077 IsIdenticalOrLessDefined(V, &In, NewMask) &&
22078 DT->dominates(A: In.getParent(), B: V->getParent())) {
22079 In.moveAfter(MovePos: V);
22080 V->replaceAllUsesWith(V: &In);
22081 eraseInstruction(I: V);
22082 if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
22083 if (!NewMask.empty())
22084 SI->setShuffleMask(NewMask);
22085 V = &In;
22086 Replaced = true;
22087 break;
22088 }
22089 }
22090 if (!Replaced) {
22091 assert(!is_contained(Visited, &In));
22092 Visited.push_back(Elt: &In);
22093 }
22094 }
22095 }
22096 CSEBlocks.clear();
22097 GatherShuffleExtractSeq.clear();
22098}
22099
22100BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
22101 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
22102 auto &BundlePtr =
22103 ScheduledBundlesList.emplace_back(Args: std::make_unique<ScheduleBundle>());
22104 for (Value *V : VL) {
22105 if (S.isNonSchedulable(V))
22106 continue;
22107 auto *I = cast<Instruction>(Val: V);
22108 if (S.isCopyableElement(V)) {
22109 // Add a copyable element model.
22110 ScheduleCopyableData &SD =
22111 addScheduleCopyableData(EI, I, SchedulingRegionID, Bundle&: *BundlePtr);
22112 // Group the instructions to a bundle.
22113 BundlePtr->add(SD: &SD);
22114 continue;
22115 }
22116 ScheduleData *BundleMember = getScheduleData(V);
22117 assert(BundleMember && "no ScheduleData for bundle member "
22118 "(maybe not in same basic block)");
22119 // Group the instructions to a bundle.
22120 BundlePtr->add(SD: BundleMember);
22121 ScheduledBundles.try_emplace(Key: I).first->getSecond().push_back(
22122 Elt: BundlePtr.get());
22123 }
22124 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
22125 return *BundlePtr;
22126}
22127
22128 // Groups the instructions into a bundle (which is then a single scheduling
22129 // entity) and schedules instructions until the bundle gets ready.
22130std::optional<BoUpSLP::ScheduleBundle *>
22131BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
22132 const InstructionsState &S,
22133 const EdgeInfo &EI) {
22134 // No need to schedule PHIs, insertelement, extractelement and extractvalue
22135 // instructions.
22136 if (isa<PHINode>(Val: S.getMainOp()) ||
22137 isVectorLikeInstWithConstOps(V: S.getMainOp()))
22138 return nullptr;
22139   // If the parent node is non-schedulable, the current node is copyable, and
22140   // any of the parent's instructions are used in several basic blocks or in a
22141   // bin-op node, cancel scheduling: it may cause wrong def-use deps in the
22142   // analysis, leading to a crash.
22143   // Non-scheduled nodes may not have an associated ScheduleData model, which
22144   // may lead to skipped dep analysis.
22145 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22146 EI.UserTE->doesNotNeedToSchedule() &&
22147 EI.UserTE->getOpcode() != Instruction::PHI &&
22148 any_of(Range&: EI.UserTE->Scalars, P: [](Value *V) {
22149 auto *I = dyn_cast<Instruction>(Val: V);
22150 if (!I || I->hasOneUser())
22151 return false;
22152 for (User *U : I->users()) {
22153 auto *UI = cast<Instruction>(Val: U);
22154 if (isa<BinaryOperator>(Val: UI))
22155 return true;
22156 }
22157 return false;
22158 }))
22159 return std::nullopt;
22160 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22161 EI.UserTE->hasCopyableElements() &&
22162 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
22163 all_of(Range&: VL, P: [&](Value *V) {
22164 if (S.isCopyableElement(V))
22165 return true;
22166 return isUsedOutsideBlock(V);
22167 }))
22168 return std::nullopt;
22169   // If any instruction is used only outside the block and its operand is
22170   // placed immediately before it, do not schedule; it may create a wrong def-use chain.
22171 if (S.areInstructionsWithCopyableElements() && any_of(Range&: VL, P: [&](Value *V) {
22172 if (isa<PoisonValue>(Val: V) || S.isCopyableElement(V))
22173 return false;
22174 if (isUsedOutsideBlock(V)) {
22175 for (Value *Op : cast<Instruction>(Val: V)->operands()) {
22176 auto *I = dyn_cast<Instruction>(Val: Op);
22177 if (!I)
22178 continue;
22179 return SLP->isVectorized(V: I) && I->getNextNode() == V;
22180 }
22181 }
22182 return false;
22183 }))
22184 return std::nullopt;
22185 if (S.areInstructionsWithCopyableElements() && EI) {
22186 bool IsNonSchedulableWithParentPhiNode =
22187 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
22188 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
22189 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22190 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22191 if (IsNonSchedulableWithParentPhiNode) {
22192 SmallSet<std::pair<Value *, Value *>, 4> Values;
22193 for (const auto [Idx, V] :
22194 enumerate(First&: EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
22195 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
22196 OpIdx: EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
22197 auto *I = dyn_cast<Instruction>(Val: Op);
22198 if (!I || !isCommutative(I))
22199 continue;
22200 if (!Values.insert(V: std::make_pair(x&: V, y&: Op)).second)
22201 return std::nullopt;
22202 }
22203 } else {
22204       // If any of the parent's scalars requires scheduling, exit: there is a
22205       // complex dep between schedulable and non-schedulable parents.
22206 if (any_of(Range&: EI.UserTE->Scalars, P: [&](Value *V) {
22207 if (EI.UserTE->hasCopyableElements() &&
22208 EI.UserTE->isCopyableElement(V))
22209 return false;
22210 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
22211 return any_of(Range&: Entries, P: [](const TreeEntry *TE) {
22212 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
22213 TE->UserTreeIndex.UserTE->hasState() &&
22214 TE->UserTreeIndex.UserTE->State !=
22215 TreeEntry::SplitVectorize &&
22216 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22217 });
22218 }))
22219 return std::nullopt;
22220 }
22221 }
22222 bool HasCopyables = S.areInstructionsWithCopyableElements();
22223 if (((!HasCopyables && doesNotNeedToSchedule(VL)) ||
22224 all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); }))) {
22225     // If all operands were replaced by copyables, the operands of this node
22226     // might not be, so dependencies need to be recalculated for schedule data
22227     // that was replaced by copyable schedule data.
22228 SmallVector<ScheduleData *> ControlDependentMembers;
22229 for (Value *V : VL) {
22230 auto *I = dyn_cast<Instruction>(Val: V);
22231 if (!I || (HasCopyables && S.isCopyableElement(V)))
22232 continue;
22233 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22234 for (const Use &U : I->operands()) {
22235 unsigned &NumOps =
22236 UserOpToNumOps.try_emplace(Key: std::make_pair(x&: I, y: U.get()), Args: 0)
22237 .first->getSecond();
22238 ++NumOps;
22239 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
22240 Op && areAllOperandsReplacedByCopyableData(User: I, Op, SLP&: *SLP, NumOps)) {
22241 if (ScheduleData *OpSD = getScheduleData(I: Op);
22242 OpSD && OpSD->hasValidDependencies())
22243 // TODO: investigate how to improve it instead of early exiting.
22244 return std::nullopt;
22245 }
22246 }
22247 }
22248 return nullptr;
22249 }
22250
22251 // Initialize the instruction bundle.
22252 Instruction *OldScheduleEnd = ScheduleEnd;
22253 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
22254
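  // Recalculates dependencies for the bundle (clearing stale deps when copyable
  // elements are involved), optionally resets the schedule and refills the
  // ready list, and then schedules ready entities until this bundle itself
  // becomes ready.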
22255 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
22256 // Clear deps or recalculate the region, if the memory instruction is a
22257 // copyable. It may have memory deps, which must be recalculated.
22258 SmallVector<ScheduleData *> ControlDependentMembers;
22259 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
22260 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22261 for (ScheduleEntity *SE : Bundle.getBundle()) {
22262 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
22263 if (ScheduleData *BundleMember = getScheduleData(I: SD->getInst());
22264 BundleMember && BundleMember->hasValidDependencies()) {
22265 BundleMember->clearDirectDependencies();
22266 if (RegionHasStackSave ||
22267 !isGuaranteedToTransferExecutionToSuccessor(
22268 I: BundleMember->getInst()))
22269 ControlDependentMembers.push_back(Elt: BundleMember);
22270 }
22271 continue;
22272 }
22273 auto *SD = cast<ScheduleData>(Val: SE);
22274 if (SD->hasValidDependencies() &&
22275 (!S.areInstructionsWithCopyableElements() ||
22276 !S.isCopyableElement(V: SD->getInst())) &&
22277 !getScheduleCopyableData(I: SD->getInst()).empty() && EI.UserTE &&
22278 EI.UserTE->hasState() &&
22279 (!EI.UserTE->hasCopyableElements() ||
22280 !EI.UserTE->isCopyableElement(V: SD->getInst())))
22281 SD->clearDirectDependencies();
22282 for (const Use &U : SD->getInst()->operands()) {
22283 unsigned &NumOps =
22284 UserOpToNumOps
22285 .try_emplace(Key: std::make_pair(x: SD->getInst(), y: U.get()), Args: 0)
22286 .first->getSecond();
22287 ++NumOps;
22288 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
22289 Op && areAllOperandsReplacedByCopyableData(User: SD->getInst(), Op,
22290 SLP&: *SLP, NumOps)) {
22291 if (ScheduleData *OpSD = getScheduleData(I: Op);
22292 OpSD && OpSD->hasValidDependencies()) {
22293 OpSD->clearDirectDependencies();
22294 if (RegionHasStackSave ||
22295 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
22296 ControlDependentMembers.push_back(Elt: OpSD);
22297 }
22298 }
22299 }
22300 }
22301 };
22302 // The scheduling region got new instructions at the lower end (or it is a
22303 // new region for the first bundle). This makes it necessary to
22304 // recalculate all dependencies.
22305 // It is seldom that this needs to be done a second time after adding the
22306 // initial bundle to the region.
22307 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
22308 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
22309 if (BB != P.first->getParent())
22310 return;
22311 ScheduleData *SD = P.second;
22312 if (isInSchedulingRegion(SD: *SD))
22313 SD->clearDependencies();
22314 });
22315 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
22316 for_each(P.second, [&](ScheduleCopyableData *SD) {
22317 if (isInSchedulingRegion(SD: *SD))
22318 SD->clearDependencies();
22319 });
22320 });
22321 ReSchedule = true;
22322 }
22323     // Check if the bundle data already has deps for copyable elements. In that
22324     // case the deps need to be reset and recalculated.
22325 if (Bundle && !Bundle.getBundle().empty()) {
22326 if (S.areInstructionsWithCopyableElements() ||
22327 !ScheduleCopyableDataMap.empty())
22328 CheckIfNeedToClearDeps(Bundle);
22329 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
22330 << BB->getName() << "\n");
22331 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
22332 ControlDeps: ControlDependentMembers);
22333 } else if (!ControlDependentMembers.empty()) {
22334 ScheduleBundle Invalid = ScheduleBundle::invalid();
22335 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
22336 ControlDeps: ControlDependentMembers);
22337 }
22338
22339 if (ReSchedule) {
22340 resetSchedule();
22341 initialFillReadyList(ReadyList&: ReadyInsts);
22342 }
22343
22344     // Now try to schedule the new bundle or (if no bundle) just calculate
22345     // dependencies. Once the bundle is "ready", there are no cyclic
22346     // dependencies and we can schedule it. Note that it's important that we
22347     // don't "schedule" the bundle yet.
22348 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
22349 !ReadyInsts.empty()) {
22350 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
22351 assert(Picked->isReady() && "must be ready to schedule");
22352 schedule(R: *SLP, S, EI, Data: Picked, ReadyList&: ReadyInsts);
22353 if (Picked == &Bundle)
22354 break;
22355 }
22356 };
22357
22358 // Make sure that the scheduling region contains all
22359 // instructions of the bundle.
22360 for (Value *V : VL) {
22361 if (S.isNonSchedulable(V))
22362 continue;
22363 if (!extendSchedulingRegion(V, S)) {
22364       // The scheduling region got new instructions at the lower end (or it is
22365       // a new region for the first bundle), which makes it necessary to
22366       // recalculate all dependencies.
22367       // Otherwise the compiler may crash trying to incorrectly calculate
22368       // dependencies and emit instructions in the wrong order at the actual
22369       // scheduling.
22370 ScheduleBundle Invalid = ScheduleBundle::invalid();
22371 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
22372 return std::nullopt;
22373 }
22374 }
22375
22376 bool ReSchedule = false;
22377 for (Value *V : VL) {
22378 if (S.isNonSchedulable(V))
22379 continue;
22380 SmallVector<ScheduleCopyableData *> CopyableData =
22381 getScheduleCopyableData(I: cast<Instruction>(Val: V));
22382 if (!CopyableData.empty()) {
22383 for (ScheduleCopyableData *SD : CopyableData)
22384 ReadyInsts.remove(X: SD);
22385 }
22386 ScheduleData *BundleMember = getScheduleData(V);
22387 assert((BundleMember || S.isCopyableElement(V)) &&
22388 "no ScheduleData for bundle member (maybe not in same basic block)");
22389 if (!BundleMember)
22390 continue;
22391
22392     // Make sure we don't leave the pieces of the bundle in the ready list when
22393     // the whole bundle might not be ready.
22394 ReadyInsts.remove(X: BundleMember);
22395 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
22396 !Bundles.empty()) {
22397 for (ScheduleBundle *B : Bundles)
22398 ReadyInsts.remove(X: B);
22399 }
22400
22401 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
22402 continue;
22403     // A bundle member was scheduled as a single instruction before and now
22404     // needs to be scheduled as part of the bundle. We just get rid of the
22405     // existing schedule.
22406     // A bundle member may also have had its deps calculated before it became a
22407     // copyable element - in that case we need to reschedule.
22408 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
22409 << " was already scheduled\n");
22410 ReSchedule = true;
22411 }
22412
22413 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
22414 TryScheduleBundleImpl(ReSchedule, Bundle);
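  // Scheduling the bundle failed: return its still-ready members to the ready
  // list, drop the bundle, and roll back the copyable-data bookkeeping before
  // giving up.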
22415 if (!Bundle.isReady()) {
22416 for (ScheduleEntity *BD : Bundle.getBundle()) {
22417       // Copyable data is simply removed from scheduling.
22418 if (isa<ScheduleCopyableData>(Val: BD))
22419 continue;
22420 if (BD->isReady()) {
22421 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: BD->getInst());
22422 if (Bundles.empty()) {
22423 ReadyInsts.insert(X: BD);
22424 continue;
22425 }
22426 for (ScheduleBundle *B : Bundles)
22427 if (B->isReady())
22428 ReadyInsts.insert(X: B);
22429 }
22430 }
22431 ScheduledBundlesList.pop_back();
22432 SmallVector<ScheduleData *> ControlDependentMembers;
22433 for (Value *V : VL) {
22434 if (S.isNonSchedulable(V))
22435 continue;
22436 auto *I = cast<Instruction>(Val: V);
22437 if (S.isCopyableElement(V: I)) {
22438 // Remove the copyable data from the scheduling region and restore
22439 // previous mappings.
22440 auto KV = std::make_pair(x: EI, y&: I);
22441 assert(ScheduleCopyableDataMap.contains(KV) &&
22442 "no ScheduleCopyableData for copyable element");
22443 ScheduleCopyableData *SD =
22444 ScheduleCopyableDataMapByInst.find(Val: I)->getSecond().pop_back_val();
22445 ScheduleCopyableDataMapByUsers[I].remove(X: SD);
22446 if (EI.UserTE) {
22447 ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
22448 const auto *It = find(Range&: Op, Val: I);
22449 assert(It != Op.end() && "Lane not set");
22450 SmallPtrSet<Instruction *, 4> Visited;
22451 do {
22452 int Lane = std::distance(first: Op.begin(), last: It);
22453 assert(Lane >= 0 && "Lane not set");
22454 if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
22455 !EI.UserTE->ReorderIndices.empty())
22456 Lane = EI.UserTE->ReorderIndices[Lane];
22457 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22458 "Couldn't find extract lane");
22459 auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
22460 if (!Visited.insert(Ptr: In).second) {
22461 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
22462 break;
22463 }
22464 ScheduleCopyableDataMapByInstUser
22465 [std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I)]
22466 .pop_back();
22467 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
22468 } while (It != Op.end());
22469 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
22470 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(EI: UserEI, V: I))
22471 ScheduleCopyableDataMapByUsers[I].insert(X: UserCD);
22472 }
22473 if (ScheduleCopyableDataMapByUsers[I].empty())
22474 ScheduleCopyableDataMapByUsers.erase(Val: I);
22475 ScheduleCopyableDataMap.erase(Val: KV);
22476 // Need to recalculate dependencies for the actual schedule data.
22477 if (ScheduleData *OpSD = getScheduleData(I);
22478 OpSD && OpSD->hasValidDependencies()) {
22479 OpSD->clearDirectDependencies();
22480 if (RegionHasStackSave ||
22481 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
22482 ControlDependentMembers.push_back(Elt: OpSD);
22483 }
22484 continue;
22485 }
22486 ScheduledBundles.find(Val: I)->getSecond().pop_back();
22487 }
22488 if (!ControlDependentMembers.empty()) {
22489 ScheduleBundle Invalid = ScheduleBundle::invalid();
22490 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/false, SLP,
22491 ControlDeps: ControlDependentMembers);
22492 }
22493 return std::nullopt;
22494 }
22495 return &Bundle;
22496}
22497
22498BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22499 // Allocate a new ScheduleData for the instruction.
22500 if (ChunkPos >= ChunkSize) {
22501 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
22502 ChunkPos = 0;
22503 }
22504 return &(ScheduleDataChunks.back()[ChunkPos++]);
22505}
22506
22507bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
22508 Value *V, const InstructionsState &S) {
22509 Instruction *I = dyn_cast<Instruction>(Val: V);
22510 assert(I && "bundle member must be an instruction");
22511 if (getScheduleData(I))
22512 return true;
22513 if (!ScheduleStart) {
22514 // It's the first instruction in the new region.
22515 initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
22516 ScheduleStart = I;
22517 ScheduleEnd = I->getNextNode();
22518 assert(ScheduleEnd && "tried to vectorize a terminator?");
22519 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
22520 return true;
22521 }
22522 // Search up and down at the same time, because we don't know if the new
22523 // instruction is above or below the existing scheduling region.
22524   // Ignore debug info (and other "AssumeLike" intrinsics) so that it's not
22525   // counted against the budget. Otherwise debug info could affect codegen.
22526 BasicBlock::reverse_iterator UpIter =
22527 ++ScheduleStart->getIterator().getReverse();
22528 BasicBlock::reverse_iterator UpperEnd = BB->rend();
22529 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
22530 BasicBlock::iterator LowerEnd = BB->end();
22531 auto IsAssumeLikeIntr = [](const Instruction &I) {
22532 if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
22533 return II->isAssumeLikeIntrinsic();
22534 return false;
22535 };
22536 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
22537 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
22538 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
22539 &*DownIter != I) {
22540 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
22541 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
22542 return false;
22543 }
22544
22545 ++UpIter;
22546 ++DownIter;
22547
22548 UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
22549 DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
22550 }
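  // The new instruction lies above the current region (the upward search
  // reached it, or the downward search ran off the end of the block); extend
  // the region start upwards to include it.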
22551 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
22552 assert(I->getParent() == ScheduleStart->getParent() &&
22553 "Instruction is in wrong basic block.");
22554 initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
22555 ScheduleStart = I;
22556 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
22557 << "\n");
22558 return true;
22559 }
22560 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
22561 "Expected to reach top of the basic block or instruction down the "
22562 "lower end.");
22563 assert(I->getParent() == ScheduleEnd->getParent() &&
22564 "Instruction is in wrong basic block.");
22565 initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
22566 NextLoadStore: nullptr);
22567 ScheduleEnd = I->getNextNode();
22568 assert(ScheduleEnd && "tried to vectorize a terminator?");
22569 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
22570 return true;
22571}
22572
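// Allocates or reinitializes ScheduleData for every instruction in
// [FromI, ToI) and links memory-accessing instructions into the region's
// load/store chain.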
22573void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
22574 Instruction *ToI,
22575 ScheduleData *PrevLoadStore,
22576 ScheduleData *NextLoadStore) {
22577 ScheduleData *CurrentLoadStore = PrevLoadStore;
22578 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
22579 // No need to allocate data for non-schedulable instructions.
22580 if (isa<PHINode>(Val: I))
22581 continue;
22582 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
22583 if (!SD) {
22584 SD = allocateScheduleDataChunks();
22585 ScheduleDataMap[I] = SD;
22586 }
22587 assert(!isInSchedulingRegion(*SD) &&
22588 "new ScheduleData already in scheduling region");
22589 SD->init(BlockSchedulingRegionID: SchedulingRegionID, I);
22590
22591 auto CanIgnoreLoad = [](const Instruction *I) {
22592 const auto *LI = dyn_cast<LoadInst>(Val: I);
22593 // If there is a simple load marked as invariant, we can ignore it.
22594 // But, in the (unlikely) case of non-simple invariant load,
22595 // we should not ignore it.
22596 return LI && LI->isSimple() &&
22597 LI->getMetadata(KindID: LLVMContext::MD_invariant_load);
22598 };
22599
22600 if (I->mayReadOrWriteMemory() &&
22601 // Simple InvariantLoad does not depend on other memory accesses.
22602 !CanIgnoreLoad(I) &&
22603 (!isa<IntrinsicInst>(Val: I) ||
22604 (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
22605 cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
22606 Intrinsic::pseudoprobe))) {
22607 // Update the linked list of memory accessing instructions.
22608 if (CurrentLoadStore) {
22609 CurrentLoadStore->setNextLoadStore(SD);
22610 } else {
22611 FirstLoadStoreInRegion = SD;
22612 }
22613 CurrentLoadStore = SD;
22614 }
22615
22616 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
22617 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
22618 RegionHasStackSave = true;
22619 }
22620 if (NextLoadStore) {
22621 if (CurrentLoadStore)
22622 CurrentLoadStore->setNextLoadStore(NextLoadStore);
22623 } else {
22624 LastLoadStoreInRegion = CurrentLoadStore;
22625 }
22626}
22627
22628void BoUpSLP::BlockScheduling::calculateDependencies(
22629 ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
22630 ArrayRef<ScheduleData *> ControlDeps) {
22631 SmallVector<ScheduleEntity *> WorkList;
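  // Computes the def-use, control, and memory dependencies for a single
  // schedule entity (either a ScheduleCopyableData or a regular ScheduleData)
  // and pushes newly discovered dependencies onto the worklist.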
22632 auto ProcessNode = [&](ScheduleEntity *SE) {
22633 if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
22634 if (CD->hasValidDependencies())
22635 return;
22636 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
22637 CD->initDependencies();
22638 CD->resetUnscheduledDeps();
22639 const EdgeInfo &EI = CD->getEdgeInfo();
22640 if (EI.UserTE) {
22641 ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
22642 const auto *It = find(Range&: Op, Val: CD->getInst());
22643 assert(It != Op.end() && "Lane not set");
22644 SmallPtrSet<Instruction *, 4> Visited;
22645 do {
22646 int Lane = std::distance(first: Op.begin(), last: It);
22647 assert(Lane >= 0 && "Lane not set");
22648 if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
22649 !EI.UserTE->ReorderIndices.empty())
22650 Lane = EI.UserTE->ReorderIndices[Lane];
22651 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22652 "Couldn't find extract lane");
22653 auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
22654 if (EI.UserTE->isCopyableElement(V: In)) {
22655           // We may not have related copyable scheduling data, if the
22656           // instruction is non-schedulable.
22657 if (ScheduleCopyableData *UseSD =
22658 getScheduleCopyableData(EI: EI.UserTE->UserTreeIndex, V: In)) {
22659 CD->incDependencies();
22660 if (!UseSD->isScheduled())
22661 CD->incrementUnscheduledDeps(Incr: 1);
22662 if (!UseSD->hasValidDependencies() ||
22663 (InsertInReadyList && UseSD->isReady()))
22664 WorkList.push_back(Elt: UseSD);
22665 }
22666 } else if (Visited.insert(Ptr: In).second) {
22667 if (ScheduleData *UseSD = getScheduleData(I: In)) {
22668 CD->incDependencies();
22669 if (!UseSD->isScheduled())
22670 CD->incrementUnscheduledDeps(Incr: 1);
22671 if (!UseSD->hasValidDependencies() ||
22672 (InsertInReadyList && UseSD->isReady()))
22673 WorkList.push_back(Elt: UseSD);
22674 }
22675 }
22676 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: CD->getInst());
22677 } while (It != Op.end());
22678 if (CD->isReady() && CD->getDependencies() == 0 &&
22679 (EI.UserTE->hasState() &&
22680 (EI.UserTE->getMainOp()->getParent() !=
22681 CD->getInst()->getParent() ||
22682 (isa<PHINode>(Val: EI.UserTE->getMainOp()) &&
22683 (EI.UserTE->getMainOp()->hasNUsesOrMore(N: UsesLimit) ||
22684 any_of(Range: EI.UserTE->getMainOp()->users(), P: [&](User *U) {
22685 auto *IU = dyn_cast<Instruction>(Val: U);
22686 if (!IU)
22687 return true;
22688 return IU->getParent() == EI.UserTE->getMainOp()->getParent();
22689 })))))) {
22690         // If there are no uses in the block, mark the node as having a
22691         // pseudo-use, which can never be scheduled.
22692         // This prevents incorrect def-use tracking between an external user
22693         // and the actual instruction.
22694 CD->incDependencies();
22695 CD->incrementUnscheduledDeps(Incr: 1);
22696 }
22697 }
22698 return;
22699 }
22700 auto *BundleMember = cast<ScheduleData>(Val: SE);
22701 if (BundleMember->hasValidDependencies())
22702 return;
22703 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
22704 BundleMember->initDependencies();
22705 BundleMember->resetUnscheduledDeps();
22706 // Handle def-use chain dependencies.
22707 SmallDenseMap<Value *, unsigned> UserToNumOps;
22708 for (User *U : BundleMember->getInst()->users()) {
22709 if (isa<PHINode>(Val: U))
22710 continue;
22711 if (ScheduleData *UseSD = getScheduleData(V: U)) {
22712 // The operand is a copyable element - skip.
22713 unsigned &NumOps = UserToNumOps.try_emplace(Key: U, Args: 0).first->getSecond();
22714 ++NumOps;
22715 if (areAllOperandsReplacedByCopyableData(
22716 User: cast<Instruction>(Val: U), Op: BundleMember->getInst(), SLP&: *SLP, NumOps))
22717 continue;
22718 BundleMember->incDependencies();
22719 if (!UseSD->isScheduled())
22720 BundleMember->incrementUnscheduledDeps(Incr: 1);
22721 if (!UseSD->hasValidDependencies() ||
22722 (InsertInReadyList && UseSD->isReady()))
22723 WorkList.push_back(Elt: UseSD);
22724 }
22725 }
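    // Copyable models that use this instruction also contribute def-use
    // dependencies.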
22726 for (ScheduleCopyableData *UseSD :
22727 getScheduleCopyableDataUsers(User: BundleMember->getInst())) {
22728 BundleMember->incDependencies();
22729 if (!UseSD->isScheduled())
22730 BundleMember->incrementUnscheduledDeps(Incr: 1);
22731 if (!UseSD->hasValidDependencies() ||
22732 (InsertInReadyList && UseSD->isReady()))
22733 WorkList.push_back(Elt: UseSD);
22734 }
22735
22736 SmallPtrSet<const Instruction *, 4> Visited;
22737 auto MakeControlDependent = [&](Instruction *I) {
22738 // Do not mark control dependent twice.
22739 if (!Visited.insert(Ptr: I).second)
22740 return;
22741 auto *DepDest = getScheduleData(I);
22742 assert(DepDest && "must be in schedule window");
22743 DepDest->addControlDependency(Dep: BundleMember);
22744 BundleMember->incDependencies();
22745 if (!DepDest->isScheduled())
22746 BundleMember->incrementUnscheduledDeps(Incr: 1);
22747 if (!DepDest->hasValidDependencies() ||
22748 (InsertInReadyList && DepDest->isReady()))
22749 WorkList.push_back(Elt: DepDest);
22750 };
22751
22752     // Any instruction which isn't safe to speculate at the beginning of the
22753     // block is control dependent on any early exit or non-willreturn call
22754     // which precedes it.
22755 if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->getInst())) {
22756 for (Instruction *I = BundleMember->getInst()->getNextNode();
22757 I != ScheduleEnd; I = I->getNextNode()) {
22758 if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
22759 continue;
22760
22761 // Add the dependency
22762 MakeControlDependent(I);
22763
22764 if (!isGuaranteedToTransferExecutionToSuccessor(I))
22765 // Everything past here must be control dependent on I.
22766 break;
22767 }
22768 }
22769
22770 if (RegionHasStackSave) {
22771       // If we have an inalloca alloca instruction, it needs to be scheduled
22772       // after any preceding stacksave. We also need to prevent any alloca
22773       // from reordering above a preceding stackrestore.
22774 if (match(V: BundleMember->getInst(), P: m_Intrinsic<Intrinsic::stacksave>()) ||
22775 match(V: BundleMember->getInst(),
22776 P: m_Intrinsic<Intrinsic::stackrestore>())) {
22777 for (Instruction *I = BundleMember->getInst()->getNextNode();
22778 I != ScheduleEnd; I = I->getNextNode()) {
22779 if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
22780 match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
22781             // Any allocas past here must be control dependent on I, and I
22782             // must be memory dependent on BundleMember->Inst.
22783 break;
22784
22785 if (!isa<AllocaInst>(Val: I))
22786 continue;
22787
22788 // Add the dependency
22789 MakeControlDependent(I);
22790 }
22791 }
22792
22793       // In addition to the cases handled just above, we need to prevent
22794       // allocas and loads/stores from moving below a stacksave or a
22795       // stackrestore. Preventing allocas from moving below a stackrestore is
22796       // currently believed to be merely conservative; moving loads/stores
22797       // below a stackrestore can lead to incorrect code.
22798 if (isa<AllocaInst>(Val: BundleMember->getInst()) ||
22799 BundleMember->getInst()->mayReadOrWriteMemory()) {
22800 for (Instruction *I = BundleMember->getInst()->getNextNode();
22801 I != ScheduleEnd; I = I->getNextNode()) {
22802 if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
22803 !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
22804 continue;
22805
22806 // Add the dependency
22807 MakeControlDependent(I);
22808 break;
22809 }
22810 }
22811 }
22812
22813 // Handle the memory dependencies (if any).
22814 ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
22815 if (!NextLoadStore)
22816 return;
22817 Instruction *SrcInst = BundleMember->getInst();
22818     assert(SrcInst->mayReadOrWriteMemory() &&
22819            "NextLoadStore list for a non-memory-affecting bundle?");
22820 MemoryLocation SrcLoc = getLocation(I: SrcInst);
22821 bool SrcMayWrite = SrcInst->mayWriteToMemory();
22822 unsigned NumAliased = 0;
22823 unsigned DistToSrc = 1;
22824 bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(I: SrcInst);
22825
22826 for (ScheduleData *DepDest = NextLoadStore; DepDest;
22827 DepDest = DepDest->getNextLoadStore()) {
22828 assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");
22829
22830 // We have two limits to reduce the complexity:
22831 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
22832 // SLP->isAliased (which is the expensive part in this loop).
22833 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
22834 // the whole loop (even if the loop is fast, it's quadratic).
22835 // It's important for the loop break condition (see below) to
22836 // check this limit even between two read-only instructions.
22837 if (DistToSrc >= MaxMemDepDistance ||
22838 ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
22839 (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
22840 SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->getInst())))) {
22841
22842 // We increment the counter only if the locations are aliased
22843 // (instead of counting all alias checks). This gives a better
22844 // balance between reduced runtime and accurate dependencies.
22845 NumAliased++;
22846
22847 DepDest->addMemoryDependency(Dep: BundleMember);
22848 BundleMember->incDependencies();
22849 if (!DepDest->isScheduled())
22850 BundleMember->incrementUnscheduledDeps(Incr: 1);
22851 if (!DepDest->hasValidDependencies() ||
22852 (InsertInReadyList && DepDest->isReady()))
22853 WorkList.push_back(Elt: DepDest);
22854 }
22855
22856 // Example, explaining the loop break condition: Let's assume our
22857 // starting instruction is i0 and MaxMemDepDistance = 3.
22858 //
22859 // +--------v--v--v
22860 // i0,i1,i2,i3,i4,i5,i6,i7,i8
22861 // +--------^--^--^
22862 //
22863 // MaxMemDepDistance let us stop alias-checking at i3 and we add
22864 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
22865 // Previously we already added dependencies from i3 to i6,i7,i8
22866 // (because of MaxMemDepDistance). As we added a dependency from
22867 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
22868 // and we can abort this loop at i6.
22869 if (DistToSrc >= 2 * MaxMemDepDistance)
22870 break;
22871 DistToSrc++;
22872 }
22873 };
22874
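  // Seed the worklist with the bundle's first member and any extra
  // control-dependent members, then keep processing entities until all
  // transitively reached dependencies are valid.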
22875 assert((Bundle || !ControlDeps.empty()) &&
22876 "expected at least one instruction to schedule");
22877 if (Bundle)
22878 WorkList.push_back(Elt: Bundle.getBundle().front());
22879 WorkList.append(in_start: ControlDeps.begin(), in_end: ControlDeps.end());
22880 SmallPtrSet<ScheduleBundle *, 16> Visited;
22881 while (!WorkList.empty()) {
22882 ScheduleEntity *SD = WorkList.pop_back_val();
22883 SmallVector<ScheduleBundle *, 1> CopyableBundle;
22884 ArrayRef<ScheduleBundle *> Bundles;
22885 if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SD)) {
22886 CopyableBundle.push_back(Elt: &CD->getBundle());
22887 Bundles = CopyableBundle;
22888 } else {
22889 Bundles = getScheduleBundles(V: SD->getInst());
22890 }
22891 if (Bundles.empty()) {
22892 if (!SD->hasValidDependencies())
22893 ProcessNode(SD);
22894 if (InsertInReadyList && SD->isReady()) {
22895 ReadyInsts.insert(X: SD);
22896 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
22897 }
22898 continue;
22899 }
22900 for (ScheduleBundle *Bundle : Bundles) {
22901 if (Bundle->hasValidDependencies() || !Visited.insert(Ptr: Bundle).second)
22902 continue;
22903 assert(isInSchedulingRegion(*Bundle) &&
22904 "ScheduleData not in scheduling region");
22905 for_each(Range: Bundle->getBundle(), F: ProcessNode);
22906 }
22907 if (InsertInReadyList && SD->isReady()) {
22908 for (ScheduleBundle *Bundle : Bundles) {
22909 assert(isInSchedulingRegion(*Bundle) &&
22910 "ScheduleData not in scheduling region");
22911 if (!Bundle->isReady())
22912 continue;
22913 ReadyInsts.insert(X: Bundle);
22914 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
22915 << "\n");
22916 }
22917 }
22918 }
22919}
22920
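// Clears the scheduled flag and unscheduled-dependency counters for all
// schedule data, copyable data, and bundles in the current region, and empties
// the ready list.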
22921void BoUpSLP::BlockScheduling::resetSchedule() {
22922 assert(ScheduleStart &&
22923 "tried to reset schedule on block which has not been scheduled");
22924 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
22925 if (BB != P.first->getParent())
22926 return;
22927 ScheduleData *SD = P.second;
22928 if (isInSchedulingRegion(SD: *SD)) {
22929 SD->setScheduled(/*Scheduled=*/false);
22930 SD->resetUnscheduledDeps();
22931 }
22932 });
22933 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
22934 for_each(P.second, [&](ScheduleCopyableData *SD) {
22935 if (isInSchedulingRegion(SD: *SD)) {
22936 SD->setScheduled(/*Scheduled=*/false);
22937 SD->resetUnscheduledDeps();
22938 }
22939 });
22940 });
22941 for_each(Range&: ScheduledBundles, F: [&](auto &P) {
22942 for_each(P.second, [&](ScheduleBundle *Bundle) {
22943 if (isInSchedulingRegion(SD: *Bundle))
22944 Bundle->setScheduled(/*Scheduled=*/false);
22945 });
22946 });
22947 // Reset schedule data for copyable elements.
22948 for (auto &P : ScheduleCopyableDataMap) {
22949 if (isInSchedulingRegion(SD: *P.second)) {
22950 P.second->setScheduled(/*Scheduled=*/false);
22951 P.second->resetUnscheduledDeps();
22952 }
22953 }
22954 ReadyInsts.clear();
22955}
22956
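// Performs the real scheduling for the block's region: recomputes dependencies
// where needed, then repeatedly picks the ready entity closest to the original
// instruction order and moves its instructions into place.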
22957void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
22958 if (!BS->ScheduleStart)
22959 return;
22960
22961 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
22962
22963 // A key point - if we got here, pre-scheduling was able to find a valid
22964 // scheduling of the sub-graph of the scheduling window which consists
22965 // of all vector bundles and their transitive users. As such, we do not
22966 // need to reschedule anything *outside of* that subgraph.
22967
22968 BS->resetSchedule();
22969
22970 // For the real scheduling we use a more sophisticated ready-list: it is
22971 // sorted by the original instruction location. This lets the final schedule
22972 // be as close as possible to the original instruction order.
22973 // WARNING: If changing this order causes a correctness issue, that means
22974 // there is some missing dependence edge in the schedule data graph.
22975 struct ScheduleDataCompare {
22976 bool operator()(const ScheduleEntity *SD1,
22977 const ScheduleEntity *SD2) const {
22978 return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
22979 }
22980 };
22981 std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;
22982
22983 // Ensure that all dependency data is updated (for nodes in the sub-graph)
22984 // and fill the ready-list with initial instructions.
22985 int Idx = 0;
22986 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
22987 I = I->getNextNode()) {
22988 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(V: I);
22989 if (!Bundles.empty()) {
22990 for (ScheduleBundle *Bundle : Bundles) {
22991 Bundle->setSchedulingPriority(Idx++);
22992 if (!Bundle->hasValidDependencies())
22993 BS->calculateDependencies(Bundle&: *Bundle, /*InsertInReadyList=*/false, SLP: this);
22994 }
22995 SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
22996 for (ScheduleCopyableData *SD : reverse(C&: SDs)) {
22997 ScheduleBundle &Bundle = SD->getBundle();
22998 Bundle.setSchedulingPriority(Idx++);
22999 if (!Bundle.hasValidDependencies())
23000 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
23001 }
23002 continue;
23003 }
23004 SmallVector<ScheduleCopyableData *> CopyableData =
23005 BS->getScheduleCopyableDataUsers(User: I);
23006 if (ScheduleData *SD = BS->getScheduleData(I)) {
23007 [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(V: I);
23008 assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
23009 SDTEs.front()->doesNotNeedToSchedule() ||
23010 doesNotNeedToBeScheduled(I)) &&
23011 "scheduler and vectorizer bundle mismatch");
23012 SD->setSchedulingPriority(Idx++);
23013 if (!SD->hasValidDependencies() &&
23014 (!CopyableData.empty() ||
23015 any_of(Range: R.ValueToGatherNodes.lookup(Val: I), P: [&](const TreeEntry *TE) {
23016 assert(TE->isGather() && "expected gather node");
23017 return TE->hasState() && TE->hasCopyableElements() &&
23018 TE->isCopyableElement(V: I);
23019 }))) {
23020 // Need to calculate deps for these nodes to correctly handle copyable
23021 // dependencies, even if they were cancelled.
23022 // If the copyable bundle was cancelled, its deps were cleared and must
23023 // be recalculated.
23024 ScheduleBundle Bundle;
23025 Bundle.add(SD);
23026 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
23027 }
23028 }
23029 for (ScheduleCopyableData *SD : reverse(C&: CopyableData)) {
23030 ScheduleBundle &Bundle = SD->getBundle();
23031 Bundle.setSchedulingPriority(Idx++);
23032 if (!Bundle.hasValidDependencies())
23033 BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
23034 }
23035 }
23036 BS->initialFillReadyList(ReadyList&: ReadyInsts);
23037
23038 Instruction *LastScheduledInst = BS->ScheduleEnd;
23039
23040 // Do the "real" scheduling.
23041 SmallPtrSet<Instruction *, 16> Scheduled;
23042 while (!ReadyInsts.empty()) {
23043 auto *Picked = *ReadyInsts.begin();
23044 ReadyInsts.erase(position: ReadyInsts.begin());
23045
23046 // Move the scheduled instruction(s) to their dedicated places, if not
23047 // there yet.
23048 if (auto *Bundle = dyn_cast<ScheduleBundle>(Val: Picked)) {
23049 for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
23050 Instruction *PickedInst = BundleMember->getInst();
23051 // If a copyable must be scheduled as part of something else, skip it.
23052 bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(V: PickedInst);
23053 if ((IsCopyable && BS->getScheduleData(I: PickedInst)) ||
23054 (!IsCopyable && !Scheduled.insert(Ptr: PickedInst).second))
23055 continue;
23056 if (PickedInst->getNextNode() != LastScheduledInst)
23057 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
23058 LastScheduledInst = PickedInst;
23059 }
23060 EntryToLastInstruction.try_emplace(Key: Bundle->getTreeEntry(),
23061 Args&: LastScheduledInst);
23062 } else {
23063 auto *SD = cast<ScheduleData>(Val: Picked);
23064 Instruction *PickedInst = SD->getInst();
23065 if (PickedInst->getNextNode() != LastScheduledInst)
23066 PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
23067 LastScheduledInst = PickedInst;
23068 }
23069 auto Invalid = InstructionsState::invalid();
23070 BS->schedule(R, S: Invalid, EI: EdgeInfo(), Data: Picked, ReadyList&: ReadyInsts);
23071 }
23072
23073 // Check that we didn't break any of our invariants.
23074#ifdef EXPENSIVE_CHECKS
23075 BS->verify();
23076#endif
23077
23078#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
23079 // Check that all schedulable entities got scheduled
23080 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
23081 I = I->getNextNode()) {
23082 ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
23083 assert(all_of(Bundles,
23084 [](const ScheduleBundle *Bundle) {
23085 return Bundle->isScheduled();
23086 }) &&
23087 "must be scheduled at this point");
23088 }
23089#endif
23090
23091 // Avoid duplicate scheduling of the block.
23092 BS->ScheduleStart = nullptr;
23093}
23094
23095unsigned BoUpSLP::getVectorElementSize(Value *V) {
23096 // If V is a store, just return the width of the stored value (or value
23097 // truncated just before storing) without traversing the expression tree.
23098 // This is the common case.
23099 if (auto *Store = dyn_cast<StoreInst>(Val: V))
23100 return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());
23101
23102 if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
23103 return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));
23104
23105 auto E = InstrElementSize.find(Val: V);
23106 if (E != InstrElementSize.end())
23107 return E->second;
23108
23109 // If V is not a store, we can traverse the expression tree to find loads
23110 // that feed it. The type of the loaded value may indicate a more suitable
23111 // width than V's type. We want to base the vector element size on the width
23112 // of memory operations where possible.
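// For example (illustrative, assuming everything is in one basic block): if
// V is an i64 add whose operands are zext i32 -> i64 of i32 loads, the
// traversal below reaches the loads and bases the element size on 32 bits
// rather than on V's i64 type:
//   %a = load i32, ptr %p
//   %b = zext i32 %a to i64
//   %v = add i64 %b, %c        ; getVectorElementSize(%v) == 32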
23113 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
23114 SmallPtrSet<Instruction *, 16> Visited;
23115 if (auto *I = dyn_cast<Instruction>(Val: V)) {
23116 Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
23117 Visited.insert(Ptr: I);
23118 }
23119
23120 // Traverse the expression tree in bottom-up order looking for loads. If we
23121 // encounter an instruction we don't yet handle, we give up.
23122 auto Width = 0u;
23123 Value *FirstNonBool = nullptr;
23124 while (!Worklist.empty()) {
23125 auto [I, Parent, Level] = Worklist.pop_back_val();
23126
23127 // We should only be looking at scalar instructions here. If the current
23128 // instruction has a vector type, skip.
23129 auto *Ty = I->getType();
23130 if (isa<VectorType>(Val: Ty))
23131 continue;
23132 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
23133 FirstNonBool = I;
23134 if (Level > RecursionMaxDepth)
23135 continue;
23136
23137 // If the current instruction is a load, extractelement or extractvalue,
23138 // update Width to reflect the width of the accessed value.
23139 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
23140 Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));
23141
23142 // Otherwise, we need to visit the operands of the instruction. We only
23143 // handle the interesting cases from buildTree here. If an operand is an
23144 // instruction we haven't yet visited that is in the same basic block as
23145 // the user, or the user is a PHI node, we add it to the worklist.
23146 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
23147 BinaryOperator, UnaryOperator>(Val: I)) {
23148 for (Use &U : I->operands()) {
23149 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
23150 if (Visited.insert(Ptr: J).second &&
23151 (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
23152 Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
23153 continue;
23154 }
23155 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
23156 FirstNonBool = U.get();
23157 }
23158 } else {
23159 break;
23160 }
23161 }
23162
23163 // If we didn't encounter a memory access in the expression tree, or if we
23164 // gave up for some reason, just return the width of V. Otherwise, return the
23165 // maximum width we found.
23166 if (!Width) {
23167 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
23168 V = FirstNonBool;
23169 Width = DL->getTypeSizeInBits(Ty: V->getType());
23170 }
23171
23172 for (Instruction *I : Visited)
23173 InstrElementSize[I] = Width;
23174
23175 return Width;
23176}
23177
23178bool BoUpSLP::collectValuesToDemote(
23179 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
23180 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
23181 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
23182 bool &IsProfitableToDemote, bool IsTruncRoot) const {
23183 // We can always demote constants.
23184 if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
23185 return true;
23186
23187 unsigned OrigBitWidth =
23188 DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType()->getScalarType());
23189 if (OrigBitWidth == BitWidth) {
23190 MaxDepthLevel = 1;
23191 return true;
23192 }
23193
23194 // Check if the node was analyzed already and must keep its original bitwidth.
23195 if (NodesToKeepBWs.contains(V: E.Idx))
23196 return false;
23197
23198 // If the value is not a vectorized instruction in the expression, is not
23199 // used by an insertelement instruction, and is not used in multiple vector
23200 // nodes, it cannot be demoted.
23201 bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
23202 if (isa<PoisonValue>(Val: R))
23203 return false;
23204 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
23205 });
23206 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
23207 if (isa<PoisonValue>(Val: V))
23208 return true;
23209 if (getTreeEntries(V).size() > 1)
23210 return false;
23211 // For the last shuffle of sext/zext with many uses we need to check the
23212 // extra bit for unsigned values, otherwise we may end up with incorrect
23213 // casting for reused scalars.
23214 bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
23215 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
23216 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
23217 if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
23218 return true;
23219 }
23220 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
23221 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
23222 if (IsSignedNode)
23223 ++BitWidth1;
23224 if (auto *I = dyn_cast<Instruction>(Val: V)) {
23225 APInt Mask = DB->getDemandedBits(I);
23226 unsigned BitWidth2 =
23227 std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
23228 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
23229 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
23230 if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
23231 break;
23232 BitWidth2 *= 2;
23233 }
23234 BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
23235 }
23236 BitWidth = std::max(a: BitWidth, b: BitWidth1);
23237 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
23238 };
23239 auto FinalAnalysis = [&, TTI = TTI]() {
23240 if (!IsProfitableToDemote)
23241 return false;
23242 bool Res = all_of(
23243 Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
23244 // Demote gathers.
23245 if (Res && E.isGather()) {
23246 if (E.hasState()) {
23247 if (const TreeEntry *SameTE =
23248 getSameValuesTreeEntry(V: E.getMainOp(), VL: E.Scalars);
23249 SameTE)
23250 if (collectValuesToDemote(E: *SameTE, IsProfitableToDemoteRoot, BitWidth,
23251 ToDemote, Visited, NodesToKeepBWs,
23252 MaxDepthLevel, IsProfitableToDemote,
23253 IsTruncRoot)) {
23254 ToDemote.push_back(Elt: E.Idx);
23255 return true;
23256 }
23257 }
23258 // Check the possible extractelement instruction bases and the final
23259 // vector length.
23260 SmallPtrSet<Value *, 4> UniqueBases;
23261 for (Value *V : E.Scalars) {
23262 auto *EE = dyn_cast<ExtractElementInst>(Val: V);
23263 if (!EE)
23264 continue;
23265 UniqueBases.insert(Ptr: EE->getVectorOperand());
23266 }
23267 const unsigned VF = E.Scalars.size();
23268 Type *OrigScalarTy = E.Scalars.front()->getType();
23269 if (UniqueBases.size() <= 2 ||
23270 ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF)) >=
23271 ::getNumberOfParts(
23272 TTI: *TTI,
23273 VecTy: getWidenedType(
23274 ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth),
23275 VF))) {
23276 ToDemote.push_back(Elt: E.Idx);
23277 return true;
23278 }
23279 }
23280 return Res;
23281 };
23282 if (E.isGather() || !Visited.insert(V: &E).second ||
23283 any_of(Range: E.Scalars, P: [&](Value *V) {
23284 return !isa<Constant>(Val: V) && all_of(Range: V->users(), P: [&](User *U) {
23285 return isa<InsertElementInst>(Val: U) && !isVectorized(V: U);
23286 });
23287 }))
23288 return FinalAnalysis();
23289
23290 if (any_of(Range: E.Scalars, P: [&](Value *V) {
23291 return !isa<Constant>(Val: V) && !all_of(Range: V->users(), P: [=](User *U) {
23292 return isVectorized(V: U) ||
23293 (E.Idx == 0 && UserIgnoreList &&
23294 UserIgnoreList->contains(V: U)) ||
23295 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
23296 !U->getType()->isScalableTy() &&
23297 DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
23298 }) && !IsPotentiallyTruncated(V, BitWidth);
23299 }))
23300 return false;
23301
23302 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
23303 bool &NeedToExit) {
23304 NeedToExit = false;
23305 unsigned InitLevel = MaxDepthLevel;
23306 for (const TreeEntry *Op : Operands) {
23307 unsigned Level = InitLevel;
23308 if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
23309 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel&: Level,
23310 IsProfitableToDemote, IsTruncRoot)) {
23311 if (!IsProfitableToDemote)
23312 return false;
23313 NeedToExit = true;
23314 if (!FinalAnalysis())
23315 return false;
23316 continue;
23317 }
23318 MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
23319 }
23320 return true;
23321 };
23322 auto AttemptCheckBitwidth =
23323 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
23324 // Try all bitwidths < OrigBitWidth.
23325 NeedToExit = false;
23326 unsigned BestFailBitwidth = 0;
23327 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
23328 if (Checker(BitWidth, OrigBitWidth))
23329 return true;
23330 if (BestFailBitwidth == 0 && FinalAnalysis())
23331 BestFailBitwidth = BitWidth;
23332 }
23333 if (BitWidth >= OrigBitWidth) {
23334 if (BestFailBitwidth == 0) {
23335 BitWidth = OrigBitWidth;
23336 return false;
23337 }
23338 MaxDepthLevel = 1;
23339 BitWidth = BestFailBitwidth;
23340 NeedToExit = true;
23341 return true;
23342 }
23343 return false;
23344 };
23345 auto TryProcessInstruction =
23346 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
23347 function_ref<bool(unsigned, unsigned)> Checker = {}) {
23348 if (Operands.empty()) {
23349 if (!IsTruncRoot)
23350 MaxDepthLevel = 1;
23351 for (Value *V : E.Scalars)
23352 (void)IsPotentiallyTruncated(V, BitWidth);
23353 } else {
23354 // Several vectorized uses? Check if we can truncate it; otherwise,
23355 // exit.
23356 if (any_of(Range: E.Scalars, P: [&](Value *V) {
23357 return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
23358 }))
23359 return false;
23360 bool NeedToExit = false;
23361 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
23362 return false;
23363 if (NeedToExit)
23364 return true;
23365 if (!ProcessOperands(Operands, NeedToExit))
23366 return false;
23367 if (NeedToExit)
23368 return true;
23369 }
23370
23371 ++MaxDepthLevel;
23372 // Record the entry that we can demote.
23373 ToDemote.push_back(Elt: E.Idx);
23374 return IsProfitableToDemote;
23375 };
23376
23377 if (E.State == TreeEntry::SplitVectorize)
23378 return TryProcessInstruction(
23379 BitWidth,
23380 {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
23381 VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});
23382
23383 if (E.isAltShuffle()) {
23384 // Combining these opcodes may lead to incorrect analysis, skip for now.
23385 auto IsDangerousOpcode = [](unsigned Opcode) {
23386 switch (Opcode) {
23387 case Instruction::Shl:
23388 case Instruction::AShr:
23389 case Instruction::LShr:
23390 case Instruction::UDiv:
23391 case Instruction::SDiv:
23392 case Instruction::URem:
23393 case Instruction::SRem:
23394 return true;
23395 default:
23396 break;
23397 }
23398 return false;
23399 };
23400 if (IsDangerousOpcode(E.getAltOpcode()))
23401 return FinalAnalysis();
23402 }
23403
23404 switch (E.getOpcode()) {
23405
23406 // We can always demote truncations and extensions. Since truncations can
23407 // seed additional demotion, we save the truncated value.
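// Illustrative example of the demotion this enables (assuming the whole
// chain is part of the vectorizable tree):
//   %l = load i8, ptr %p
//   %e = zext i8 %l to i32
//   %a = add i32 %e, 1
//   %t = trunc i32 %a to i8
// The add can be evaluated directly in i8, making the zext/trunc pair
// redundant after demotion.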
23408 case Instruction::Trunc:
23409 if (IsProfitableToDemoteRoot)
23410 IsProfitableToDemote = true;
23411 return TryProcessInstruction(BitWidth);
23412 case Instruction::ZExt:
23413 case Instruction::SExt:
23414 if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
23415 E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
23416 E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
23417 return false;
23418 IsProfitableToDemote = true;
23419 return TryProcessInstruction(BitWidth);
23420
23421 // We can demote certain binary operations if we can demote both of their
23422 // operands.
23423 case Instruction::Add:
23424 case Instruction::Sub:
23425 case Instruction::Mul:
23426 case Instruction::And:
23427 case Instruction::Or:
23428 case Instruction::Xor: {
23429 return TryProcessInstruction(
23430 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
23431 }
23432 case Instruction::Freeze:
23433 return TryProcessInstruction(BitWidth, getOperandEntry(E: &E, Idx: 0));
23434 case Instruction::Shl: {
23435 // If we are truncating the result of this SHL, and if it's a shift of an
23436 // in-range amount, we can always perform a SHL in a smaller type.
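// Illustrative example: for
//   %s = shl i32 %x, 3
//   %t = trunc i32 %s to i16
// the shift amount (3) is known to be less than 16, so the shl can be
// performed in i16 on a truncated %x without changing the low 16 bits.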
23437 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
23438 return all_of(Range: E.Scalars, P: [&](Value *V) {
23439 if (isa<PoisonValue>(Val: V))
23440 return true;
23441 if (E.isCopyableElement(V))
23442 return true;
23443 auto *I = cast<Instruction>(Val: V);
23444 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
23445 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
23446 });
23447 };
23448 return TryProcessInstruction(
23449 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
23450 }
23451 case Instruction::LShr: {
23452 // If this is a truncate of a logical shr, we can truncate it to a smaller
23453 // lshr iff we know that the bits we would otherwise be shifting in are
23454 // already zeros.
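// Illustrative example: if the top 16 bits of %x are known to be zero, then
// for
//   %s = lshr i32 %x, 4
//   %t = trunc i32 %s to i16
// the bits shifted into the low 16 bits are already zero, so the lshr can
// be performed as an i16 lshr on a truncated %x.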
23455 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23456 return all_of(Range: E.Scalars, P: [&](Value *V) {
23457 if (isa<PoisonValue>(Val: V))
23458 return true;
23459 APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
23460 if (E.isCopyableElement(V))
23461 return MaskedValueIsZero(V, Mask: ShiftedBits, SQ: SimplifyQuery(*DL));
23462 auto *I = cast<Instruction>(Val: V);
23463 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
23464 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
23465 MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
23466 SQ: SimplifyQuery(*DL));
23467 });
23468 };
23469 return TryProcessInstruction(
23470 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
23471 LShrChecker);
23472 }
23473 case Instruction::AShr: {
23474 // If this is a truncate of an arithmetic shr, we can truncate it to a
23475 // smaller ashr iff we know that all the bits from the sign bit of the
23476 // original type down to the sign bit of the truncated type are the same.
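// Illustrative example: if %x is known to have at least 20 sign bits, then
// for
//   %s = ashr i32 %x, 4
//   %t = trunc i32 %s to i16
// the 16 high bits dropped by the trunc are all copies of the sign bit, so
// an i16 ashr of a truncated %x produces the same low 16 bits.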
23477 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23478 return all_of(Range: E.Scalars, P: [&](Value *V) {
23479 if (isa<PoisonValue>(Val: V))
23480 return true;
23481 auto *I = cast<Instruction>(Val: V);
23482 KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
23483 unsigned ShiftedBits = OrigBitWidth - BitWidth;
23484 return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
23485 ShiftedBits <
23486 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
23487 });
23488 };
23489 return TryProcessInstruction(
23490 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
23491 AShrChecker);
23492 }
23493 case Instruction::UDiv:
23494 case Instruction::URem: {
23495 // UDiv and URem can be truncated if all the truncated bits are zero.
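// Illustrative example: if the top 24 bits of both %x and %y are known to
// be zero, then
//   %d = udiv i32 %x, %y
// truncated to i8 is equivalent to udiv i8 (trunc %x), (trunc %y).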
23496 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23497 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23498 return all_of(Range: E.Scalars, P: [&](Value *V) {
23499 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
23500 if (E.hasCopyableElements() && E.isCopyableElement(V))
23501 return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
23502 auto *I = cast<Instruction>(Val: V);
23503 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)) &&
23504 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
23505 });
23506 };
23507 return TryProcessInstruction(
23508 BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
23509 }
23510
23511 // We can demote selects if we can demote their true and false values.
23512 case Instruction::Select: {
23513 return TryProcessInstruction(
23514 BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
23515 }
23516
23517 // We can demote phis if we can demote all their incoming operands.
23518 case Instruction::PHI: {
23519 const unsigned NumOps = E.getNumOperands();
23520 SmallVector<const TreeEntry *> Ops(NumOps);
23521 transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
23522 F: [&](unsigned Idx) { return getOperandEntry(E: &E, Idx); });
23523
23524 return TryProcessInstruction(BitWidth, Ops);
23525 }
23526
23527 case Instruction::Call: {
23528 auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
23529 if (!IC)
23530 break;
23531 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
23532 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
23533 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
23534 break;
23535 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
23536 function_ref<bool(unsigned, unsigned)> CallChecker;
23537 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23538 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23539 return all_of(Range: E.Scalars, P: [&](Value *V) {
23540 auto *I = cast<Instruction>(Val: V);
23541 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
23542 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
23543 return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
23544 SQ: SimplifyQuery(*DL)) &&
23545 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
23546 }
23547 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
23548 "Expected min/max intrinsics only.");
23549 unsigned SignBits = OrigBitWidth - BitWidth;
23550 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
23551 unsigned Op0SignBits =
23552 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
23553 unsigned Op1SignBits =
23554 ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
23555 return SignBits <= Op0SignBits &&
23556 ((SignBits != Op0SignBits &&
23557 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
23558 MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
23559 SQ: SimplifyQuery(*DL))) &&
23560 SignBits <= Op1SignBits &&
23561 ((SignBits != Op1SignBits &&
23562 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
23563 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)));
23564 });
23565 };
23566 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
23567 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
23568 return all_of(Range: E.Scalars, P: [&](Value *V) {
23569 auto *I = cast<Instruction>(Val: V);
23570 unsigned SignBits = OrigBitWidth - BitWidth;
23571 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
23572 unsigned Op0SignBits =
23573 ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
23574 return SignBits <= Op0SignBits &&
23575 ((SignBits != Op0SignBits &&
23576 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
23577 MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)));
23578 });
23579 };
23580 if (ID != Intrinsic::abs) {
23581 Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
23582 CallChecker = CompChecker;
23583 } else {
23584 CallChecker = AbsChecker;
23585 }
23586 InstructionCost BestCost =
23587 std::numeric_limits<InstructionCost::CostType>::max();
23588 unsigned BestBitWidth = BitWidth;
23589 unsigned VF = E.Scalars.size();
23590 // Choose the best bitwidth based on cost estimations.
23591 auto Checker = [&](unsigned BitWidth, unsigned) {
23592 unsigned MinBW = PowerOf2Ceil(A: BitWidth);
23593 SmallVector<Type *> ArgTys =
23594 buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW, TTI);
23595 auto VecCallCosts = getVectorCallCosts(
23596 CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
23597 TTI, TLI, ArgTys);
23598 InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
23599 if (Cost < BestCost) {
23600 BestCost = Cost;
23601 BestBitWidth = BitWidth;
23602 }
23603 return false;
23604 };
23605 [[maybe_unused]] bool NeedToExit;
23606 (void)AttemptCheckBitwidth(Checker, NeedToExit);
23607 BitWidth = BestBitWidth;
23608 return TryProcessInstruction(BitWidth, Operands, CallChecker);
23609 }
23610
23611 // Otherwise, conservatively give up.
23612 default:
23613 break;
23614 }
23615 MaxDepthLevel = 1;
23616 return FinalAnalysis();
23617}
23618
23619static RecurKind getRdxKind(Value *V);
23620
23621void BoUpSLP::computeMinimumValueSizes() {
23622 // We only attempt to truncate integer expressions.
23623 bool IsStoreOrInsertElt =
23624 VectorizableTree.front()->hasState() &&
23625 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
23626 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
23627 if ((IsStoreOrInsertElt || UserIgnoreList) &&
23628 ExtraBitWidthNodes.size() <= 1 &&
23629 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
23630 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
23631 return;
23632
23633 unsigned NodeIdx = 0;
23634 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
23635 NodeIdx = 1;
23636
23637 // Ensure the roots of the vectorizable tree don't form a cycle.
23638 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
23639 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
23640 "Unexpected tree is graph.");
23641
23642 // If the first value node for a store/insertelement is sext/zext/trunc,
23643 // skip it and resize to the final type.
23644 bool IsTruncRoot = false;
23645 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
23646 SmallVector<unsigned> RootDemotes;
23647 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
23648 if (NodeIdx != 0 &&
23649 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23650 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23651 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
23652 IsTruncRoot = true;
23653 RootDemotes.push_back(Elt: NodeIdx);
23654 IsProfitableToDemoteRoot = true;
23655 ++NodeIdx;
23656 }
23657
23658 // The reduction was already analyzed and found not profitable - exit.
23659 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
23660 return;
23661
23662 SmallVector<unsigned> ToDemote;
23663 auto ComputeMaxBitWidth =
23664 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
23665 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
23666 ToDemote.clear();
23667 // If the root is a trunc and the next node is a gather/buildvector, keep
23668 // the trunc in scalars, which is free in most cases.
23669 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
23670 !NodesToKeepBWs.contains(V: E.Idx) &&
23671 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
23672 all_of(Range: E.Scalars, P: [&](Value *V) {
23673 return V->hasOneUse() || isa<Constant>(Val: V) ||
23674 (!V->hasNUsesOrMore(N: UsesLimit) &&
23675 none_of(Range: V->users(), P: [&](User *U) {
23676 ArrayRef<TreeEntry *> TEs = getTreeEntries(V: U);
23677 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23678 if (TEs.empty() || is_contained(Range&: TEs, Element: UserTE))
23679 return false;
23680 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23681 SelectInst>(Val: U) ||
23682 isa<SIToFPInst, UIToFPInst>(Val: U) ||
23683 (UserTE->hasState() &&
23684 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
23685 SelectInst>(Val: UserTE->getMainOp()) ||
23686 isa<SIToFPInst, UIToFPInst>(Val: UserTE->getMainOp()))))
23687 return true;
23688 unsigned UserTESz = DL->getTypeSizeInBits(
23689 Ty: UserTE->Scalars.front()->getType());
23690 if (all_of(Range&: TEs, P: [&](const TreeEntry *TE) {
23691 auto It = MinBWs.find(Val: TE);
23692 return It != MinBWs.end() &&
23693 It->second.first > UserTESz;
23694 }))
23695 return true;
23696 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
23697 }));
23698 })) {
23699 ToDemote.push_back(Elt: E.Idx);
23700 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
23701 auto It = MinBWs.find(Val: UserTE);
23702 if (It != MinBWs.end())
23703 return It->second.first;
23704 unsigned MaxBitWidth =
23705 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
23706 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
23707 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23708 MaxBitWidth = 8;
23709 return MaxBitWidth;
23710 }
23711
23712 if (!E.hasState())
23713 return 0u;
23714
23715 unsigned VF = E.getVectorFactor();
23716 Type *ScalarTy = E.Scalars.front()->getType();
23717 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
23718 auto *TreeRootIT = dyn_cast<IntegerType>(Val: ScalarTy->getScalarType());
23719 if (!TreeRootIT)
23720 return 0u;
23721
23722 if (any_of(Range: E.Scalars,
23723 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
23724 return 0u;
23725
23726 unsigned NumParts = ::getNumberOfParts(
23727 TTI: *TTI, VecTy: getWidenedType(ScalarTy: TreeRootIT, VF: VF * ScalarTyNumElements));
23728
23729 // The maximum bit width required to represent all the values that can be
23730 // demoted without loss of precision. It would be safe to truncate the roots
23731 // of the expression to this width.
23732 unsigned MaxBitWidth = 1u;
23733
23734 // True if the roots can be zero-extended back to their original type,
23735 // rather than sign-extended. We know that if the leading bits are not
23736 // demanded, we can safely zero-extend.
23737 // IsKnownPositive is true only if the sign bit of every root is known to
23738 // be zero (and the user is not a signed comparison); otherwise it is
23739 // false.
23740 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
23741 if (isa<PoisonValue>(Val: R))
23742 return true;
23743 KnownBits Known = computeKnownBits(V: R, DL: *DL);
23744 return Known.isNonNegative();
23745 });
23746
23747 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
23748 E.UserTreeIndex.UserTE->hasState() &&
23749 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
23750 MaxBitWidth =
23751 std::min(a: DL->getTypeSizeInBits(
23752 Ty: E.UserTreeIndex.UserTE->Scalars.front()->getType()),
23753 b: DL->getTypeSizeInBits(Ty: ScalarTy));
23754
23755 // We first check if all the bits of the roots are demanded. If they're not,
23756 // we can truncate the roots to this narrower type.
23757 for (Value *Root : E.Scalars) {
23758 if (isa<PoisonValue>(Val: Root))
23759 continue;
23760 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, AC, CxtI: nullptr, DT);
23761 TypeSize NumTypeBits =
23762 DL->getTypeSizeInBits(Ty: Root->getType()->getScalarType());
23763 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23764 // If we can't prove that the sign bit is zero, we must add one to the
23765 // maximum bit width to account for the unknown sign bit. This preserves
23766 // the existing sign bit so we can safely sign-extend the root back to the
23767 // original type. Otherwise, if we know the sign bit is zero, we will
23768 // zero-extend the root instead.
23769 //
23770 // FIXME: This is somewhat suboptimal, as there will be cases where adding
23771 // one to the maximum bit width will yield a larger-than-necessary
23772 // type. In general, we need to add an extra bit only if we can't
23773 // prove that the upper bit of the original type is equal to the
23774 // upper bit of the proposed smaller type. If these two bits are
23775 // the same (either zero or one) we know that sign-extending from
23776 // the smaller type will result in the same value. Here, since we
23777 // can't yet prove this, we are just making the proposed smaller
23778 // type larger to ensure correctness.
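// Worked example (illustrative): for an i32 root with 25 known sign bits,
// BitWidth1 is 32 - 25 = 7; if IsKnownPositive is false, the increment
// below makes it 8 so that the root can later be sign-extended back to i32
// without changing its value.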
23779 if (!IsKnownPositive)
23780 ++BitWidth1;
23781
23782 auto *I = dyn_cast<Instruction>(Val: Root);
23783 if (!I) {
23784 MaxBitWidth = std::max(a: BitWidth1, b: MaxBitWidth);
23785 continue;
23786 }
23787 APInt Mask = DB->getDemandedBits(I);
23788 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23789 MaxBitWidth =
23790 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
23791 }
23792
23793 if (MaxBitWidth < 8 && MaxBitWidth > 1)
23794 MaxBitWidth = 8;
23795
23796 // If the original type is large, but the reduced type does not improve
23797 // the register usage - ignore it.
23798 if (NumParts > 1 &&
23799 NumParts ==
23800 ::getNumberOfParts(
23801 TTI: *TTI, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: F->getContext(),
23802 NumBits: bit_ceil(Value: MaxBitWidth)),
23803 VF)))
23804 return 0u;
23805
23806 unsigned Opcode = E.getOpcode();
23807 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
23808 Opcode == Instruction::SExt ||
23809 Opcode == Instruction::ZExt || NumParts > 1;
23810 // Conservatively determine if we can actually truncate the roots of the
23811 // expression. Collect the indices of the tree entries that can be demoted
23812 // in ToDemote.
23813 DenseSet<const TreeEntry *> Visited;
23814 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
23815 bool NeedToDemote = IsProfitableToDemote;
23816
23817 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
23818 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
23819 IsProfitableToDemote&: NeedToDemote, IsTruncRoot) ||
23820 (MaxDepthLevel <= Limit &&
23821 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
23822 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
23823 DL->getTypeSizeInBits(Ty: TreeRootIT) /
23824 DL->getTypeSizeInBits(
23825 Ty: E.getMainOp()->getOperand(i: 0)->getType()) >
23826 2)))))
23827 return 0u;
23828 // Round MaxBitWidth up to the next power-of-two.
23829 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
23830
23831 return MaxBitWidth;
23832 };
23833
23834 // If we can truncate the root, we must collect additional values that might
23835 // be demoted as a result. That is, those seeded by truncations we will
23836 // modify.
23837 // Add reduction ops sizes, if any.
23838 if (UserIgnoreList &&
23839 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
23840 // Convert vector_reduce_add(ZExt(<n x i1>)) to
23841 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
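// Illustrative IR for that rewrite (n == 8):
//   %z = zext <8 x i1> %m to <8 x i32>
//   %r = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %z)
// computes the same value as
//   %b = bitcast <8 x i1> %m to i8
//   %c = call i8 @llvm.ctpop.i8(i8 %b)
//   %r = zext i8 %c to i32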
23842 if (all_of(Range: *UserIgnoreList,
23843 P: [](Value *V) {
23844 return isa<PoisonValue>(Val: V) ||
23845 cast<Instruction>(Val: V)->getOpcode() == Instruction::Add;
23846 }) &&
23847 VectorizableTree.front()->State == TreeEntry::Vectorize &&
23848 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
23849 cast<CastInst>(Val: VectorizableTree.front()->getMainOp())->getSrcTy() ==
23850 Builder.getInt1Ty()) {
23851 ReductionBitWidth = 1;
23852 } else {
23853 for (Value *V : *UserIgnoreList) {
23854 if (isa<PoisonValue>(Val: V))
23855 continue;
23856 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
23857 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
23858 unsigned BitWidth1 = NumTypeBits - NumSignBits;
23859 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
23860 ++BitWidth1;
23861 unsigned BitWidth2 = BitWidth1;
23862 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
23863 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
23864 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
23865 }
23866 ReductionBitWidth =
23867 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
23868 }
23869 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
23870 ReductionBitWidth = 8;
23871
23872 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
23873 }
23874 }
23875 bool IsTopRoot = NodeIdx == 0;
23876 while (NodeIdx < VectorizableTree.size() &&
23877 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
23878 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
23879 RootDemotes.push_back(Elt: NodeIdx);
23880 ++NodeIdx;
23881 IsTruncRoot = true;
23882 }
23883 bool IsSignedCmp = false;
23884 if (UserIgnoreList &&
23885 all_of(Range: *UserIgnoreList,
23886 P: match_fn(P: m_CombineOr(L: m_SMin(L: m_Value(), R: m_Value()),
23887 R: m_SMax(L: m_Value(), R: m_Value())))))
23888 IsSignedCmp = true;
23889 while (NodeIdx < VectorizableTree.size()) {
23890 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
23891 unsigned Limit = 2;
23892 if (IsTopRoot &&
23893 ReductionBitWidth ==
23894 DL->getTypeSizeInBits(
23895 Ty: VectorizableTree.front()->Scalars.front()->getType()))
23896 Limit = 3;
23897 unsigned MaxBitWidth = ComputeMaxBitWidth(
23898 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
23899 IsTruncRoot, IsSignedCmp);
23900 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
23901 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
23902 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
23903 else if (MaxBitWidth == 0)
23904 ReductionBitWidth = 0;
23905 }
23906
23907 for (unsigned Idx : RootDemotes) {
23908 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
23909 uint32_t OrigBitWidth =
23910 DL->getTypeSizeInBits(Ty: V->getType()->getScalarType());
23911 if (OrigBitWidth > MaxBitWidth) {
23912 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
23913 return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
23914 }
23915 return false;
23916 }))
23917 ToDemote.push_back(Elt: Idx);
23918 }
23919 RootDemotes.clear();
23920 IsTopRoot = false;
23921 IsProfitableToDemoteRoot = true;
23922
23923 if (ExtraBitWidthNodes.empty()) {
23924 NodeIdx = VectorizableTree.size();
23925 } else {
23926 unsigned NewIdx = 0;
23927 do {
23928 NewIdx = *ExtraBitWidthNodes.begin();
23929 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
23930 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
23931 NodeIdx = NewIdx;
23932 IsTruncRoot =
23933 NodeIdx < VectorizableTree.size() &&
23934 VectorizableTree[NodeIdx]->UserTreeIndex &&
23935 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
23936 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23937 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23938 Instruction::Trunc &&
23939 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
23940 IsSignedCmp =
23941 NodeIdx < VectorizableTree.size() &&
23942 VectorizableTree[NodeIdx]->UserTreeIndex &&
23943 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
23944 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
23945 Instruction::ICmp &&
23946 any_of(
23947 Range&: VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
23948 P: [&](Value *V) {
23949 auto *IC = dyn_cast<ICmpInst>(Val: V);
23950 return IC && (IC->isSigned() ||
23951 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
23952 SQ: SimplifyQuery(*DL)) ||
23953 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
23954 SQ: SimplifyQuery(*DL)));
23955 });
23956 }
23957
23958 // If the maximum bit width we compute is less than the width of the roots'
23959 // type, we can proceed with the narrowing. Otherwise, do nothing.
23960 if (MaxBitWidth == 0 ||
23961 MaxBitWidth >=
23962 cast<IntegerType>(Val: TreeRoot.front()->getType()->getScalarType())
23963 ->getBitWidth()) {
23964 if (UserIgnoreList)
23965 AnalyzedMinBWVals.insert_range(R&: TreeRoot);
23966 NodesToKeepBWs.insert_range(R&: ToDemote);
23967 continue;
23968 }
23969
23970 // Finally, map the values we can demote to the maximum bit width we
23971 // computed.
23972 for (unsigned Idx : ToDemote) {
23973 TreeEntry *TE = VectorizableTree[Idx].get();
23974 if (MinBWs.contains(Val: TE))
23975 continue;
23976 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
23977 if (isa<PoisonValue>(Val: R))
23978 return false;
23979 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
23980 });
23981 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
23982 }
23983 }
23984}
23985
23986PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
23987 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
23988 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
23989 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
23990 auto *AA = &AM.getResult<AAManager>(IR&: F);
23991 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
23992 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
23993 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
23994 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
23995 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
23996
23997 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
23998 if (!Changed)
23999 return PreservedAnalyses::all();
24000
24001 PreservedAnalyses PA;
24002 PA.preserveSet<CFGAnalyses>();
24003 return PA;
24004}
24005
24006bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
24007 TargetTransformInfo *TTI_,
24008 TargetLibraryInfo *TLI_, AAResults *AA_,
24009 LoopInfo *LI_, DominatorTree *DT_,
24010 AssumptionCache *AC_, DemandedBits *DB_,
24011 OptimizationRemarkEmitter *ORE_) {
24012 if (!RunSLPVectorization)
24013 return false;
24014 SE = SE_;
24015 TTI = TTI_;
24016 TLI = TLI_;
24017 AA = AA_;
24018 LI = LI_;
24019 DT = DT_;
24020 AC = AC_;
24021 DB = DB_;
24022 DL = &F.getDataLayout();
24023
24024 Stores.clear();
24025 GEPs.clear();
24026 bool Changed = false;
24027
24028 // If the target claims to have no vector registers don't attempt
24029 // vectorization.
24030 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
24031 LLVM_DEBUG(
24032 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
24033 return false;
24034 }
24035
24036 // Don't vectorize when the attribute NoImplicitFloat is used.
24037 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
24038 return false;
24039
24040 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
24041
24042 // Use the bottom up slp vectorizer to construct chains that start with
24043 // store instructions.
24044 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
24045
24046 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
24047 // delete instructions.
24048
24049 // Update DFS numbers now so that we can use them for ordering.
24050 DT->updateDFSNumbers();
24051
24052 // Scan the blocks in the function in post order.
24053 for (auto *BB : post_order(G: &F.getEntryBlock())) {
24054 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()))
24055 continue;
24056
24057 // Start new block - clear the list of reduction roots.
24058 R.clearReductionData();
24059 collectSeedInstructions(BB);
24060
24061 // Vectorize trees that end at stores.
24062 if (!Stores.empty()) {
24063 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
24064 << " underlying objects.\n");
24065 Changed |= vectorizeStoreChains(R);
24066 }
24067
24068 // Vectorize trees that end at reductions.
24069 Changed |= vectorizeChainsInBlock(BB, R);
24070
24071 // Vectorize the index computations of getelementptr instructions. This
24072 // is primarily intended to catch gather-like idioms ending at
24073 // non-consecutive loads.
24074 if (!GEPs.empty()) {
24075 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
24076 << " underlying objects.\n");
24077 Changed |= vectorizeGEPIndices(BB, R);
24078 }
24079 }
24080
24081 if (Changed) {
24082 R.optimizeGatherSequence();
24083 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
24084 }
24085 return Changed;
24086}
24087
24088std::optional<bool>
24089SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
24090 unsigned Idx, unsigned MinVF,
24091 unsigned &Size) {
24092 Size = 0;
24093 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
24094 << "\n");
24095 const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
24096 unsigned VF = Chain.size();
24097
24098 if (!has_single_bit(Value: Sz) ||
24099 !hasFullVectorsOrPowerOf2(
24100 TTI: *TTI, Ty: cast<StoreInst>(Val: Chain.front())->getValueOperand()->getType(),
24101 Sz: VF) ||
24102 VF < 2 || VF < MinVF) {
24103 // Check if vectorizing with a non-power-of-2 VF should be considered. At
24104 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
24105 // all vector lanes are used.
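// For example, VF == 3 or VF == 7 leaves only one lane unused, while
// VF == 5 or VF == 6 would leave more lanes unused.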
24106 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
24107 return false;
24108 }
24109
24110 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
24111 << "\n");
24112
24113 SetVector<Value *> ValOps;
24114 for (Value *V : Chain)
24115 ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
24116 // Exit if the operands do not share same/alternate opcodes or the number of unique values is not a power of 2.
24117 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
24118 InstructionsState S = Analysis.buildInstructionsState(
24119 VL: ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
24120 if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
24121 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
24122 bool IsAllowedSize =
24123 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ValOps.front()->getType(),
24124 Sz: ValOps.size()) ||
24125 (VectorizeNonPowerOf2 && has_single_bit(Value: ValOps.size() + 1));
24126 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
24127 (!S.getMainOp()->isSafeToRemove() ||
24128 any_of(Range: ValOps.getArrayRef(),
24129 P: [&](Value *V) {
24130 return !isa<ExtractElementInst>(Val: V) &&
24131 (V->getNumUses() > Chain.size() ||
24132 any_of(Range: V->users(), P: [&](User *U) {
24133 return !Stores.contains(V: U);
24134 }));
24135 }))) ||
24136 (ValOps.size() > Chain.size() / 2 && !S)) {
24137 Size = (!IsAllowedSize && S) ? 1 : 2;
24138 return false;
24139 }
24140 }
24141 if (R.isLoadCombineCandidate(Stores: Chain))
24142 return true;
24143 R.buildTree(Roots: Chain);
24144 // Check if the tree is tiny and the store itself or its value is not vectorized.
24145 if (R.isTreeTinyAndNotFullyVectorizable()) {
24146 if (R.isGathered(V: Chain.front()) ||
24147 R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
24148 return std::nullopt;
24149 Size = R.getCanonicalGraphSize();
24150 return false;
24151 }
24152 if (R.isProfitableToReorder()) {
24153 R.reorderTopToBottom();
24154 R.reorderBottomToTop();
24155 }
24156 R.transformNodes();
24157 R.computeMinimumValueSizes();
24158
24159 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
24160 R.buildExternalUses();
24161
24162 Size = R.getCanonicalGraphSize();
24163 if (S && S.getOpcode() == Instruction::Load)
24164 Size = 2; // cut off masked gather small trees
24165 InstructionCost Cost = R.getTreeCost(TreeCost);
24166
24167 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
24168 if (Cost < -SLPCostThreshold) {
24169 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
24170
24171 using namespace ore;
24172
24173 R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "StoresVectorized",
24174 cast<StoreInst>(Val: Chain[0]))
24175 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
24176 << " and with tree size "
24177 << NV("TreeSize", R.getTreeSize()));
24178
24179 R.vectorizeTree();
24180 return true;
24181 }
24182
24183 return false;
24184}
24185
24186/// Checks that the deviation of the tree sizes is small relative to the mean size (roughly, the standard deviation stays below ~10% of the mean).
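/// For example (illustrative): tree sizes {4, 4, 5} pass the check (Mean = 4
/// and, with integer division, Dev = 0), while {2, 10} do not (Mean = 6,
/// Dev = 16, and 16 * 96 / 36 != 0). Entries of size 1 are ignored.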
24187static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes) {
24188 unsigned Num = 0;
24189 uint64_t Sum = std::accumulate(
24190 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
24191 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24192 unsigned Size = Val.first;
24193 if (Size == 1)
24194 return V;
24195 ++Num;
24196 return V + Size;
24197 });
24198 if (Num == 0)
24199 return true;
24200 uint64_t Mean = Sum / Num;
24201 if (Mean == 0)
24202 return true;
24203 uint64_t Dev = std::accumulate(
24204 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
24205 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24206 unsigned P = Val.first;
24207 if (P == 1)
24208 return V;
24209 return V + (P - Mean) * (P - Mean);
24210 }) /
24211 Num;
24212 return Dev * 96 / (Mean * Mean) == 0;
24213}
24214
24215namespace {
24216
24217/// A group of stores that we'll try to bundle together using vector ops.
24218/// They are ordered using the signed distance of their address operand to the
24219/// address of this group's BaseInstr.
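/// For example (illustrative), with a base store to p[10], stores to p[11]
/// and p[13] would be recorded under the distances +1 and +3, and a store to
/// p[9] under the distance -1.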
24220class RelatedStoreInsts {
24221public:
24222 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
24223 : AllStores(AllStores) {
24224 reset(NewBaseInstr: BaseInstrIdx);
24225 }
24226
24227 void reset(unsigned NewBaseInstr) {
24228 assert(NewBaseInstr < AllStores.size() &&
24229 "Instruction index out of bounds");
24230 BaseInstrIdx = NewBaseInstr;
24231 Instrs.clear();
24232 insertOrLookup(InstrIdx: NewBaseInstr, PtrDist: 0);
24233 }
24234
24235 /// Tries to insert \p InstrIdx as the store with a pointer distance of
24236 /// \p PtrDist.
24237 /// Does nothing if there is already a store with that \p PtrDist.
24238 /// \returns The previously associated Instruction index, or std::nullopt
24239 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
24240 auto [It, Inserted] = Instrs.emplace(args&: PtrDist, args&: InstrIdx);
24241 return Inserted ? std::nullopt : std::make_optional(t&: It->second);
24242 }
24243
24244 using DistToInstMap = std::map<int64_t, unsigned>;
24245 const DistToInstMap &getStores() const { return Instrs; }
24246
24247 /// If \p SI is related to this group of stores, return the distance of its
24248 /// pointer operand to that of the group's BaseInstr.
24249 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
24250 ScalarEvolution &SE) const {
24251 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
24252 return getPointersDiff(
24253 ElemTyA: BaseStore.getValueOperand()->getType(), PtrA: BaseStore.getPointerOperand(),
24254 ElemTyB: SI.getValueOperand()->getType(), PtrB: SI.getPointerOperand(), DL, SE,
24255 /*StrictCheck=*/true);
24256 }
24257
24258 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
24259 /// Stores whose index is less than \p MinSafeIdx will be dropped.
24260 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
24261 int64_t DistFromCurBase) {
24262 DistToInstMap PrevSet = std::move(Instrs);
24263 reset(NewBaseInstr: NewBaseInstIdx);
24264
24265 // Re-insert stores that come after MinSafeIdx to try and vectorize them
24266 // again. Their distance will be "rebased" to use NewBaseInstIdx as
24267 // reference.
24268 for (auto [Dist, InstIdx] : PrevSet) {
24269 if (InstIdx >= MinSafeIdx)
24270 insertOrLookup(InstrIdx: InstIdx, PtrDist: Dist - DistFromCurBase);
24271 }
24272 }
24273
24274 /// Remove all stores that have been vectorized from this group.
24275 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
24276 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
24277 Range: reverse(C&: Instrs), P: [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
24278 return VectorizedStores.contains(Ptr: AllStores[DistAndIdx.second]);
24279 });
24280
24281 // Get a forward iterator pointing after the last vectorized store and erase
24282 // all stores before it so we don't try to vectorize them again.
24283 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
24284 Instrs.erase(first: Instrs.begin(), last: VectorizedStoresEnd);
24285 }
24286
24287private:
24288 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
24289 unsigned BaseInstrIdx;
24290
24291 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
24292 DistToInstMap Instrs;
24293
24294 /// Reference to all the stores in the BB being analyzed.
24295 ArrayRef<StoreInst *> AllStores;
24296};
24297
24298} // end anonymous namespace
24299
24300bool SLPVectorizerPass::vectorizeStores(
24301 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
24302 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
24303 &Visited) {
24304 // We may run into multiple chains that merge into a single chain. We mark the
24305 // stores that we vectorized so that we don't visit the same store twice.
24306 BoUpSLP::ValueSet VectorizedStores;
24307 bool Changed = false;
24308
24309 auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
24310 int64_t PrevDist = -1;
24311 BoUpSLP::ValueList Operands;
24312 // Collect the chain into a list.
24313 for (auto [Idx, Data] : enumerate(First: StoreSeq)) {
24314 auto &[Dist, InstIdx] = Data;
24315 if (Operands.empty() || Dist - PrevDist == 1) {
24316 Operands.push_back(Elt: Stores[InstIdx]);
24317 PrevDist = Dist;
24318 if (Idx != StoreSeq.size() - 1)
24319 continue;
24320 }
24321 llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
24322 Operands.clear();
24323 Operands.push_back(Elt: Stores[InstIdx]);
24324 PrevDist = Dist;
24325 });
24326
24327 if (Operands.size() <= 1 ||
24328 !Visited
24329 .insert(V: {Operands.front(),
24330 cast<StoreInst>(Val: Operands.front())->getValueOperand(),
24331 Operands.back(),
24332 cast<StoreInst>(Val: Operands.back())->getValueOperand(),
24333 Operands.size()})
24334 .second)
24335 continue;
24336
24337 unsigned MaxVecRegSize = R.getMaxVecRegSize();
24338 unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
24339 unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);
24340
24341 unsigned MaxVF =
24342 std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
24343 auto *Store = cast<StoreInst>(Val: Operands[0]);
24344 Type *StoreTy = Store->getValueOperand()->getType();
24345 Type *ValueTy = StoreTy;
24346 if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
24347 ValueTy = Trunc->getSrcTy();
24348 // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType, but
24349 // getStoreMinimumVF only supports scalar types as arguments. As a result,
24350 // we need to use the element types of StoreTy and ValueTy to retrieve the
24351 // VF and then transform it back.
24352 // Remember: VF is defined as the number we want to vectorize, not the
24353 // number of elements in the final vector.
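// For example (hypothetical numbers): with REVEC and StoreTy == <2 x i16>,
// if getStoreMinimumVF returns 8 for the scalar i16 type, MinVF becomes
// 8 / 2 == 4, i.e. four <2 x i16> stores (eight i16 lanes in total).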
24354 Type *StoreScalarTy = StoreTy->getScalarType();
24355 unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF(
24356 VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreScalarTy)), ScalarMemTy: StoreScalarTy,
24357 ScalarValTy: ValueTy->getScalarType()));
24358 MinVF /= getNumElements(Ty: StoreTy);
24359 MinVF = std::max<unsigned>(a: 2, b: MinVF);
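// Illustrative REVEC example (hypothetical numbers): if StoreTy is <4 x i32>
// and the scalar query above returns a minimum of 8 lanes, then
// MinVF = 8 / 4 = 2, i.e. at least two <4 x i32> stores are grouped.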
24360
24361 if (MaxVF < MinVF) {
24362 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
24363 << ") < "
24364 << "MinVF (" << MinVF << ")\n");
24365 continue;
24366 }
24367
24368 unsigned NonPowerOf2VF = 0;
24369 if (VectorizeNonPowerOf2) {
24370 // First try vectorizing with a non-power-of-2 VF. At the moment, only
24371 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
24372 // lanes are used.
24373 unsigned CandVF = std::clamp<unsigned>(val: Operands.size(), lo: MinVF, hi: MaxVF);
24374 if (has_single_bit(Value: CandVF + 1)) {
24375 NonPowerOf2VF = CandVF;
24376 assert(NonPowerOf2VF != MaxVF &&
24377 "Non-power-of-2 VF should not be equal to MaxVF");
24378 }
24379 }
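// E.g. with 7 candidate stores and MinVF <= 7 < MaxVF, CandVF = 7 and
// 7 + 1 = 8 is a power of two, so NonPowerOf2VF = 7.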
24380
24381 // MaxRegVF represents the number of instructions (scalar, or vector in
24382 // case of revec) that can be vectorized to naturally fit in a vector
24383 // register.
24384 unsigned MaxRegVF = MaxVF;
24385
24386 MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
24387 if (MaxVF < MinVF) {
24388 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
24389 << ") < "
24390 << "MinVF (" << MinVF << ")\n");
24391 continue;
24392 }
24393
24394 SmallVector<unsigned> CandidateVFs;
24395 for (unsigned VF = std::max(a: MaxVF, b: NonPowerOf2VF); VF >= MinVF;
24396 VF = divideCeil(Numerator: VF, Denominator: 2))
24397 CandidateVFs.push_back(Elt: VF);
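// E.g. MaxVF = 16, MinVF = 2 and no non-power-of-2 VF gives
// CandidateVFs = {16, 8, 4, 2}.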
24398
24399 unsigned End = Operands.size();
24400 unsigned Repeat = 0;
24401 constexpr unsigned MaxAttempts = 4;
24402 // first: the best TreeSize from all prior passes over CandidateVFs; it is
24403 // only updated after a full pass over CandidateVFs.
24404 // second: the best TreeSize from all prior passes, including the current
24405 // one.
24406 llvm::SmallVector<std::pair<unsigned, unsigned>> RangeSizesStorage(
24407 Operands.size(), {1, 1});
24408 // A MutableArrayRef view is used because its `slice` and `drop_front`
24409 const auto RangeSizes = MutableArrayRef(RangeSizesStorage);
24410 DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
24411 auto IsNotVectorized = [](const std::pair<unsigned, unsigned> &P) {
24412 return P.first > 0;
24413 };
24414 auto IsVectorized = [](const std::pair<unsigned, unsigned> &P) {
24415 return P.first == 0;
24416 };
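// Note: an entry whose first member is 0 marks the corresponding store as
// already vectorized; the two predicates above test exactly that.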
24417 auto VFIsProfitable = [](unsigned Size,
24418 const std::pair<unsigned, unsigned> &P) {
24419 return Size >= P.first;
24420 };
24421 auto FirstSizeSame = [](unsigned Size,
24422 const std::pair<unsigned, unsigned> &P) {
24423 return Size == P.first;
24424 };
24425 while (true) {
24426 ++Repeat;
24427 bool RepeatChanged = false;
24428 bool AnyProfitableGraph = false;
24429 for (unsigned VF : CandidateVFs) {
24430 AnyProfitableGraph = false;
24431 unsigned FirstUnvecStore = std::distance(
24432 first: RangeSizes.begin(), last: find_if(Range: RangeSizes, P: IsNotVectorized));
24433
24434 // Form slices of size VF starting from FirstUnvecStore and try to
24435 // vectorize them.
24436 while (FirstUnvecStore < End) {
24437 unsigned FirstVecStore = std::distance(
24438 first: RangeSizes.begin(),
24439 last: find_if(Range: RangeSizes.drop_front(N: FirstUnvecStore), P: IsVectorized));
24440 unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
24441 for (unsigned SliceStartIdx = FirstUnvecStore;
24442 SliceStartIdx + VF <= MaxSliceEnd;) {
24443 if (!checkTreeSizes(Sizes: RangeSizes.slice(N: SliceStartIdx, M: VF))) {
24444 ++SliceStartIdx;
24445 continue;
24446 }
24447 ArrayRef<Value *> Slice =
24448 ArrayRef(Operands).slice(N: SliceStartIdx, M: VF);
24449 assert(all_of(Slice,
24450 [&](Value *V) {
24451 return cast<StoreInst>(V)
24452 ->getValueOperand()
24453 ->getType() ==
24454 cast<StoreInst>(Slice.front())
24455 ->getValueOperand()
24456 ->getType();
24457 }) &&
24458 "Expected all operands of same type.");
24459 if (!NonSchedulable.empty()) {
24460 auto [NonSchedSizeMax, NonSchedSizeMin] =
24461 NonSchedulable.lookup(Val: Slice.front());
24462 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
24463 // VF is too ambitious. Try to vectorize another slice before
24464 // trying a smaller VF.
24465 SliceStartIdx += NonSchedSizeMax;
24466 continue;
24467 }
24468 }
24469 unsigned TreeSize;
24470 std::optional<bool> Res =
24471 vectorizeStoreChain(Chain: Slice, R, Idx: SliceStartIdx, MinVF, Size&: TreeSize);
24472 if (!Res) {
24473 // Update the range of non-schedulable VFs for slices starting
24474 // at SliceStartIdx.
24475 NonSchedulable
24476 .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: VF, y&: VF))
24477 .first->getSecond()
24478 .second = VF;
24479 } else if (*Res) {
24480 // Mark the vectorized stores so that we don't vectorize them
24481 // again.
24482 VectorizedStores.insert_range(R&: Slice);
24483 AnyProfitableGraph = RepeatChanged = Changed = true;
24484 // If we vectorized the initial block, there is no need to try to
24485 // vectorize it again.
24486 for (std::pair<unsigned, unsigned> &P :
24487 RangeSizes.slice(N: SliceStartIdx, M: VF))
24488 P.first = P.second = 0;
24489 if (SliceStartIdx < FirstUnvecStore + MinVF) {
24490 for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
24491 N: FirstUnvecStore, M: SliceStartIdx - FirstUnvecStore))
24492 P.first = P.second = 0;
24493 FirstUnvecStore = SliceStartIdx + VF;
24494 }
24495 if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
24496 for (std::pair<unsigned, unsigned> &P :
24497 RangeSizes.slice(N: SliceStartIdx + VF,
24498 M: MaxSliceEnd - (SliceStartIdx + VF)))
24499 P.first = P.second = 0;
24500 if (MaxSliceEnd == End)
24501 End = SliceStartIdx;
24502 MaxSliceEnd = SliceStartIdx;
24503 }
24504 SliceStartIdx += VF;
24505 continue;
24506 }
24507 if (VF > 2 && Res &&
24508 !all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
24509 P: std::bind(f&: VFIsProfitable, args&: TreeSize, args: _1))) {
24510 SliceStartIdx += VF;
24511 continue;
24512 }
24513 // For very big VFs, check that we are not rebuilding the same
24514 // trees, just with a larger number of elements.
24515 if (VF > MaxRegVF && TreeSize > 1 &&
24516 all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
24517 P: std::bind(f&: FirstSizeSame, args&: TreeSize, args: _1))) {
24518 SliceStartIdx += VF;
24519 while (SliceStartIdx != MaxSliceEnd &&
24520 RangeSizes[SliceStartIdx].first == TreeSize)
24521 ++SliceStartIdx;
24522 continue;
24523 }
24524 if (TreeSize > 1)
24525 for (std::pair<unsigned, unsigned> &P :
24526 RangeSizes.slice(N: SliceStartIdx, M: VF))
24527 P.second = std::max(a: P.second, b: TreeSize);
24528 ++SliceStartIdx;
24529 AnyProfitableGraph = true;
24530 }
24531 if (FirstUnvecStore >= End)
24532 break;
24533 if (MaxSliceEnd - FirstUnvecStore < VF &&
24534 MaxSliceEnd - FirstUnvecStore >= MinVF)
24535 AnyProfitableGraph = true;
24536 FirstUnvecStore = std::distance(
24537 first: RangeSizes.begin(),
24538 last: find_if(Range: RangeSizes.drop_front(N: MaxSliceEnd), P: IsNotVectorized));
24539 }
24540 if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(Value: VF))
24541 break;
24542 // For the MaxRegVF case, save RangeSizes to limit compile time
24543 if (VF == MaxRegVF)
24544 for (std::pair<unsigned, unsigned> &P : RangeSizes)
24545 if (P.first != 0)
24546 P.first = std::max(a: P.second, b: P.first);
24547 }
24548 // All values vectorized - exit.
24549 if (all_of(Range: RangeSizes, P: IsVectorized))
24550 break;
24551 // Check if we tried all attempts or if there is no need for further attempts at all.
24552 if (Repeat >= MaxAttempts ||
24553 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
24554 break;
24555 constexpr unsigned StoresLimit = 64;
24556 const unsigned MaxTotalNum = std::min<unsigned>(
24557 a: Operands.size(),
24558 b: static_cast<unsigned>(
24559 End -
24560 std::distance(first: RangeSizes.begin(),
24561 last: find_if(Range: RangeSizes, P: IsNotVectorized)) +
24562 1));
24563 unsigned VF = bit_ceil(Value: CandidateVFs.front()) * 2;
24564 if (VF > MaxTotalNum || VF >= StoresLimit)
24565 break;
24566 for (std::pair<unsigned, unsigned> &P : RangeSizes) {
24567 if (P.first != 0)
24568 P.first = std::max(a: P.second, b: P.first);
24569 }
24570 // Attempt again to vectorize even larger chains if all previous
24571 // attempts were unsuccessful because of cost issues.
24572 CandidateVFs.clear();
24573 unsigned Limit =
24574 getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: StoreTy, Sz: MaxTotalNum);
24575 if (bit_floor(Value: Limit) == VF && Limit != VF)
24576 CandidateVFs.push_back(Elt: Limit);
24577 CandidateVFs.push_back(Elt: VF);
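// Roughly: if the previous attempt's largest VF was 16, the next attempt
// retries with VF = 32, optionally preceded by a non-power-of-2 full-vector
// element count in the (32, 64) range if the target reports one.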
24578 }
24579 }
24580 };
24581
24582 /// Groups of stores to vectorize
24583 SmallVector<RelatedStoreInsts> SortedStores;
24584
24585 // Inserts the specified store SI with the given index Idx into the set of
24586 // stores. If a store with the same distance has already been found, stop
24587 // the insertion and try to vectorize the stores found so far. If some stores
24588 // from this sequence were not vectorized, try to vectorize them together
24589 // with the new store later. But this logic is applied only to the stores
24590 // that come before the previous store with the same distance.
24591 // Example:
24592 // 1. store x, %p
24593 // 2. store y, %p+1
24594 // 3. store z, %p+2
24595 // 4. store a, %p
24596 // 5. store b, %p+3
24597 // - Scan this from the last to first store. The very first bunch of stores is
24598 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
24599 // vector).
24600 // - The next store in the list - #1 - has the same distance from store #5 as
24601 // the store #4.
24602 // - Try to vectorize sequence of stores 4,2,3,5.
24603 // - If all these stores are vectorized - just drop them.
24604 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
24605 // - Start new stores sequence.
24606 // The new bunch of stores is {1, {1, 0}}.
24607 // - Add the stores from the previous sequence that were not vectorized.
24608 // Here we consider the stores in reversed order, rather than the order in
24609 // which they appear in the IR (Stores are reversed already, see
24610 // vectorizeStoreChains()). Store #3 can be added -> it comes after store #4
24611 // with the same distance as store #1.
24612 // Store #5 cannot be added - it comes before store #4.
24613 // This logic helps to improve compile time: we assume that the stores after
24614 // the previous store with the same distance most likely have memory
24615 // dependencies, so there is no need to waste compile time vectorizing them.
24616 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
24617 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
24618 std::optional<int64_t> PtrDist;
24619 auto *RelatedStores = find_if(
24620 Range&: SortedStores, P: [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
24621 PtrDist = StoreSeq.getPointerDiff(SI&: *SI, DL: *DL, SE&: *SE);
24622 return PtrDist.has_value();
24623 });
24624
24625 // We did not find a comparable store, start a new group.
24626 if (RelatedStores == SortedStores.end()) {
24627 SortedStores.emplace_back(Args&: Idx, Args&: Stores);
24628 return;
24629 }
24630
24631 // If there is already a store in the group with the same PtrDiff, try to
24632 // vectorize the existing instructions before adding the current store.
24633 // Otherwise, insert this store and keep collecting.
24634 if (std::optional<unsigned> PrevInst =
24635 RelatedStores->insertOrLookup(InstrIdx: Idx, PtrDist: *PtrDist)) {
24636 TryToVectorize(RelatedStores->getStores());
24637 RelatedStores->clearVectorizedStores(VectorizedStores);
24638 RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
24639 /*NewBaseInstIdx=*/Idx,
24640 /*DistFromCurBase=*/*PtrDist);
24641 }
24642 };
24643 Type *PrevValTy = nullptr;
24644 for (auto [I, SI] : enumerate(First&: Stores)) {
24645 if (R.isDeleted(I: SI))
24646 continue;
24647 if (!PrevValTy)
24648 PrevValTy = SI->getValueOperand()->getType();
24649 // Check that we do not try to vectorize stores of different types.
24650 if (PrevValTy != SI->getValueOperand()->getType()) {
24651 for (RelatedStoreInsts &StoreSeq : SortedStores)
24652 TryToVectorize(StoreSeq.getStores());
24653 SortedStores.clear();
24654 PrevValTy = SI->getValueOperand()->getType();
24655 }
24656 FillStoresSet(I, SI);
24657 }
24658
24659 // Final vectorization attempt.
24660 for (RelatedStoreInsts &StoreSeq : SortedStores)
24661 TryToVectorize(StoreSeq.getStores());
24662
24663 return Changed;
24664}
24665
24666void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
24667 // Initialize the collections. We will make a single pass over the block.
24668 Stores.clear();
24669 GEPs.clear();
24670
24671 // Visit the store and getelementptr instructions in BB and organize them in
24672 // Stores and GEPs according to the underlying objects of their pointer
24673 // operands.
24674 for (Instruction &I : *BB) {
24675 // Ignore store instructions that are not simple (e.g. volatile) or whose
24676 // stored value type is not a valid element type.
24677 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
24678 if (!SI->isSimple())
24679 continue;
24680 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
24681 continue;
24682 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
24683 }
24684
24685 // Ignore getelementptr instructions that have more than one index, a
24686 // constant index, an index with an invalid element type, or a vector
24687 // result type.
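// E.g. `%g = getelementptr i32, ptr %p, i64 %i` is collected under GEPs[%p],
// while a GEP with a constant index, such as `getelementptr i32, ptr %p, i64 4`,
// is skipped.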
24688 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
24689 if (GEP->getNumIndices() != 1)
24690 continue;
24691 Value *Idx = GEP->idx_begin()->get();
24692 if (isa<Constant>(Val: Idx))
24693 continue;
24694 if (!isValidElementType(Ty: Idx->getType()))
24695 continue;
24696 if (GEP->getType()->isVectorTy())
24697 continue;
24698 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
24699 }
24700 }
24701}
24702
24703bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
24704 bool MaxVFOnly) {
24705 if (VL.size() < 2)
24706 return false;
24707
24708 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
24709 << VL.size() << ".\n");
24710
24711 // Check that all of the parts are instructions of the same type;
24712 // we permit an alternate opcode via InstructionsState.
24713 InstructionsState S = getSameOpcode(VL, TLI: *TLI);
24714 if (!S)
24715 return false;
24716
24717 Instruction *I0 = S.getMainOp();
24718 // Make sure invalid types (including vector type) are rejected before
24719 // determining vectorization factor for scalar instructions.
24720 for (Value *V : VL) {
24721 Type *Ty = V->getType();
24722 if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
24723 // NOTE: the following will give the user an internal LLVM type name, which
24724 // may not be useful.
24725 R.getORE()->emit(RemarkBuilder: [&]() {
24726 std::string TypeStr;
24727 llvm::raw_string_ostream OS(TypeStr);
24728 Ty->print(O&: OS);
24729 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
24730 << "Cannot SLP vectorize list: type "
24731 << TypeStr + " is unsupported by vectorizer";
24732 });
24733 return false;
24734 }
24735 }
24736
24737 Type *ScalarTy = getValueType(V: VL[0]);
24738 unsigned Sz = R.getVectorElementSize(V: I0);
24739 unsigned MinVF = R.getMinVF(Sz);
24740 unsigned MaxVF = std::max<unsigned>(
24741 a: getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: ScalarTy, Sz: VL.size()), b: MinVF);
24742 MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
24743 if (MaxVF < 2) {
24744 R.getORE()->emit(RemarkBuilder: [&]() {
24745 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
24746 << "Cannot SLP vectorize list: vectorization factor "
24747 << "less than 2 is not supported";
24748 });
24749 return false;
24750 }
24751
24752 bool Changed = false;
24753 bool CandidateFound = false;
24754 InstructionCost MinCost = SLPCostThreshold.getValue();
24755
24756 unsigned NextInst = 0, MaxInst = VL.size();
24757 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
24758 VF = getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: I0->getType(), Sz: VF - 1)) {
24759 // No actual vectorization should happen if the number of parts is the same
24760 // as the provided vectorization factor (i.e. the scalar type is used for
24761 // vector code during codegen).
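// E.g. a <4 x i64> request on a target whose widest legal vector is 64 bits
// would be split into 4 scalar parts, so nothing would be gained.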
24762 auto *VecTy = getWidenedType(ScalarTy, VF);
24763 if (TTI->getNumberOfParts(Tp: VecTy) == VF)
24764 continue;
24765 for (unsigned I = NextInst; I < MaxInst; ++I) {
24766 unsigned ActualVF = std::min(a: MaxInst - I, b: VF);
24767
24768 if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ScalarTy, Sz: ActualVF))
24769 continue;
24770
24771 if (MaxVFOnly && ActualVF < MaxVF)
24772 break;
24773 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
24774 break;
24775
24776 SmallVector<Value *> Ops(ActualVF, nullptr);
24777 unsigned Idx = 0;
24778 for (Value *V : VL.drop_front(N: I)) {
24779 // Check that a previous iteration of this loop did not delete the
24780 // Value.
24781 if (auto *Inst = dyn_cast<Instruction>(Val: V);
24782 !Inst || !R.isDeleted(I: Inst)) {
24783 Ops[Idx] = V;
24784 ++Idx;
24785 if (Idx == ActualVF)
24786 break;
24787 }
24788 }
24789 // Not enough vectorizable instructions - exit.
24790 if (Idx != ActualVF)
24791 break;
24792
24793 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
24794 << "\n");
24795
24796 R.buildTree(Roots: Ops);
24797 if (R.isTreeTinyAndNotFullyVectorizable())
24798 continue;
24799 if (R.isProfitableToReorder()) {
24800 R.reorderTopToBottom();
24801 R.reorderBottomToTop(IgnoreReorder: !isa<InsertElementInst>(Val: Ops.front()));
24802 }
24803 R.transformNodes();
24804 R.computeMinimumValueSizes();
24805 InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
24806 R.buildExternalUses();
24807
24808 InstructionCost Cost = R.getTreeCost(TreeCost);
24809 CandidateFound = true;
24810 MinCost = std::min(a: MinCost, b: Cost);
24811
24812 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
24813 << " for VF=" << ActualVF << "\n");
24814 if (Cost < -SLPCostThreshold) {
24815 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
24816 R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "VectorizedList",
24817 cast<Instruction>(Val: Ops[0]))
24818 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
24819 << " and with tree size "
24820 << ore::NV("TreeSize", R.getTreeSize()));
24821
24822 R.vectorizeTree();
24823 // Move to the next bundle.
24824 I += VF - 1;
24825 NextInst = I + 1;
24826 Changed = true;
24827 }
24828 }
24829 }
24830
24831 if (!Changed && CandidateFound) {
24832 R.getORE()->emit(RemarkBuilder: [&]() {
24833 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
24834 << "List vectorization was possible but not beneficial with cost "
24835 << ore::NV("Cost", MinCost) << " >= "
24836 << ore::NV("Threshold", -SLPCostThreshold);
24837 });
24838 } else if (!Changed) {
24839 R.getORE()->emit(RemarkBuilder: [&]() {
24840 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
24841 << "Cannot SLP vectorize list: vectorization was impossible"
24842 << " with available vectorization factors";
24843 });
24844 }
24845 return Changed;
24846}
24847
24848namespace {
24849
24850/// Model horizontal reductions.
24851///
24852/// A horizontal reduction is a tree of reduction instructions that has values
24853/// that can be put into a vector as its leaves. For example:
24854///
24855/// mul mul mul mul
24856/// \ / \ /
24857/// + +
24858/// \ /
24859/// +
24860/// This tree has "mul" as its leaf values and "+" as its reduction
24861/// instructions. A reduction can feed into a store or a binary operation
24862/// feeding a phi.
24863/// ...
24864/// \ /
24865/// +
24866/// |
24867/// phi +=
24868///
24869/// Or:
24870/// ...
24871/// \ /
24872/// +
24873/// |
24874/// *p =
24875///
24876class HorizontalReduction {
24877 using ReductionOpsType = SmallVector<Value *, 16>;
24878 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
24879 ReductionOpsListType ReductionOps;
24880 /// List of possibly reduced values.
24881 SmallVector<SmallVector<Value *>> ReducedVals;
24882 /// Maps reduced value to the corresponding reduction operation.
24883 SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
24884 WeakTrackingVH ReductionRoot;
24885 /// The type of reduction operation.
24886 RecurKind RdxKind;
24887 /// Checks if the optimization of original scalar identity operations on
24888 /// matched horizontal reductions is enabled and allowed.
24889 bool IsSupportedHorRdxIdentityOp = false;
24890 /// The minimum number of the reduced values.
24891 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
24892 /// Contains vector values for reduction including their scale factor and
24893 /// signedness.
24894 SmallVector<std::tuple<Value *, unsigned, bool>> VectorValuesAndScales;
24895
24896 static bool isCmpSelMinMax(Instruction *I) {
24897 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
24898 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
24899 }
24900
24901 // And/or are potentially poison-safe logical patterns like:
24902 // select x, y, false
24903 // select x, true, y
24904 static bool isBoolLogicOp(Instruction *I) {
24905 return isa<SelectInst>(Val: I) &&
24906 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
24907 }
24908
24909 /// Checks if instruction is associative and can be vectorized.
24910 static bool isVectorizable(RecurKind Kind, Instruction *I,
24911 bool TwoElementReduction = false) {
24912 if (Kind == RecurKind::None)
24913 return false;
24914
24915 // Integer ops that map to select instructions or intrinsics are fine.
24916 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
24917 isBoolLogicOp(I))
24918 return true;
24919
24920 // No need to check for associativity if there are only 2 reduced values.
24921 if (TwoElementReduction)
24922 return true;
24923
24924 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
24925 // FP min/max are associative except for NaN and -0.0. We do not
24926 // have to rule out -0.0 here because the intrinsic semantics do not
24927 // specify a fixed result for it.
24928 return I->getFastMathFlags().noNaNs();
24929 }
24930
24931 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
24932 return true;
24933
24934 return I->isAssociative();
24935 }
24936
24937 static Value *getRdxOperand(Instruction *I, unsigned Index) {
24938 // Poison-safe 'or' takes the form: select X, true, Y
24939 // To make that work with the normal operand processing, we skip the
24940 // true value operand.
24941 // TODO: Change the code and data structures to handle this without a hack.
24942 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
24943 return I->getOperand(i: 2);
24944 return I->getOperand(i: Index);
24945 }
24946
24947 /// Creates reduction operation with the current opcode.
24948 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
24949 Value *RHS, const Twine &Name, bool UseSelect) {
24950 Type *OpTy = LHS->getType();
24951 assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
24952 switch (Kind) {
24953 case RecurKind::Or: {
24954 if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
24955 return Builder.CreateSelectWithUnknownProfile(
24956 C: LHS, True: ConstantInt::getAllOnesValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
24957 False: RHS, DEBUG_TYPE, Name);
24958 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24959 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24960 Name);
24961 }
24962 case RecurKind::And: {
24963 if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
24964 return Builder.CreateSelectWithUnknownProfile(
24965 C: LHS, True: RHS,
24966 False: ConstantInt::getNullValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
24967 DEBUG_TYPE, Name);
24968 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24969 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24970 Name);
24971 }
24972 case RecurKind::Add:
24973 case RecurKind::Mul:
24974 case RecurKind::Xor:
24975 case RecurKind::FAdd:
24976 case RecurKind::FMul: {
24977 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
24978 return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
24979 Name);
24980 }
24981 case RecurKind::SMax:
24982 case RecurKind::SMin:
24983 case RecurKind::UMax:
24984 case RecurKind::UMin:
24985 if (UseSelect) {
24986 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(RK: Kind);
24987 Value *Cmp = Builder.CreateICmp(P: Pred, LHS, RHS, Name);
24988 return Builder.CreateSelectWithUnknownProfile(C: Cmp, True: LHS, False: RHS, DEBUG_TYPE,
24989 Name);
24990 }
24991 [[fallthrough]];
24992 case RecurKind::FMax:
24993 case RecurKind::FMin:
24994 case RecurKind::FMaximum:
24995 case RecurKind::FMinimum:
24996 case RecurKind::FMaximumNum:
24997 case RecurKind::FMinimumNum: {
24998 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RK: Kind);
24999 return Builder.CreateBinaryIntrinsic(ID: Id, LHS, RHS);
25000 }
25001 default:
25002 llvm_unreachable("Unknown reduction operation.");
25003 }
25004 }
25005
25006 /// Creates reduction operation with the current opcode with the IR flags
25007 /// from \p ReductionOps, dropping nuw/nsw flags.
25008 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
25009 Value *RHS, const Twine &Name,
25010 const ReductionOpsListType &ReductionOps) {
25011 bool UseSelect = ReductionOps.size() == 2 ||
25012 // Logical or/and.
25013 (ReductionOps.size() == 1 &&
25014 any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
25015 assert((!UseSelect || ReductionOps.size() != 2 ||
25016 isa<SelectInst>(ReductionOps[1][0])) &&
25017 "Expected cmp + select pairs for reduction");
25018 Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
25019 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
25020 if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
25021 propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
25022 /*IncludeWrapFlags=*/false);
25023 propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
25024 /*IncludeWrapFlags=*/false);
25025 return Op;
25026 }
25027 }
25028 propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
25029 return Op;
25030 }
25031
25032public:
25033 static RecurKind getRdxKind(Value *V) {
25034 auto *I = dyn_cast<Instruction>(Val: V);
25035 if (!I)
25036 return RecurKind::None;
25037 if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
25038 return RecurKind::Add;
25039 if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
25040 return RecurKind::Mul;
25041 if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
25042 match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
25043 return RecurKind::And;
25044 if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
25045 match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
25046 return RecurKind::Or;
25047 if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
25048 return RecurKind::Xor;
25049 if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
25050 return RecurKind::FAdd;
25051 if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
25052 return RecurKind::FMul;
25053
25054 if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
25055 return RecurKind::FMax;
25056 if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
25057 return RecurKind::FMin;
25058
25059 if (match(V: I, P: m_FMaximum(Op0: m_Value(), Op1: m_Value())))
25060 return RecurKind::FMaximum;
25061 if (match(V: I, P: m_FMinimum(Op0: m_Value(), Op1: m_Value())))
25062 return RecurKind::FMinimum;
25063 // This matches either cmp+select or intrinsics. SLP is expected to handle
25064 // either form.
25065 // TODO: If we are canonicalizing to intrinsics, we can remove several
25066 // special-case paths that deal with selects.
25067 if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
25068 return RecurKind::SMax;
25069 if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
25070 return RecurKind::SMin;
25071 if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
25072 return RecurKind::UMax;
25073 if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
25074 return RecurKind::UMin;
25075
25076 if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
25077 // Try harder: look for a min/max pattern based on instructions producing
25078 // the same values, such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
25079 // During the intermediate stages of SLP, it's very common to have
25080 // a pattern like this (since optimizeGatherSequence is run only once
25081 // at the end):
25082 // %1 = extractelement <2 x i32> %a, i32 0
25083 // %2 = extractelement <2 x i32> %a, i32 1
25084 // %cond = icmp sgt i32 %1, %2
25085 // %3 = extractelement <2 x i32> %a, i32 0
25086 // %4 = extractelement <2 x i32> %a, i32 1
25087 // %select = select i1 %cond, i32 %3, i32 %4
25088 CmpPredicate Pred;
25089 Instruction *L1;
25090 Instruction *L2;
25091
25092 Value *LHS = Select->getTrueValue();
25093 Value *RHS = Select->getFalseValue();
25094 Value *Cond = Select->getCondition();
25095
25096 // TODO: Support inverse predicates.
25097 if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
25098 if (!isa<ExtractElementInst>(Val: RHS) ||
25099 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
25100 return RecurKind::None;
25101 } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
25102 if (!isa<ExtractElementInst>(Val: LHS) ||
25103 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
25104 return RecurKind::None;
25105 } else {
25106 if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
25107 return RecurKind::None;
25108 if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
25109 !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
25110 !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
25111 return RecurKind::None;
25112 }
25113
25114 switch (Pred) {
25115 default:
25116 return RecurKind::None;
25117 case CmpInst::ICMP_SGT:
25118 case CmpInst::ICMP_SGE:
25119 return RecurKind::SMax;
25120 case CmpInst::ICMP_SLT:
25121 case CmpInst::ICMP_SLE:
25122 return RecurKind::SMin;
25123 case CmpInst::ICMP_UGT:
25124 case CmpInst::ICMP_UGE:
25125 return RecurKind::UMax;
25126 case CmpInst::ICMP_ULT:
25127 case CmpInst::ICMP_ULE:
25128 return RecurKind::UMin;
25129 }
25130 }
25131 return RecurKind::None;
25132 }
25133
25134 /// Get the index of the first operand.
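/// For a cmp+select min/max (select (icmp ...), %a, %b), operand 0 is the
/// condition, so the reduced operands start at index 1; otherwise at 0.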
25135 static unsigned getFirstOperandIndex(Instruction *I) {
25136 return isCmpSelMinMax(I) ? 1 : 0;
25137 }
25138
25139private:
25140 /// Total number of operands in the reduction operation.
25141 static unsigned getNumberOfOperands(Instruction *I) {
25142 return isCmpSelMinMax(I) ? 3 : 2;
25143 }
25144
25145 /// Checks if the instruction is in basic block \p BB.
25146 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
25147 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
25148 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
25149 auto *Sel = cast<SelectInst>(Val: I);
25150 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
25151 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
25152 }
25153 return I->getParent() == BB;
25154 }
25155
25156 /// Expected number of uses for reduction operations/reduced values.
25157 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
25158 if (IsCmpSelMinMax) {
25159 // SelectInst must be used twice while the condition op must have a
25160 // single use only.
25161 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
25162 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
25163 return I->hasNUses(N: 2);
25164 }
25165
25166 // Arithmetic reduction operation must be used once only.
25167 return I->hasOneUse();
25168 }
25169
25170 /// Initializes the list of reduction operations.
25171 void initReductionOps(Instruction *I) {
25172 if (isCmpSelMinMax(I))
25173 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
25174 else
25175 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
25176 }
25177
25178 /// Add all reduction operations for the reduction instruction \p I.
25179 void addReductionOps(Instruction *I) {
25180 if (isCmpSelMinMax(I)) {
25181 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
25182 ReductionOps[1].emplace_back(Args&: I);
25183 } else {
25184 ReductionOps[0].emplace_back(Args&: I);
25185 }
25186 }
25187
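/// Returns true if \p Data is worth keeping as a reduction group: more than
/// one value, a constant, or a non-load instruction that is valid for
/// alternate-opcode vectorization.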
25188 static bool isGoodForReduction(ArrayRef<Value *> Data) {
25189 int Sz = Data.size();
25190 auto *I = dyn_cast<Instruction>(Val: Data.front());
25191 return Sz > 1 || isConstant(V: Data.front()) ||
25192 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
25193 }
25194
25195public:
25196 HorizontalReduction() = default;
25197 HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
25198 : ReductionRoot(I), ReductionLimit(2) {
25199 RdxKind = HorizontalReduction::getRdxKind(V: I);
25200 ReductionOps.emplace_back().push_back(Elt: I);
25201 ReducedVals.emplace_back().assign(in_start: Ops.begin(), in_end: Ops.end());
25202 for (Value *V : Ops)
25203 ReducedValsToOps[V].push_back(Elt: I);
25204 }
25205
25206 bool matchReductionForOperands() const {
25207 // Analyze "regular" integer/FP types for reductions - no target-specific
25208 // types or pointers.
25209 assert(ReductionRoot && "Reduction root is not set!");
25210 if (!isVectorizable(Kind: RdxKind, I: cast<Instruction>(Val: ReductionRoot),
25211 TwoElementReduction: all_of(Range: ReducedVals, P: [](ArrayRef<Value *> Ops) {
25212 return Ops.size() == 2;
25213 })))
25214 return false;
25215
25216 return true;
25217 }
25218
25219 /// Try to find a reduction tree.
25220 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
25221 ScalarEvolution &SE, const DataLayout &DL,
25222 const TargetLibraryInfo &TLI) {
25223 RdxKind = HorizontalReduction::getRdxKind(V: Root);
25224 if (!isVectorizable(Kind: RdxKind, I: Root))
25225 return false;
25226
25227 // Analyze "regular" integer/FP types for reductions - no target-specific
25228 // types or pointers.
25229 Type *Ty = Root->getType();
25230 if (!isValidElementType(Ty) || Ty->isPointerTy())
25231 return false;
25232
25233 // Though the ultimate reduction may have multiple uses, its condition must
25234 // have only a single use.
25235 if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
25236 if (!Sel->getCondition()->hasOneUse())
25237 return false;
25238
25239 ReductionRoot = Root;
25240
25241 // Iterate through all the operands of the possible reduction tree and
25242 // gather all the reduced values, sorting them by their value id.
25243 BasicBlock *BB = Root->getParent();
25244 bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
25245 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
25246 1, std::make_pair(x&: Root, y: 0));
25247 // Checks if the operands of the \p TreeN instruction are also reduction
25248 // operations or should be treated as reduced values or an extra argument,
25249 // which is not part of the reduction.
25250 auto CheckOperands = [&](Instruction *TreeN,
25251 SmallVectorImpl<Value *> &PossibleReducedVals,
25252 SmallVectorImpl<Instruction *> &ReductionOps,
25253 unsigned Level) {
25254 for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
25255 End: getNumberOfOperands(I: TreeN)))) {
25256 Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
25257 ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
25258 auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
25259 // If the edge is not an instruction, or it differs from the main
25260 // reduction opcode or has too many uses, treat it as a possible reduced
25261 // value. Also, do not try to reduce constant values if the operation is
25262 // not foldable.
25263 if (!EdgeInst || Level > RecursionMaxDepth ||
25264 getRdxKind(V: EdgeInst) != RdxKind ||
25265 IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
25266 !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
25267 !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
25268 (R.isAnalyzedReductionRoot(I: EdgeInst) &&
25269 all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
25270 PossibleReducedVals.push_back(Elt: EdgeVal);
25271 continue;
25272 }
25273 ReductionOps.push_back(Elt: EdgeInst);
25274 }
25275 };
25276 // Try to regroup the reduced values so that it becomes more profitable to
25277 // reduce them. Values are grouped by their value ids, instructions by their
25278 // opcode and/or alternate opcode, with extra analysis for loads (grouping
25279 // them by the distance between pointers) and cmp instructions (grouping
25280 // them by the predicate).
25281 SmallMapVector<
25282 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
25283 8>
25284 PossibleReducedVals;
25285 initReductionOps(I: Root);
25286 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
25287 SmallSet<size_t, 2> LoadKeyUsed;
25288
25289 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
25290 Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
25291 Value *Ptr =
25292 getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
25293 if (!LoadKeyUsed.insert(V: Key).second) {
25294 auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
25295 if (LIt != LoadsMap.end()) {
25296 for (LoadInst *RLI : LIt->second) {
25297 if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
25298 ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
25299 /*StrictCheck=*/true))
25300 return hash_value(ptr: RLI->getPointerOperand());
25301 }
25302 for (LoadInst *RLI : LIt->second) {
25303 if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
25304 Ptr2: LI->getPointerOperand(), TLI)) {
25305 hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
25306 return SubKey;
25307 }
25308 }
25309 if (LIt->second.size() > 2) {
25310 hash_code SubKey =
25311 hash_value(ptr: LIt->second.back()->getPointerOperand());
25312 return SubKey;
25313 }
25314 }
25315 }
25316 LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr))
25317 .first->second.push_back(Elt: LI);
25318 return hash_value(ptr: LI->getPointerOperand());
25319 };
25320
25321 while (!Worklist.empty()) {
25322 auto [TreeN, Level] = Worklist.pop_back_val();
25323 SmallVector<Value *> PossibleRedVals;
25324 SmallVector<Instruction *> PossibleReductionOps;
25325 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
25326 addReductionOps(I: TreeN);
25327 // Add reduction values. The values are sorted for better vectorization
25328 // results.
25329 for (Value *V : PossibleRedVals) {
25330 size_t Key, Idx;
25331 std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
25332 /*AllowAlternate=*/false);
25333 ++PossibleReducedVals[Key][Idx].try_emplace(Key: V, Args: 0).first->second;
25334 }
25335 for (Instruction *I : reverse(C&: PossibleReductionOps))
25336 Worklist.emplace_back(Args&: I, Args: I->getParent() == BB ? 0 : Level + 1);
25337 }
25338 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
25339 // Sort values by the total number of value kinds to start the reduction
25340 // from the longest possible reduced-value sequences.
25341 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
25342 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
25343 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
25344 for (auto &Slice : PossibleRedVals) {
25345 PossibleRedValsVect.emplace_back();
25346 auto RedValsVect = Slice.second.takeVector();
25347 stable_sort(Range&: RedValsVect, C: llvm::less_second());
25348 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
25349 PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
25350 }
25351 stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
25352 return P1.size() > P2.size();
25353 });
25354 bool First = true;
25355 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
25356 if (First) {
25357 First = false;
25358 ReducedVals.emplace_back();
25359 } else if (!isGoodForReduction(Data)) {
25360 auto *LI = dyn_cast<LoadInst>(Val: Data.front());
25361 auto *LastLI = dyn_cast<LoadInst>(Val: ReducedVals.back().front());
25362 if (!LI || !LastLI ||
25363 getUnderlyingObject(V: LI->getPointerOperand()) !=
25364 getUnderlyingObject(V: LastLI->getPointerOperand()))
25365 ReducedVals.emplace_back();
25366 }
25367 ReducedVals.back().append(in_start: Data.rbegin(), in_end: Data.rend());
25368 }
25369 }
25370 // Sort the reduced values by the number of same/alternate opcodes and/or
25371 // pointer operands.
25372 stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
25373 return P1.size() > P2.size();
25374 });
25375 return true;
25376 }
25377
25378 /// Attempt to vectorize the tree found by matchAssociativeReduction.
25379 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
25380 const TargetLibraryInfo &TLI, AssumptionCache *AC,
25381 DominatorTree &DT) {
25382 constexpr unsigned RegMaxNumber = 4;
25383 constexpr unsigned RedValsMaxNumber = 128;
25384 // If there are a sufficient number of reduction values, reduce
25385 // to a nearby power-of-2. We can safely generate oversized
25386 // vectors and rely on the backend to split them to legal sizes.
25387 if (unsigned NumReducedVals = std::accumulate(
25388 first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
25389 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
25390 if (!isGoodForReduction(Data: Vals))
25391 return Num;
25392 return Num + Vals.size();
25393 });
25394 NumReducedVals < ReductionLimit &&
25395 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
25396 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
25397 })) {
25398 for (ReductionOpsType &RdxOps : ReductionOps)
25399 for (Value *RdxOp : RdxOps)
25400 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
25401 return nullptr;
25402 }
25403
25404 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
25405 TargetFolder(DL));
25406 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
25407
25408 // Track the reduced values in case they are replaced by extractelement
25409 // instructions because of the vectorization.
25410 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
25411 ReducedVals.front().size());
25412
25413 // The compare instruction of a min/max is the insertion point for new
25414 // instructions and may be replaced with a new compare instruction.
25415 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
25416 assert(isa<SelectInst>(RdxRootInst) &&
25417 "Expected min/max reduction to have select root instruction");
25418 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
25419 assert(isa<Instruction>(ScalarCond) &&
25420 "Expected min/max reduction to have compare condition");
25421 return cast<Instruction>(Val: ScalarCond);
25422 };
25423
25424 bool AnyBoolLogicOp = any_of(Range&: ReductionOps.back(), P: [](Value *V) {
25425 return isBoolLogicOp(I: cast<Instruction>(Val: V));
25426 });
25427 // Return new VectorizedTree, based on previous value.
25428 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
25429 if (VectorizedTree) {
25430 // Update the final value in the reduction.
25431 Builder.SetCurrentDebugLocation(
25432 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
25433 if (AnyBoolLogicOp) {
25434 auto It = ReducedValsToOps.find(Val: VectorizedTree);
25435 auto It1 = ReducedValsToOps.find(Val: Res);
25436 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
25437 isGuaranteedNotToBePoison(V: VectorizedTree, AC) ||
25438 (It != ReducedValsToOps.end() &&
25439 any_of(Range&: It->getSecond(), P: [&](Instruction *I) {
25440 return isBoolLogicOp(I) &&
25441 getRdxOperand(I, Index: 0) == VectorizedTree;
25442 }))) {
25443 ;
25444 } else if (isGuaranteedNotToBePoison(V: Res, AC) ||
25445 (It1 != ReducedValsToOps.end() &&
25446 any_of(Range&: It1->getSecond(), P: [&](Instruction *I) {
25447 return isBoolLogicOp(I) && getRdxOperand(I, Index: 0) == Res;
25448 }))) {
25449 std::swap(a&: VectorizedTree, b&: Res);
25450 } else {
25451 VectorizedTree = Builder.CreateFreeze(V: VectorizedTree);
25452 }
25453 }
25454
25455 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
25456 ReductionOps);
25457 }
25458 // Initialize the final value in the reduction.
25459 return Res;
25460 };
25461 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25462 ReductionOps.front().size());
25463 for (ReductionOpsType &RdxOps : ReductionOps)
25464 for (Value *RdxOp : RdxOps) {
25465 if (!RdxOp)
25466 continue;
25467 IgnoreList.insert(V: RdxOp);
25468 }
25469 // Intersect the fast-math-flags from all reduction operations.
25470 FastMathFlags RdxFMF;
25471 RdxFMF.set();
25472 for (Value *U : IgnoreList)
25473 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
25474 RdxFMF &= FPMO->getFastMathFlags();
25475 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
25476
25477 // Need to track reduced values, as they may be changed during
25478 // vectorization of subvectors.
25479 for (ArrayRef<Value *> Candidates : ReducedVals)
25480 for (Value *V : Candidates)
25481 TrackedVals.try_emplace(Key: V, Args&: V);
25482
25483 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25484 Value *V) -> unsigned & {
25485 auto *It = MV.find(Key: V);
25486 assert(It != MV.end() && "Unable to find given key.");
25487 return It->second;
25488 };
25489
25490 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
25491 // List of the values that were reduced in other trees as part of gather
25492 // nodes and thus require an extract if fully vectorized in other trees.
25493 SmallPtrSet<Value *, 4> RequiredExtract;
25494 WeakTrackingVH VectorizedTree = nullptr;
25495 bool CheckForReusedReductionOps = false;
25496 // Try to vectorize elements based on their type.
25497 SmallVector<InstructionsState> States;
25498 SmallVector<SmallVector<Value *>> LocalReducedVals;
25499 // Try to merge consecutive reduced values into a single vectorizable group
25500 // and check if they can be vectorized as copyables.
25501 for (ArrayRef<Value *> RV : ReducedVals) {
25502 // Loads are not very compatible with undefs.
25503 if (isa<UndefValue>(Val: RV.front()) &&
25504 (States.empty() || !States.back() ||
25505 States.back().getOpcode() == Instruction::Load)) {
25506 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25507 States.push_back(Elt: InstructionsState::invalid());
25508 continue;
25509 }
25510 if (!LocalReducedVals.empty() &&
25511 isa<UndefValue>(Val: LocalReducedVals.back().front()) &&
25512 isa<LoadInst>(Val: RV.front())) {
25513 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25514 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
25515 continue;
25516 }
25517 SmallVector<Value *> Ops;
25518 if (!LocalReducedVals.empty())
25519 Ops = LocalReducedVals.back();
25520 Ops.append(in_start: RV.begin(), in_end: RV.end());
25521 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
25522 InstructionsState OpS =
25523 Analysis.buildInstructionsState(VL: Ops, R: V, TryCopyableElementsVectorization: VectorizeCopyableElements);
25524 if (LocalReducedVals.empty()) {
25525 LocalReducedVals.push_back(Elt: Ops);
25526 States.push_back(Elt: OpS);
25527 continue;
25528 }
25529 if (OpS) {
25530 LocalReducedVals.back().swap(RHS&: Ops);
25531 States.back() = OpS;
25532 continue;
25533 }
25534 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25535 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
25536 }
25537 ReducedVals.swap(RHS&: LocalReducedVals);
25538 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
25539 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
25540 InstructionsState S = States[I];
25541 SmallVector<Value *> Candidates;
25542 Candidates.reserve(N: 2 * OrigReducedVals.size());
25543 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
25544 for (Value *ReducedVal : OrigReducedVals) {
25545 Value *RdxVal = TrackedVals.at(Val: ReducedVal);
25546 // Check if the reduction value was not overridden by the extractelement
25547 // instruction because of the vectorization, and exclude it if it is not
25548 // compatible with other values.
25549 // Also check if the instruction was folded to a constant/other value.
25550 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
25551 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
25552 (!S || (!S.getMatchingMainOpOrAltOp(I: Inst) &&
25553 !S.isCopyableElement(V: Inst)))) ||
25554 (S && !Inst && !isa<PoisonValue>(Val: RdxVal) &&
25555 !S.isCopyableElement(V: RdxVal)))
25556 continue;
25557 Candidates.push_back(Elt: RdxVal);
25558 TrackedToOrig.try_emplace(Key: RdxVal, Args&: ReducedVal);
25559 }
25560 bool ShuffledExtracts = false;
25561 // Try to handle shuffled extractelements.
25562 if (S && S.getOpcode() == Instruction::ExtractElement &&
25563 !S.isAltShuffle() && I + 1 < E) {
25564 SmallVector<Value *> CommonCandidates(Candidates);
25565 for (Value *RV : ReducedVals[I + 1]) {
25566 Value *RdxVal = TrackedVals.at(Val: RV);
25567 // Check if the reduction value was not overridden by the
25568 // extractelement instruction because of the vectorization, and
25569 // exclude it if it is not compatible with other values.
25570 auto *Inst = dyn_cast<ExtractElementInst>(Val: RdxVal);
25571 if (!Inst)
25572 continue;
25573 CommonCandidates.push_back(Elt: RdxVal);
25574 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
25575 }
25576 SmallVector<int> Mask;
25577 if (isFixedVectorShuffle(VL: CommonCandidates, Mask, AC)) {
25578 ++I;
25579 Candidates.swap(RHS&: CommonCandidates);
25580 ShuffledExtracts = true;
25581 }
25582 }
25583
25584 // Emit code for constant values.
25585 if (Candidates.size() > 1 && allConstant(VL: Candidates)) {
25586 Value *Res = Candidates.front();
25587 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
25588 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
25589 for (Value *VC : ArrayRef(Candidates).drop_front()) {
25590 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
25591 Value *OrigV = TrackedToOrig.at(Val: VC);
25592 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
25593 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
25594 V.analyzedReductionRoot(I: ResI);
25595 }
25596 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
25597 continue;
25598 }
25599
25600 unsigned NumReducedVals = Candidates.size();
25601 if (NumReducedVals < ReductionLimit &&
25602 (NumReducedVals < 2 || !isSplat(VL: Candidates)))
25603 continue;
25604
25605 // Check if we support processing of repeated scalar values (optimization
25606 // of original scalar identity operations on matched horizontal reductions).
25607 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
25608 RdxKind != RecurKind::FMul &&
25609 RdxKind != RecurKind::FMulAdd;
25610 // Gather same values.
25611 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
25612 if (IsSupportedHorRdxIdentityOp)
25613 for (Value *V : Candidates) {
25614 Value *OrigV = TrackedToOrig.at(Val: V);
25615 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
25616 }
25617 // Used to check if the reduced values are used the same number of times.
25618 // In this case the compiler may produce better code. E.g. if reduced values
25619 // are aabbccdd (8 x values), then the first node of the tree will have a
25620 // node for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
25621 // Plus, the final reduction will be performed on <8 x aabbccdd>.
25622 // Instead, the compiler may build a <4 x abcd> tree immediately, + reduction
25623 // (4 x abcd) * 2.
25624 // Currently it only handles add/fadd/xor; and/or/min/max do not require
25625 // this analysis, while other operations may require an extra estimation of
25626 // the profitability.
25627 bool SameScaleFactor = false;
25628 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
25629 SameValuesCounter.size() != Candidates.size();
25630 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
25631 if (OptReusedScalars) {
25632 SameScaleFactor =
25633 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
25634 RdxKind == RecurKind::Xor) &&
25635 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
25636 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
25637 return P.second == SameValuesCounter.front().second;
25638 });
25639 Candidates.resize(N: SameValuesCounter.size());
25640 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
25641 F: [&](const auto &P) { return TrackedVals.at(P.first); });
25642 NumReducedVals = Candidates.size();
25643 // Have a reduction of the same element.
25644 if (NumReducedVals == 1) {
25645 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
25646 unsigned Cnt = At(SameValuesCounter, OrigV);
25647 Value *RedVal =
25648 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
25649 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25650 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
25651 ExternallyUsedValues.insert(V: OrigV);
25652 continue;
25653 }
25654 }
25655
25656 unsigned MaxVecRegSize = V.getMaxVecRegSize();
25657 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
25658 const unsigned MaxElts = std::clamp<unsigned>(
25659 val: llvm::bit_floor(Value: MaxVecRegSize / EltSize), lo: RedValsMaxNumber,
25660 hi: RegMaxNumber * RedValsMaxNumber);
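// E.g. with 128-bit registers and 32-bit elements this clamps bit_floor(4)
// up to RedValsMaxNumber (128), so up to 128 values may be reduced at once;
// oversized vectors are later split to legal sizes by the backend.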
25661
25662 unsigned ReduxWidth = NumReducedVals;
25663 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
25664 unsigned NumParts, NumRegs;
25665 Type *ScalarTy = Candidates.front()->getType();
25666 ReduxWidth =
25667 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: ReduxWidth);
25668 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
25669 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
25670 NumRegs =
25671 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
25672 while (NumParts > NumRegs) {
25673 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
25674 ReduxWidth = bit_floor(Value: ReduxWidth - 1);
25675 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
25676 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
25677 NumRegs =
25678 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
25679 }
25680 if (NumParts > NumRegs / 2)
25681 ReduxWidth = bit_floor(Value: ReduxWidth);
25682 return ReduxWidth;
25683 };
25684 if (!VectorizeNonPowerOf2 || !has_single_bit(Value: ReduxWidth + 1))
25685 ReduxWidth = GetVectorFactor(ReduxWidth);
25686 ReduxWidth = std::min(a: ReduxWidth, b: MaxElts);
25687
25688 unsigned Start = 0;
25689 unsigned Pos = Start;
25690 // Restarts vectorization attempt with lower vector factor.
25691 unsigned PrevReduxWidth = ReduxWidth;
25692 bool CheckForReusedReductionOpsLocal = false;
25693 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
25694 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
25695 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
25696 // Check if any of the reduction ops are gathered. If so, it is worth
25697 // trying again with a smaller number of reduction ops.
25698 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
25699 }
25700 ++Pos;
25701 if (Pos < NumReducedVals - ReduxWidth + 1)
25702 return IsAnyRedOpGathered;
25703 Pos = Start;
25704 --ReduxWidth;
25705 if (ReduxWidth > 1)
25706 ReduxWidth = GetVectorFactor(ReduxWidth);
25707 return IsAnyRedOpGathered;
25708 };
25709 bool AnyVectorized = false;
25710 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
25711 while (Pos < NumReducedVals - ReduxWidth + 1 &&
25712 ReduxWidth >= ReductionLimit) {
25713 // Dependency in the tree of the reduction ops - drop this attempt and
25714 // try again later.
25715 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
25716 Start == 0) {
25717 CheckForReusedReductionOps = true;
25718 break;
25719 }
25720 PrevReduxWidth = ReduxWidth;
25721 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
// Already analyzed - skip.
25723 if (IgnoredCandidates.contains(V: std::make_pair(x&: Pos, y&: ReduxWidth)) ||
25724 (!has_single_bit(Value: ReduxWidth) &&
25725 (IgnoredCandidates.contains(
25726 V: std::make_pair(x&: Pos, y: bit_floor(Value: ReduxWidth))) ||
25727 IgnoredCandidates.contains(
25728 V: std::make_pair(x: Pos + (ReduxWidth - bit_floor(Value: ReduxWidth)),
25729 y: bit_floor(Value: ReduxWidth))))) ||
25730 V.areAnalyzedReductionVals(VL)) {
25731 (void)AdjustReducedVals(/*IgnoreVL=*/true);
25732 continue;
25733 }
25734 // Early exit if any of the reduction values were deleted during
25735 // previous vectorization attempts.
25736 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
25737 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
25738 return RedValI && V.isDeleted(I: RedValI);
25739 }))
25740 break;
25741 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
25742 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
25743 if (!AdjustReducedVals())
25744 V.analyzedReductionVals(VL);
25745 continue;
25746 }
25747 if (V.isLoadCombineReductionCandidate(RdxKind)) {
25748 if (!AdjustReducedVals())
25749 V.analyzedReductionVals(VL);
25750 continue;
25751 }
25752 V.reorderTopToBottom();
25753 // No need to reorder the root node at all for reassociative reduction.
25754 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
25755 VL.front()->getType()->isIntOrIntVectorTy() ||
25756 ReductionLimit > 2);
// Keep the other extracted reduction values if they are used in the
// vectorization trees.
25759 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
25760 ExternallyUsedValues);
25761 // The reduction root is used as the insertion point for new
25762 // instructions, so set it as externally used to prevent it from being
25763 // deleted.
25764 LocalExternallyUsedValues.insert(V: ReductionRoot);
25765 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
25766 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
25767 continue;
25768 for (Value *V : ReducedVals[Cnt])
25769 if (isa<Instruction>(Val: V))
25770 LocalExternallyUsedValues.insert(V: TrackedVals[V]);
25771 }
25772 if (!IsSupportedHorRdxIdentityOp) {
25773 // Number of uses of the candidates in the vector of values.
25774 assert(SameValuesCounter.empty() &&
25775 "Reused values counter map is not empty");
25776 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25777 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25778 continue;
25779 Value *V = Candidates[Cnt];
25780 Value *OrigV = TrackedToOrig.at(Val: V);
25781 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
25782 }
25783 }
25784 V.transformNodes();
25785 V.computeMinimumValueSizes();
25786 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VectorizedVals: VL);
25787
25788 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
25789 // Gather externally used values.
25790 SmallPtrSet<Value *, 4> Visited;
25791 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
25792 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
25793 continue;
25794 Value *RdxVal = Candidates[Cnt];
25795 if (auto It = TrackedVals.find(Val: RdxVal); It != TrackedVals.end())
25796 RdxVal = It->second;
25797 if (!Visited.insert(Ptr: RdxVal).second)
25798 continue;
25799 // Check if the scalar was vectorized as part of the vectorization
25800 // tree but not the top node.
25801 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
25802 LocalExternallyUsedValues.insert(V: RdxVal);
25803 continue;
25804 }
25805 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
25806 unsigned NumOps =
25807 VectorizedVals.lookup(Val: OrigV) + At(SameValuesCounter, OrigV);
25808 if (NumOps != ReducedValsToOps.at(Val: OrigV).size())
25809 LocalExternallyUsedValues.insert(V: RdxVal);
25810 }
// The list of reused scalars is no longer needed in regular mode.
25812 if (!IsSupportedHorRdxIdentityOp)
25813 SameValuesCounter.clear();
25814 for (Value *RdxVal : VL)
25815 if (RequiredExtract.contains(Ptr: RdxVal))
25816 LocalExternallyUsedValues.insert(V: RdxVal);
25817 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
25818
25819 // Estimate cost.
25820 InstructionCost ReductionCost;
25821 if (V.isReducedBitcastRoot())
25822 ReductionCost = 0;
25823 else
25824 ReductionCost =
25825 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, FMF: RdxFMF, R: V, DT, DL, TLI);
25826 InstructionCost Cost = V.getTreeCost(TreeCost, VectorizedVals: VL, ReductionCost);
25827 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
25828 << " for reduction\n");
25829 if (!Cost.isValid())
25830 break;
25831 if (Cost >= -SLPCostThreshold) {
25832 V.getORE()->emit(RemarkBuilder: [&]() {
25833 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
25834 ReducedValsToOps.at(Val: VL[0]).front())
25835 << "Vectorizing horizontal reduction is possible "
25836 << "but not beneficial with cost " << ore::NV("Cost", Cost)
25837 << " and threshold "
25838 << ore::NV("Threshold", -SLPCostThreshold);
25839 });
25840 if (!AdjustReducedVals()) {
25841 V.analyzedReductionVals(VL);
25842 unsigned Offset = Pos == Start ? Pos : Pos - 1;
25843 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
25844 // Add subvectors of VL to the list of the analyzed values.
25845 for (unsigned VF = getFloorFullVectorNumberOfElements(
25846 TTI: *TTI, Ty: VL.front()->getType(), Sz: ReduxWidth - 1);
25847 VF >= ReductionLimit;
25848 VF = getFloorFullVectorNumberOfElements(
25849 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
25850 if (has_single_bit(Value: VF) &&
25851 V.getCanonicalGraphSize() != V.getTreeSize())
25852 continue;
25853 for (unsigned Idx : seq<unsigned>(Size: ReduxWidth - VF))
25854 IgnoredCandidates.insert(V: std::make_pair(x: Offset + Idx, y&: VF));
25855 }
25856 }
25857 }
25858 continue;
25859 }
25860
LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost: "
                  << Cost << ". (HorRdx)\n");
25863 V.getORE()->emit(RemarkBuilder: [&]() {
25864 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
25865 ReducedValsToOps.at(Val: VL[0]).front())
25866 << "Vectorized horizontal reduction with cost "
25867 << ore::NV("Cost", Cost) << " and with tree size "
25868 << ore::NV("TreeSize", V.getTreeSize());
25869 });
25870
25871 Builder.setFastMathFlags(RdxFMF);
25872
25873 // Emit a reduction. If the root is a select (min/max idiom), the insert
25874 // point is the compare condition of that select.
25875 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
25876 Instruction *InsertPt = RdxRootInst;
25877 if (IsCmpSelMinMax)
25878 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
25879
25880 // Vectorize a tree.
25881 Value *VectorizedRoot = V.vectorizeTree(
25882 ExternallyUsedValues: LocalExternallyUsedValues, ReductionRoot: InsertPt, VectorValuesAndScales);
// Update the TrackedToOrig mapping, since the tracked values might have
// been updated.
25885 for (Value *RdxVal : Candidates) {
25886 Value *OrigVal = TrackedToOrig.at(Val: RdxVal);
25887 Value *TransformedRdxVal = TrackedVals.at(Val: OrigVal);
25888 if (TransformedRdxVal != RdxVal)
25889 TrackedToOrig.try_emplace(Key: TransformedRdxVal, Args&: OrigVal);
25890 }
25891
25892 Builder.SetInsertPoint(InsertPt);
25893
25894 // To prevent poison from leaking across what used to be sequential,
25895 // safe, scalar boolean logic operations, the reduction operand must be
25896 // frozen.
25897 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(V: VectorizedRoot, AC))
25898 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
25899
25900 // Emit code to correctly handle reused reduced values, if required.
25901 if (OptReusedScalars && !SameScaleFactor) {
25902 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
25903 SameValuesCounter, TrackedToOrig);
25904 }
25905
25906 Type *ScalarTy = VL.front()->getType();
25907 Type *VecTy = VectorizedRoot->getType();
25908 Type *RedScalarTy = VecTy->getScalarType();
25909 VectorValuesAndScales.emplace_back(
25910 Args&: VectorizedRoot,
25911 Args: OptReusedScalars && SameScaleFactor
25912 ? SameValuesCounter.front().second
25913 : 1,
25914 Args: RedScalarTy != ScalarTy->getScalarType()
25915 ? V.isSignedMinBitwidthRootNode()
25916 : true);
25917
25918 // Count vectorized reduced values to exclude them from final reduction.
25919 for (Value *RdxVal : VL) {
25920 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
25921 if (IsSupportedHorRdxIdentityOp) {
25922 VectorizedVals.try_emplace(Key: OrigV, Args&: At(SameValuesCounter, OrigV));
25923 continue;
25924 }
25925 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
25926 if (!V.isVectorized(V: RdxVal))
25927 RequiredExtract.insert(Ptr: RdxVal);
25928 }
25929 Pos += ReduxWidth;
25930 Start = Pos;
25931 ReduxWidth = NumReducedVals - Pos;
25932 if (ReduxWidth > 1)
25933 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
25934 AnyVectorized = true;
25935 }
25936 if (OptReusedScalars && !AnyVectorized) {
25937 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
25938 Value *RdxVal = TrackedVals.at(Val: P.first);
25939 Value *RedVal = emitScaleForReusedOps(VectorizedValue: RdxVal, Builder, Cnt: P.second);
25940 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
25941 VectorizedVals.try_emplace(Key: P.first, Args: P.second);
25942 }
25943 continue;
25944 }
25945 }
25946 if (!VectorValuesAndScales.empty())
25947 VectorizedTree = GetNewVectorizedTree(
25948 VectorizedTree, emitReduction(Builder, TTI: *TTI, DestTy: ReductionRoot->getType(),
25949 ReducedInTree: V.isReducedBitcastRoot()));
25950
25951 if (!VectorizedTree) {
25952 if (!CheckForReusedReductionOps) {
25953 for (ReductionOpsType &RdxOps : ReductionOps)
25954 for (Value *RdxOp : RdxOps)
25955 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
25956 }
25957 return nullptr;
25958 }
25959
// Reorder the operands of a bool logical op into the natural order to avoid
// a possible problem with poison propagation. If it is not possible to
// reorder (both operands are originally RHS operands), emit an extra freeze
// instruction for the LHS operand.
25964 // I.e., if we have original code like this:
25965 // RedOp1 = select i1 ?, i1 LHS, i1 false
25966 // RedOp2 = select i1 RHS, i1 ?, i1 false
25967
25968 // Then, we swap LHS/RHS to create a new op that matches the poison
25969 // semantics of the original code.
25970
25971 // If we have original code like this and both values could be poison:
25972 // RedOp1 = select i1 ?, i1 LHS, i1 false
25973 // RedOp2 = select i1 ?, i1 RHS, i1 false
25974
25975 // Then, we must freeze LHS in the new op.
25976 auto FixBoolLogicalOps =
25977 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
25978 Instruction *RedOp2, bool InitStep) {
25979 if (!AnyBoolLogicOp)
25980 return;
25981 if (isBoolLogicOp(I: RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
25982 getRdxOperand(I: RedOp1, Index: 0) == LHS ||
25983 isGuaranteedNotToBePoison(V: LHS, AC)))
25984 return;
25985 bool NeedFreeze = LHS != VectorizedTree;
25986 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
25987 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
25988 isGuaranteedNotToBePoison(V: RHS, AC))) {
25989 // If RedOp2 was used as a second operand - do not swap.
25990 if ((InitStep || RHS != VectorizedTree) &&
25991 getRdxOperand(I: RedOp2, Index: 0) == RHS &&
25992 ((isBoolLogicOp(I: RedOp1) &&
25993 getRdxOperand(I: RedOp1, Index: 1) == RedOp2) ||
25994 any_of(Range&: ReductionOps, P: [&](ArrayRef<Value *> Ops) {
25995 return any_of(Range&: Ops, P: [&](Value *Op) {
25996 auto *OpI = dyn_cast<Instruction>(Val: Op);
25997 return OpI && isBoolLogicOp(I: OpI) &&
25998 getRdxOperand(I: OpI, Index: 1) == RedOp2;
25999 });
26000 }))) {
26001 NeedFreeze = false;
26002 } else {
26003 std::swap(a&: LHS, b&: RHS);
26004 return;
26005 }
26006 }
26007 if (NeedFreeze)
26008 LHS = Builder.CreateFreeze(V: LHS);
26009 };
26010 // Finish the reduction.
// Need to add the extra arguments and the possible reduction values that
// were not vectorized.
26012 // Try to avoid dependencies between the scalar remainders after reductions.
26013 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
26014 bool InitStep) {
26015 unsigned Sz = InstVals.size();
26016 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
26017 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
26018 Instruction *RedOp = InstVals[I + 1].first;
26019 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
26020 Value *RdxVal1 = InstVals[I].second;
26021 Value *StableRdxVal1 = RdxVal1;
26022 auto It1 = TrackedVals.find(Val: RdxVal1);
26023 if (It1 != TrackedVals.end())
26024 StableRdxVal1 = It1->second;
26025 Value *RdxVal2 = InstVals[I + 1].second;
26026 Value *StableRdxVal2 = RdxVal2;
26027 auto It2 = TrackedVals.find(Val: RdxVal2);
26028 if (It2 != TrackedVals.end())
26029 StableRdxVal2 = It2->second;
26030 // To prevent poison from leaking across what used to be sequential,
26031 // safe, scalar boolean logic operations, the reduction operand must be
26032 // frozen.
26033 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
26034 RedOp, InitStep);
26035 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
26036 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
26037 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
26038 }
26039 if (Sz % 2 == 1)
26040 ExtraReds[Sz / 2] = InstVals.back();
26041 return ExtraReds;
26042 };
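// E.g., for non-vectorized remainders {a, b, c, d, e} the first FinalGen
// round emits (a op b) and (c op d) and carries e over; later rounds combine
// those partial results, forming a balanced tree of scalar ops instead of
// one long sequential chain.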
26043 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
26044 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
26045 Args&: VectorizedTree);
26046 SmallPtrSet<Value *, 8> Visited;
26047 for (ArrayRef<Value *> Candidates : ReducedVals) {
26048 for (Value *RdxVal : Candidates) {
26049 if (!Visited.insert(Ptr: RdxVal).second)
26050 continue;
26051 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
26052 for (Instruction *RedOp :
26053 ArrayRef(ReducedValsToOps.at(Val: RdxVal)).drop_back(N: NumOps))
26054 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
26055 }
26056 }
26057 // Iterate through all not-vectorized reduction values/extra arguments.
26058 bool InitStep = true;
26059 while (ExtraReductions.size() > 1) {
26060 SmallVector<std::pair<Instruction *, Value *>> NewReds =
26061 FinalGen(ExtraReductions, InitStep);
26062 ExtraReductions.swap(RHS&: NewReds);
26063 InitStep = false;
26064 }
26065 VectorizedTree = ExtraReductions.front().second;
26066
26067 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
26068
// The original scalar reduction is expected to have no remaining
// uses outside the reduction tree itself. Assert that we got this
// correct, replace internal uses with poison, and mark for eventual
// deletion.
26073#ifndef NDEBUG
26074 SmallPtrSet<Value *, 4> IgnoreSet;
26075 for (ArrayRef<Value *> RdxOps : ReductionOps)
26076 IgnoreSet.insert_range(RdxOps);
26077#endif
26078 for (ArrayRef<Value *> RdxOps : ReductionOps) {
26079 for (Value *Ignore : RdxOps) {
26080 if (!Ignore)
26081 continue;
26082#ifndef NDEBUG
26083 for (auto *U : Ignore->users()) {
assert(IgnoreSet.count(U) &&
       "All users must be in the reduction ops list.");
26086 }
26087#endif
26088 if (!Ignore->use_empty()) {
26089 Value *P = PoisonValue::get(T: Ignore->getType());
26090 Ignore->replaceAllUsesWith(V: P);
26091 }
26092 }
26093 V.removeInstructionsAndOperands(DeadVals: RdxOps, VectorValuesAndScales);
26094 }
26095 return VectorizedTree;
26096 }
26097
26098private:
26099 /// Creates the reduction from the given \p Vec vector value with the given
26100 /// scale \p Scale and signedness \p IsSigned.
26101 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
26102 Value *Vec, unsigned Scale, bool IsSigned, Type *DestTy,
26103 bool ReducedInTree) {
26104 Value *Rdx;
26105 if (ReducedInTree) {
26106 Rdx = Vec;
26107 } else if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DestTy)) {
26108 unsigned DestTyNumElements = getNumElements(Ty: VecTy);
26109 unsigned VF = getNumElements(Ty: Vec->getType()) / DestTyNumElements;
26110 Rdx = PoisonValue::get(
26111 T: getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: DestTyNumElements));
26112 for (unsigned I : seq<unsigned>(Size: DestTyNumElements)) {
26113 // Do reduction for each lane.
26114 // e.g., do reduce add for
26115 // VL[0] = <4 x Ty> <a, b, c, d>
26116 // VL[1] = <4 x Ty> <e, f, g, h>
26117 // Lane[0] = <2 x Ty> <a, e>
26118 // Lane[1] = <2 x Ty> <b, f>
26119 // Lane[2] = <2 x Ty> <c, g>
26120 // Lane[3] = <2 x Ty> <d, h>
26121 // result[0] = reduce add Lane[0]
26122 // result[1] = reduce add Lane[1]
26123 // result[2] = reduce add Lane[2]
26124 // result[3] = reduce add Lane[3]
26125 SmallVector<int, 16> Mask = createStrideMask(Start: I, Stride: DestTyNumElements, VF);
26126 Value *Lane = Builder.CreateShuffleVector(V: Vec, Mask);
26127 Rdx = Builder.CreateInsertElement(
26128 Vec: Rdx, NewElt: emitReduction(VectorizedValue: Lane, Builder, TTI: &TTI, DestTy), Idx: I);
26129 }
26130 } else {
26131 Rdx = emitReduction(VectorizedValue: Vec, Builder, TTI: &TTI, DestTy);
26132 }
26133 if (Rdx->getType() != DestTy)
26134 Rdx = Builder.CreateIntCast(V: Rdx, DestTy, isSigned: IsSigned);
// Improved analysis for add/fadd/xor reductions with the same scale
// factor for all reduction operands: we can emit scalar ops for them
// instead.
26138 if (Scale > 1)
26139 Rdx = emitScaleForReusedOps(VectorizedValue: Rdx, Builder, Cnt: Scale);
26140 return Rdx;
26141 }
26142
26143 /// Calculate the cost of a reduction.
26144 InstructionCost getReductionCost(TargetTransformInfo *TTI,
26145 ArrayRef<Value *> ReducedVals,
26146 bool IsCmpSelMinMax, FastMathFlags FMF,
26147 const BoUpSLP &R, DominatorTree &DT,
26148 const DataLayout &DL,
26149 const TargetLibraryInfo &TLI) {
26150 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
26151 Type *ScalarTy = ReducedVals.front()->getType();
26152 unsigned ReduxWidth = ReducedVals.size();
26153 FixedVectorType *VectorTy = R.getReductionType();
26154 InstructionCost VectorCost = 0, ScalarCost;
// If all of the reduced values are constant, the vector cost is 0, since
// the reduction value can be calculated at compile time.
26157 bool AllConsts = allConstant(VL: ReducedVals);
26158 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
26159 InstructionCost Cost = 0;
26160 // Scalar cost is repeated for N-1 elements.
26161 int Cnt = ReducedVals.size();
26162 for (Value *RdxVal : ReducedVals) {
26163 if (!isa<Instruction>(Val: RdxVal))
26164 continue;
26165 if (Cnt == 1)
26166 break;
26167 --Cnt;
26168 if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
26169 Cost += GenCostFn();
26170 continue;
26171 }
26172 InstructionCost ScalarCost = 0;
26173 for (User *U : RdxVal->users()) {
26174 auto *RdxOp = cast<Instruction>(Val: U);
26175 if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
26176 if (RdxKind == RecurKind::FAdd) {
26177 InstructionCost FMACost = canConvertToFMA(
26178 VL: RdxOp, S: getSameOpcode(VL: RdxOp, TLI), DT, DL, TTI&: *TTI, TLI);
26179 if (FMACost.isValid()) {
26180 LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
26181 if (auto *I = dyn_cast<Instruction>(Val: RdxVal)) {
26182 // Also, exclude scalar fmul cost.
26183 InstructionCost FMulCost =
26184 TTI->getInstructionCost(U: I, CostKind);
26185 LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
26186 FMACost -= FMulCost;
26187 }
26188 ScalarCost += FMACost;
26189 continue;
26190 }
26191 }
26192 ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
26193 continue;
26194 }
26195 ScalarCost = InstructionCost::getInvalid();
26196 break;
26197 }
26198 if (ScalarCost.isValid())
26199 Cost += ScalarCost;
26200 else
26201 Cost += GenCostFn();
26202 }
26203 return Cost;
26204 };
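// I.e., an N-element scalar reduction keeps N - 1 scalar reduction ops: for
// each reduced value, either charge the cost of its actual user ops
// (possibly folded into an FMA for fadd reductions) or fall back to the
// generic per-op cost from GenCostFn.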
// Require the reduction op cost if:
// 1. This type is not a full register type and there are no other vectors
//    with the same type in the storage (first vector with a small type).
// 2. The storage does not have any vector with full register use (first
//    vector with full register use).
26210 bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
26211 switch (RdxKind) {
26212 case RecurKind::Add:
26213 case RecurKind::Mul:
26214 case RecurKind::Or:
26215 case RecurKind::And:
26216 case RecurKind::Xor:
26217 case RecurKind::FAdd:
26218 case RecurKind::FMul: {
26219 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
26220 if (!AllConsts) {
26221 if (DoesRequireReductionOp) {
26222 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
26223 assert(SLPReVec && "FixedVectorType is not expected.");
26224 unsigned ScalarTyNumElements = VecTy->getNumElements();
26225 for (unsigned I : seq<unsigned>(Size: ReducedVals.size())) {
26226 VectorCost += TTI->getShuffleCost(
26227 Kind: TTI::SK_PermuteSingleSrc,
26228 DstTy: FixedVectorType::get(ElementType: VecTy->getScalarType(),
26229 NumElts: ReducedVals.size()),
26230 SrcTy: VectorTy,
26231 Mask: createStrideMask(Start: I, Stride: ScalarTyNumElements, VF: ReducedVals.size()));
26232 VectorCost += TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VecTy,
26233 FMF, CostKind);
26234 }
26235 VectorCost += TTI->getScalarizationOverhead(
26236 Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: ScalarTyNumElements), /*Insert*/ true,
26237 /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
26238 } else {
26239 Type *RedTy = VectorTy->getElementType();
26240 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
26241 u: std::make_pair(x&: RedTy, y: true));
26242 if (RType == RedTy) {
26243 VectorCost = TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy,
26244 FMF, CostKind);
26245 } else {
26246 VectorCost = TTI->getExtendedReductionCost(
26247 Opcode: RdxOpcode, IsUnsigned: !IsSigned, ResTy: RedTy,
26248 Ty: getWidenedType(ScalarTy: RType, VF: ReduxWidth), FMF, CostKind);
26249 }
26250 }
26251 } else {
26252 Type *RedTy = VectorTy->getElementType();
26253 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
26254 u: std::make_pair(x&: RedTy, y: true));
26255 VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
26256 InstructionCost FMACost = InstructionCost::getInvalid();
26257 if (RdxKind == RecurKind::FAdd) {
26258 // Check if the reduction operands can be converted to FMA.
26259 SmallVector<Value *> Ops;
26260 FastMathFlags FMF;
26261 FMF.set();
26262 for (Value *RdxVal : ReducedVals) {
26263 if (!RdxVal->hasOneUse()) {
26264 Ops.clear();
26265 break;
26266 }
26267 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: RdxVal))
26268 FMF &= FPCI->getFastMathFlags();
26269 Ops.push_back(Elt: RdxVal->user_back());
26270 }
26271 if (!Ops.empty()) {
26272 FMACost = canConvertToFMA(VL: Ops, S: getSameOpcode(VL: Ops, TLI), DT, DL,
26273 TTI&: *TTI, TLI);
26274 if (FMACost.isValid()) {
26275 // Calculate actual FMAD cost.
26276 IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
26277 {RVecTy, RVecTy, RVecTy}, FMF);
26278 FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
26279
26280 LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
26281 // Also, exclude vector fmul cost.
26282 InstructionCost FMulCost = TTI->getArithmeticInstrCost(
26283 Opcode: Instruction::FMul, Ty: RVecTy, CostKind);
26284 LLVM_DEBUG(dbgs()
26285 << "Minus vector FMul cost: " << FMulCost << "\n");
26286 FMACost -= FMulCost;
26287 }
26288 }
26289 }
26290 if (FMACost.isValid())
26291 VectorCost += FMACost;
26292 else
26293 VectorCost +=
26294 TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: RVecTy, CostKind);
26295 if (RType != RedTy) {
26296 unsigned Opcode = Instruction::Trunc;
26297 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
26298 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
26299 VectorCost += TTI->getCastInstrCost(
26300 Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
26301 }
26302 }
26303 }
26304 ScalarCost = EvaluateScalarCost([&]() {
26305 return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
26306 });
26307 break;
26308 }
26309 case RecurKind::FMax:
26310 case RecurKind::FMin:
26311 case RecurKind::FMaximum:
26312 case RecurKind::FMinimum:
26313 case RecurKind::SMax:
26314 case RecurKind::SMin:
26315 case RecurKind::UMax:
26316 case RecurKind::UMin: {
26317 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
26318 if (!AllConsts) {
26319 if (DoesRequireReductionOp) {
26320 VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
26321 } else {
// A previous reduction vector already exists in the storage, so account
// for this one as a series of operations plus a single reduction.
26324 Type *RedTy = VectorTy->getElementType();
26325 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
26326 u: std::make_pair(x&: RedTy, y: true));
26327 VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
26328 IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
26329 VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
26330 if (RType != RedTy) {
26331 unsigned Opcode = Instruction::Trunc;
26332 if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
26333 Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
26334 VectorCost += TTI->getCastInstrCost(
26335 Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
26336 }
26337 }
26338 }
26339 ScalarCost = EvaluateScalarCost([&]() {
26340 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
26341 return TTI->getIntrinsicInstrCost(ICA, CostKind);
26342 });
26343 break;
26344 }
26345 default:
26346 llvm_unreachable("Expected arithmetic or min/max reduction operation");
26347 }
26348
26349 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
26350 << " for reduction of " << shortBundleName(ReducedVals)
26351 << " (It is a splitting reduction)\n");
26352 return VectorCost - ScalarCost;
26353 }
26354
/// Splits the values stored in VectorValuesAndScales into registers/free
/// sub-registers, combines them with the given reduction operation as a
/// vector operation and then performs a single (small enough) reduction.
26358 Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
26359 Type *DestTy, bool ReducedInTree) {
26360 Value *ReducedSubTree = nullptr;
26361 // Creates reduction and combines with the previous reduction.
26362 auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned) {
26363 Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy,
26364 ReducedInTree);
26365 if (ReducedSubTree)
26366 ReducedSubTree = createOp(Builder, RdxKind, LHS: ReducedSubTree, RHS: Rdx,
26367 Name: "op.rdx", ReductionOps);
26368 else
26369 ReducedSubTree = Rdx;
26370 };
26371 if (VectorValuesAndScales.size() == 1) {
26372 const auto &[Vec, Scale, IsSigned] = VectorValuesAndScales.front();
26373 CreateSingleOp(Vec, Scale, IsSigned);
26374 return ReducedSubTree;
26375 }
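// For multiple partial vectors, first combine them into a single vector
// value (scaling each one by its reuse count) and only then emit one final
// reduction, instead of reducing each partial vector separately.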
// Scales Vec using the given Cnt scale factor and then combines the result
// with the previously accumulated value of VecRes.
26378 Value *VecRes = nullptr;
26379 bool VecResSignedness = false;
26380 auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned) {
26381 Type *ScalarTy = Vec->getType()->getScalarType();
26382 // Scale Vec using given Cnt scale factor.
26383 if (Cnt > 1) {
26384 ElementCount EC = cast<VectorType>(Val: Vec->getType())->getElementCount();
26385 switch (RdxKind) {
26386 case RecurKind::Add: {
26387 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
26388 unsigned VF = getNumElements(Ty: Vec->getType());
LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << " of " << Vec
                  << ". (HorRdx)\n");
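// Repeat the whole i1 vector Cnt times via a shuffle so that the later
// ctpop-based reduction counts each set bit Cnt times
// (e.g., Cnt == 2, VF == 4 -> mask <0, 1, 2, 3, 0, 1, 2, 3>).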
26391 SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
26392 for (unsigned I : seq<unsigned>(Size: Cnt))
26393 std::iota(first: std::next(x: Mask.begin(), n: VF * I),
26394 last: std::next(x: Mask.begin(), n: VF * (I + 1)), value: 0);
26395 ++NumVectorInstructions;
26396 Vec = Builder.CreateShuffleVector(V: Vec, Mask);
26397 break;
26398 }
26399 // res = mul vv, n
26400 if (ScalarTy != DestTy->getScalarType())
26401 Vec = Builder.CreateIntCast(
26402 V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
26403 isSigned: IsSigned);
26404 Value *Scale = ConstantVector::getSplat(
26405 EC, Elt: ConstantInt::get(Ty: DestTy->getScalarType(), V: Cnt));
LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of " << Vec
26407 << ". (HorRdx)\n");
26408 ++NumVectorInstructions;
26409 Vec = Builder.CreateMul(LHS: Vec, RHS: Scale);
26410 break;
26411 }
26412 case RecurKind::Xor: {
26413 // res = n % 2 ? 0 : vv
LLVM_DEBUG(dbgs()
           << "SLP: Xor " << Cnt << " of " << Vec << ". (HorRdx)\n");
26416 if (Cnt % 2 == 0)
26417 Vec = Constant::getNullValue(Ty: Vec->getType());
26418 break;
26419 }
26420 case RecurKind::FAdd: {
26421 // res = fmul v, n
26422 Value *Scale =
26423 ConstantVector::getSplat(EC, Elt: ConstantFP::get(Ty: ScalarTy, V: Cnt));
LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of " << Vec
26425 << ". (HorRdx)\n");
26426 ++NumVectorInstructions;
26427 Vec = Builder.CreateFMul(L: Vec, R: Scale);
26428 break;
26429 }
26430 case RecurKind::And:
26431 case RecurKind::Or:
26432 case RecurKind::SMax:
26433 case RecurKind::SMin:
26434 case RecurKind::UMax:
26435 case RecurKind::UMin:
26436 case RecurKind::FMax:
26437 case RecurKind::FMin:
26438 case RecurKind::FMaximum:
26439 case RecurKind::FMinimum:
26440 // res = vv
26441 break;
26442 case RecurKind::Sub:
26443 case RecurKind::AddChainWithSubs:
26444 case RecurKind::Mul:
26445 case RecurKind::FMul:
26446 case RecurKind::FMulAdd:
26447 case RecurKind::AnyOf:
26448 case RecurKind::FindFirstIVSMin:
26449 case RecurKind::FindFirstIVUMin:
26450 case RecurKind::FindLastIVSMax:
26451 case RecurKind::FindLastIVUMax:
26452 case RecurKind::FindLast:
26453 case RecurKind::FMaxNum:
26454 case RecurKind::FMinNum:
26455 case RecurKind::FMaximumNum:
26456 case RecurKind::FMinimumNum:
26457 case RecurKind::None:
26458 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26459 }
26460 }
26461 // Combine Vec with the previous VecOp.
26462 if (!VecRes) {
26463 VecRes = Vec;
26464 VecResSignedness = IsSigned;
26465 } else {
26466 ++NumVectorInstructions;
26467 if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
26468 VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
26469 // Handle ctpop.
26470 unsigned VecResVF = getNumElements(Ty: VecRes->getType());
26471 unsigned VecVF = getNumElements(Ty: Vec->getType());
26472 SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
26473 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
26474 // Ensure that VecRes is always larger than Vec
26475 if (VecResVF < VecVF) {
26476 std::swap(a&: VecRes, b&: Vec);
26477 std::swap(a&: VecResVF, b&: VecVF);
26478 }
26479 if (VecResVF != VecVF) {
26480 SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
26481 std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
26482 Vec = Builder.CreateShuffleVector(V: Vec, Mask: ResizeMask);
26483 }
26484 VecRes = Builder.CreateShuffleVector(V1: VecRes, V2: Vec, Mask, Name: "rdx.op");
26485 return;
26486 }
26487 if (VecRes->getType()->getScalarType() != DestTy->getScalarType())
26488 VecRes = Builder.CreateIntCast(
26489 V: VecRes, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: VecRes->getType())),
26490 isSigned: VecResSignedness);
26491 if (ScalarTy != DestTy->getScalarType())
26492 Vec = Builder.CreateIntCast(
26493 V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
26494 isSigned: IsSigned);
26495 unsigned VecResVF = getNumElements(Ty: VecRes->getType());
26496 unsigned VecVF = getNumElements(Ty: Vec->getType());
26497 // Ensure that VecRes is always larger than Vec
26498 if (VecResVF < VecVF) {
26499 std::swap(a&: VecRes, b&: Vec);
26500 std::swap(a&: VecResVF, b&: VecVF);
26501 }
26502 // extract + op + insert
26503 Value *Op = VecRes;
26504 if (VecResVF != VecVF)
26505 Op = createExtractVector(Builder, Vec: VecRes, SubVecVF: VecVF, /*Index=*/0);
26506 Op = createOp(Builder, RdxKind, LHS: Op, RHS: Vec, Name: "rdx.op", ReductionOps);
26507 if (VecResVF != VecVF)
26508 Op = createInsertVector(Builder, Vec: VecRes, V: Op, /*Index=*/0);
26509 VecRes = Op;
26510 }
26511 };
26512 for (auto [Vec, Scale, IsSigned] : VectorValuesAndScales)
26513 CreateVecOp(Vec, Scale, IsSigned);
26514 CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false);
26515
26516 return ReducedSubTree;
26517 }
26518
26519 /// Emit a horizontal reduction of the vectorized value.
26520 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26521 const TargetTransformInfo *TTI, Type *DestTy) {
26522 assert(VectorizedValue && "Need to have a vectorized tree node");
26523 assert(RdxKind != RecurKind::FMulAdd &&
26524 "A call to the llvm.fmuladd intrinsic is not handled yet");
26525
26526 auto *FTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
26527 if (FTy->getScalarType() == Builder.getInt1Ty() &&
26528 RdxKind == RecurKind::Add &&
26529 DestTy->getScalarType() != FTy->getScalarType()) {
// Convert vector_reduce_add(ZExt(<n x i1>)) to
// ZExtOrTrunc(ctpop(bitcast <n x i1> to i<n>)).
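// E.g., a <8 x i1> reduce-add with an i32 destination becomes
// ctpop(bitcast <8 x i1> %v to i8); the widening cast to i32 is emitted by
// the caller.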
26532 Value *V = Builder.CreateBitCast(
26533 V: VectorizedValue, DestTy: Builder.getIntNTy(N: FTy->getNumElements()));
26534 ++NumVectorInstructions;
26535 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V);
26536 }
26537 ++NumVectorInstructions;
26538 return createSimpleReduction(B&: Builder, Src: VectorizedValue, RdxKind);
26539 }
26540
/// Emits optimized code for a unique scalar value reused \p Cnt times.
26542 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
26543 unsigned Cnt) {
26544 assert(IsSupportedHorRdxIdentityOp &&
26545 "The optimization of matched scalar identity horizontal reductions "
26546 "must be supported.");
26547 if (Cnt == 1)
26548 return VectorizedValue;
26549 switch (RdxKind) {
26550 case RecurKind::Add: {
26551 // res = mul vv, n
26552 Value *Scale = ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt);
LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << " of "
                  << VectorizedValue << ". (HorRdx)\n");
26555 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
26556 }
26557 case RecurKind::Xor: {
26558 // res = n % 2 ? 0 : vv
LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
26560 << ". (HorRdx)\n");
26561 if (Cnt % 2 == 0)
26562 return Constant::getNullValue(Ty: VectorizedValue->getType());
26563 return VectorizedValue;
26564 }
26565 case RecurKind::FAdd: {
26566 // res = fmul v, n
26567 Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << " of "
26569 << VectorizedValue << ". (HorRdx)\n");
26570 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
26571 }
26572 case RecurKind::And:
26573 case RecurKind::Or:
26574 case RecurKind::SMax:
26575 case RecurKind::SMin:
26576 case RecurKind::UMax:
26577 case RecurKind::UMin:
26578 case RecurKind::FMax:
26579 case RecurKind::FMin:
26580 case RecurKind::FMaximum:
26581 case RecurKind::FMinimum:
26582 // res = vv
26583 return VectorizedValue;
26584 case RecurKind::Sub:
26585 case RecurKind::AddChainWithSubs:
26586 case RecurKind::Mul:
26587 case RecurKind::FMul:
26588 case RecurKind::FMulAdd:
26589 case RecurKind::AnyOf:
26590 case RecurKind::FindFirstIVSMin:
26591 case RecurKind::FindFirstIVUMin:
26592 case RecurKind::FindLastIVSMax:
26593 case RecurKind::FindLastIVUMax:
26594 case RecurKind::FindLast:
26595 case RecurKind::FMaxNum:
26596 case RecurKind::FMinNum:
26597 case RecurKind::FMaximumNum:
26598 case RecurKind::FMinimumNum:
26599 case RecurKind::None:
26600 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
26601 }
26602 return nullptr;
26603 }
26604
/// Emits the actual operation for the scalar identity values found during
/// horizontal reduction analysis.
26607 Value *
26608 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
26609 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
26610 const DenseMap<Value *, Value *> &TrackedToOrig) {
26611 assert(IsSupportedHorRdxIdentityOp &&
26612 "The optimization of matched scalar identity horizontal reductions "
26613 "must be supported.");
26614 ArrayRef<Value *> VL = R.getRootNodeScalars();
26615 auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
26616 if (VTy->getElementType() != VL.front()->getType()) {
26617 VectorizedValue = Builder.CreateIntCast(
26618 V: VectorizedValue,
26619 DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
26620 isSigned: R.isSignedMinBitwidthRootNode());
26621 }
26622 switch (RdxKind) {
26623 case RecurKind::Add: {
26624 // root = mul prev_root, <1, 1, n, 1>
26625 SmallVector<Constant *> Vals;
26626 for (Value *V : VL) {
26627 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
26628 Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
26629 }
26630 auto *Scale = ConstantVector::get(V: Vals);
LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << " of "
26632 << VectorizedValue << ". (HorRdx)\n");
26633 return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
26634 }
26635 case RecurKind::And:
26636 case RecurKind::Or:
26637 // No need for multiple or/and(s).
26638 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
26639 << ". (HorRdx)\n");
26640 return VectorizedValue;
26641 case RecurKind::SMax:
26642 case RecurKind::SMin:
26643 case RecurKind::UMax:
26644 case RecurKind::UMin:
26645 case RecurKind::FMax:
26646 case RecurKind::FMin:
26647 case RecurKind::FMaximum:
26648 case RecurKind::FMinimum:
26649 // No need for multiple min/max(s) of the same value.
26650 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
26651 << ". (HorRdx)\n");
26652 return VectorizedValue;
26653 case RecurKind::Xor: {
// Replace values with an even number of repeats with 0, since
// x xor x = 0.
// root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 6, 7>,
// if the 4th and 6th elements have an even number of repeats.
26658 SmallVector<int> Mask(
26659 cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
26660 PoisonMaskElem);
26661 std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
26662 bool NeedShuffle = false;
26663 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
26664 Value *V = VL[I];
26665 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
26666 if (Cnt % 2 == 0) {
26667 Mask[I] = VF;
26668 NeedShuffle = true;
26669 }
26670 }
26671 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
26672 : Mask) dbgs()
26673 << I << " ";
26674 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
26675 if (NeedShuffle)
26676 VectorizedValue = Builder.CreateShuffleVector(
26677 V1: VectorizedValue,
26678 V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
26679 return VectorizedValue;
26680 }
26681 case RecurKind::FAdd: {
26682 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
26683 SmallVector<Constant *> Vals;
26684 for (Value *V : VL) {
26685 unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
26686 Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
26687 }
26688 auto *Scale = ConstantVector::get(V: Vals);
26689 return Builder.CreateFMul(L: VectorizedValue, R: Scale);
26690 }
26691 case RecurKind::Sub:
26692 case RecurKind::AddChainWithSubs:
26693 case RecurKind::Mul:
26694 case RecurKind::FMul:
26695 case RecurKind::FMulAdd:
26696 case RecurKind::AnyOf:
26697 case RecurKind::FindFirstIVSMin:
26698 case RecurKind::FindFirstIVUMin:
26699 case RecurKind::FindLastIVSMax:
26700 case RecurKind::FindLastIVUMax:
26701 case RecurKind::FindLast:
26702 case RecurKind::FMaxNum:
26703 case RecurKind::FMinNum:
26704 case RecurKind::FMaximumNum:
26705 case RecurKind::FMinimumNum:
26706 case RecurKind::None:
26707 llvm_unreachable("Unexpected reduction kind for reused scalars.");
26708 }
26709 return nullptr;
26710 }
26711};
26712} // end anonymous namespace
26713
26714/// Gets recurrence kind from the specified value.
26715static RecurKind getRdxKind(Value *V) {
26716 return HorizontalReduction::getRdxKind(V);
26717}
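
/// Returns the total number of leaf elements in the aggregate built by
/// \p InsertInst, e.g. 4 for [2 x {float, float}], or std::nullopt if the
/// aggregate is not homogeneous.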
26718static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
26719 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
26720 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
26721
26722 unsigned AggregateSize = 1;
26723 auto *IV = cast<InsertValueInst>(Val: InsertInst);
26724 Type *CurrentType = IV->getType();
26725 do {
26726 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
26727 for (auto *Elt : ST->elements())
26728 if (Elt != ST->getElementType(N: 0)) // check homogeneity
26729 return std::nullopt;
26730 AggregateSize *= ST->getNumElements();
26731 CurrentType = ST->getElementType(N: 0);
26732 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
26733 AggregateSize *= AT->getNumElements();
26734 CurrentType = AT->getElementType();
26735 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
26736 AggregateSize *= VT->getNumElements();
26737 return AggregateSize;
26738 } else if (CurrentType->isSingleValueType()) {
26739 return AggregateSize;
26740 } else {
26741 return std::nullopt;
26742 }
26743 } while (true);
26744}
26745
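/// Walks the chain of insertelement/insertvalue instructions ending at
/// \p LastInsertInst and records each inserted operand (and the insert
/// instruction itself) at its element index in \p BuildVectorOpds and
/// \p InsertElts, recursing into nested inserts of sub-aggregates.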
26746static void findBuildAggregateRec(Instruction *LastInsertInst,
26747 TargetTransformInfo *TTI,
26748 SmallVectorImpl<Value *> &BuildVectorOpds,
26749 SmallVectorImpl<Value *> &InsertElts,
26750 unsigned OperandOffset, const BoUpSLP &R) {
26751 do {
26752 Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
26753 std::optional<unsigned> OperandIndex =
26754 getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
26755 if (!OperandIndex || R.isDeleted(I: LastInsertInst))
26756 return;
26757 if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
26758 findBuildAggregateRec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
26759 BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex, R);
26760
26761 } else {
26762 BuildVectorOpds[*OperandIndex] = InsertedOperand;
26763 InsertElts[*OperandIndex] = LastInsertInst;
26764 }
26765 LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
26766 } while (LastInsertInst != nullptr &&
26767 isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
26768 LastInsertInst->hasOneUse());
26769}
26770
26771/// Recognize construction of vectors like
26772/// %ra = insertelement <4 x float> poison, float %s0, i32 0
26773/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
26774/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
26775/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
26776/// starting from the last insertelement or insertvalue instruction.
26777///
26778/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
26779/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
26780/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
26781///
26782/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
26783///
26784/// \return true if it matches.
26785static bool findBuildAggregate(Instruction *LastInsertInst,
26786 TargetTransformInfo *TTI,
26787 SmallVectorImpl<Value *> &BuildVectorOpds,
26788 SmallVectorImpl<Value *> &InsertElts,
26789 const BoUpSLP &R) {
26790
26791 assert((isa<InsertElementInst>(LastInsertInst) ||
26792 isa<InsertValueInst>(LastInsertInst)) &&
26793 "Expected insertelement or insertvalue instruction!");
26794
26795 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
26796 "Expected empty result vectors!");
26797
26798 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
26799 if (!AggregateSize)
26800 return false;
26801 BuildVectorOpds.resize(N: *AggregateSize);
26802 InsertElts.resize(N: *AggregateSize);
26803
26804 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0, R);
26805 llvm::erase(C&: BuildVectorOpds, V: nullptr);
26806 llvm::erase(C&: InsertElts, V: nullptr);
26807 if (BuildVectorOpds.size() >= 2)
26808 return true;
26809
26810 return false;
26811}
26812
26813/// Try and get a reduction instruction from a phi node.
26814///
26815/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
26816/// if they come from either \p ParentBB or a containing loop latch.
26817///
26818/// \returns A candidate reduction value if possible, or \code nullptr \endcode
26819/// if not possible.
26820static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
26821 BasicBlock *ParentBB, LoopInfo *LI) {
26822 // There are situations where the reduction value is not dominated by the
26823 // reduction phi. Vectorizing such cases has been reported to cause
26824 // miscompiles. See PR25787.
26825 auto DominatedReduxValue = [&](Value *R) {
26826 return isa<Instruction>(Val: R) &&
26827 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
26828 };
26829
26830 Instruction *Rdx = nullptr;
26831
26832 // Return the incoming value if it comes from the same BB as the phi node.
26833 if (P->getIncomingBlock(i: 0) == ParentBB) {
26834 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
26835 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
26836 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
26837 }
26838
26839 if (Rdx && DominatedReduxValue(Rdx))
26840 return Rdx;
26841
26842 // Otherwise, check whether we have a loop latch to look at.
26843 Loop *BBL = LI->getLoopFor(BB: ParentBB);
26844 if (!BBL)
26845 return nullptr;
26846 BasicBlock *BBLatch = BBL->getLoopLatch();
26847 if (!BBLatch)
26848 return nullptr;
26849
26850 // There is a loop latch, return the incoming value if it comes from
26851 // that. This reduction pattern occasionally turns up.
26852 if (P->getIncomingBlock(i: 0) == BBLatch) {
26853 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
26854 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
26855 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
26856 }
26857
26858 if (Rdx && DominatedReduxValue(Rdx))
26859 return Rdx;
26860
26861 return nullptr;
26862}
26863
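/// Matches a reduction binary operation: either a plain binary operator or
/// one of the integer or floating-point min/max patterns. On success, the
/// two operands are returned in \p V0 and \p V1.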
26864static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
26865 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
26866 return true;
26867 if (match(V: I, P: m_FMaxNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26868 return true;
26869 if (match(V: I, P: m_FMinNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26870 return true;
26871 if (match(V: I, P: m_FMaximum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26872 return true;
26873 if (match(V: I, P: m_FMinimum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26874 return true;
26875 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26876 return true;
26877 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26878 return true;
26879 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26880 return true;
26881 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
26882 return true;
26883 return false;
26884}
26885
26886/// We could have an initial reduction that is not an add.
26887/// r *= v1 + v2 + v3 + v4
26888/// In such a case start looking for a tree rooted in the first '+'.
/// \returns the new root if found, which may be nullptr if it is not an
/// instruction.
26890static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
26891 Instruction *Root) {
26892 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
26893 isa<IntrinsicInst>(Root)) &&
26894 "Expected binop, select, or intrinsic for reduction matching");
26895 Value *LHS =
26896 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
26897 Value *RHS =
26898 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
26899 if (LHS == Phi)
26900 return dyn_cast<Instruction>(Val: RHS);
26901 if (RHS == Phi)
26902 return dyn_cast<Instruction>(Val: LHS);
26903 return nullptr;
26904}
26905
/// \returns the first operand of \p I that does not match \p Phi. If the
/// operand is not an instruction, returns nullptr.
26908static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
26909 Value *Op0 = nullptr;
26910 Value *Op1 = nullptr;
26911 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
26912 return nullptr;
26913 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
26914}
26915
/// \returns true if \p I is a candidate instruction for reduction
/// vectorization.
26917static bool isReductionCandidate(Instruction *I) {
26918 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
26919 Value *B0 = nullptr, *B1 = nullptr;
26920 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
26921 return IsBinop || IsSelect;
26922}
26923
26924bool SLPVectorizerPass::vectorizeHorReduction(
26925 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
26926 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
26927 if (!ShouldVectorizeHor)
26928 return false;
26929 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);
26930
26931 if (Root->getParent() != BB || isa<PHINode>(Val: Root))
26932 return false;
26933
26934 // If we can find a secondary reduction root, use that instead.
26935 auto SelectRoot = [&]() {
26936 if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
26937 HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
26938 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
26939 return NewRoot;
26940 return Root;
26941 };
26942
// Start the analysis from the Root instruction. If a horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction, or
// vectorization is not possible or not effective, and the currently analyzed
// instruction is a binary operation, try to vectorize the operands, using
// pre-order traversal order. If the operands were not vectorized, repeat the
// same procedure, considering each operand as a possible root of a
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
// If a horizontal reduction was not matched or vectorized, collect the
// instructions for possible later vectorization attempts.
26954 std::queue<std::pair<Instruction *, unsigned>> Stack;
26955 Stack.emplace(args: SelectRoot(), args: 0);
26956 SmallPtrSet<Value *, 8> VisitedInstrs;
26957 bool Res = false;
26958 auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
26959 if (R.isAnalyzedReductionRoot(I: Inst))
26960 return nullptr;
26961 if (!isReductionCandidate(I: Inst))
26962 return nullptr;
26963 HorizontalReduction HorRdx;
26964 if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
26965 return nullptr;
26966 return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI, AC, DT&: *DT);
26967 };
26968 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
26969 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
26970 FutureSeed = getNonPhiOperand(I: Root, Phi: P);
26971 if (!FutureSeed)
26972 return false;
26973 }
26974 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
26975 // analysis is done separately.
26976 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
26977 PostponedInsts.push_back(Elt: FutureSeed);
26978 return true;
26979 };
26980
26981 while (!Stack.empty()) {
26982 Instruction *Inst;
26983 unsigned Level;
26984 std::tie(args&: Inst, args&: Level) = Stack.front();
26985 Stack.pop();
26986 // Do not try to analyze instruction that has already been vectorized.
26987 // This may happen when we vectorize instruction operands on a previous
26988 // iteration while stack was populated before that happened.
26989 if (R.isDeleted(I: Inst))
26990 continue;
26991 if (Value *VectorizedV = TryToReduce(Inst)) {
26992 Res = true;
26993 if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
26994 // Try to find another reduction.
26995 Stack.emplace(args&: I, args&: Level);
26996 continue;
26997 }
26998 if (R.isDeleted(I: Inst))
26999 continue;
27000 } else {
27001 // We could not vectorize `Inst` so try to use it as a future seed.
27002 if (!TryAppendToPostponedInsts(Inst)) {
27003 assert(Stack.empty() && "Expected empty stack");
27004 break;
27005 }
27006 }
27007
27008 // Try to vectorize operands.
// Only continue the analysis for instructions from the same basic block, to
// save compile time.
27011 if (++Level < RecursionMaxDepth)
27012 for (auto *Op : Inst->operand_values())
27013 if (VisitedInstrs.insert(Ptr: Op).second)
27014 if (auto *I = dyn_cast<Instruction>(Val: Op))
// Do not try to vectorize CmpInst operands; this is done
// separately.
27017 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
27018 !R.isDeleted(I) && I->getParent() == BB)
27019 Stack.emplace(args&: I, args&: Level);
27020 }
27021 return Res;
27022}
27023
27024bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
27025 if (!I)
27026 return false;
27027
27028 if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
27029 return false;
27030 // Skip potential FMA candidates.
27031 if ((I->getOpcode() == Instruction::FAdd ||
27032 I->getOpcode() == Instruction::FSub) &&
27033 canConvertToFMA(VL: I, S: getSameOpcode(VL: I, TLI: *TLI), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
27034 .isValid())
27035 return false;
27036
27037 Value *P = I->getParent();
27038
27039 // Vectorize in current basic block only.
27040 auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
27041 auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
27042 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
27043 R.isDeleted(I: Op0) || R.isDeleted(I: Op1))
27044 return false;
27045
// First, collect all possible candidates.
27047 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
27048 Candidates.emplace_back(Args&: Op0, Args&: Op1);
27049
27050 auto *A = dyn_cast<BinaryOperator>(Val: Op0);
27051 auto *B = dyn_cast<BinaryOperator>(Val: Op1);
27052 // Try to skip B.
27053 if (A && B && B->hasOneUse()) {
27054 auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
27055 auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
27056 if (B0 && B0->getParent() == P && !R.isDeleted(I: B0))
27057 Candidates.emplace_back(Args&: A, Args&: B0);
27058 if (B1 && B1->getParent() == P && !R.isDeleted(I: B1))
27059 Candidates.emplace_back(Args&: A, Args&: B1);
27060 }
27061 // Try to skip A.
27062 if (B && A && A->hasOneUse()) {
27063 auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
27064 auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
27065 if (A0 && A0->getParent() == P && !R.isDeleted(I: A0))
27066 Candidates.emplace_back(Args&: A0, Args&: B);
27067 if (A1 && A1->getParent() == P && !R.isDeleted(I: A1))
27068 Candidates.emplace_back(Args&: A1, Args&: B);
27069 }
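// I.e., for A op (B0 op B1) also consider the pairs (A, B0) and (A, B1),
// and symmetrically (A0, B) and (A1, B) for (A0 op A1) op B, in case the
// direct pair (A, B) is not vectorizable.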
27070
  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of operations.
    VectorType *VecTy = getWidenedType(Ty, Ops.size());
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            VecTy, APInt::getAllOnes(getNumElements(VecTy)), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Inst->getOpcode(), VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      return false;
    }
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList({Candidates[*BestCandidate].first,
                             Candidates[*BestCandidate].second},
                            R);
}
27127
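// Try to vectorize a horizontal reduction rooted at Root (optionally fed by
// the phi node P) and then vectorize any instructions whose vectorization was
// postponed while matching the reduction.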
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
                                                 BasicBlock *BB, BoUpSLP &R) {
  SmallVector<WeakTrackingVH> PostponedInsts;
  bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
  Res |= tryToVectorize(PostponedInsts, R);
  return Res;
}

bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
                                       BoUpSLP &R) {
  bool Res = false;
  for (Value *V : Insts)
    if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
      Res |= tryToVectorize(Inst, R);
  return Res;
}
27144
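// Try to vectorize the build sequence that feeds an aggregate, e.g.
// (illustrative IR):
//   %a0 = insertvalue [2 x double] poison, double %x, 0
//   %a1 = insertvalue [2 x double] %a0, double %y, 1
// The collected scalar operands (%x, %y) are handed to tryToVectorizeList.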
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  if (!R.canMapToVector(IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
}
27168
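// Try to vectorize the buildvector sequence that ends at IEI, e.g.
// (illustrative IR):
//   %v0 = insertelement <2 x float> poison, float %x, i32 0
//   %v1 = insertelement <2 x float> %v0, float %y, i32 1
// Sequences whose operands are already a shuffle of extractelements are
// skipped here.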
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
}
27191
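// Generic driver used for phi nodes, compares and stores: sort the incoming
// values with Comparator, greedily group adjacent values that AreCompatible
// accepts, and hand each group to TryToVectorizeHelper. When MaxVFOnly is set,
// the first attempt per group targets the maximal register VF only; remaining
// candidates of the same type get a final attempt without that restriction.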
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements based on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a three-stage attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes at the
    // size of the maximal register first.
    // 2. Try to vectorize the remaining instructions with the same type, if
    // possible. This may give better results than vectorizing only
    // instructions with the same/alternate opcodes.
    // 3. Finally, try to vectorize all instructions with the same/alternate
    // opcodes only; this may yield some extra final vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success: start over, because instructions might have been changed.
      Changed = true;
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
      }
    } else {
      /// \returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(2U, R.getMaxVecRegSize() / EltSize);
      };
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL) {
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
        }
      }
    }
    // Final attempt to vectorize instructions with the same types.
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success: start over, because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
27303
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the two cmps have the same/swapped predicates and the most
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or the
/// operand IDs are less than the operand IDs of the second cmp instruction.
27311template <bool IsCompatibility>
27312static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
27313 const DominatorTree &DT) {
27314 assert(isValidElementType(V->getType()) &&
27315 isValidElementType(V2->getType()) &&
27316 "Expected valid element types only.");
27317 if (V == V2)
27318 return IsCompatibility;
27319 auto *CI1 = cast<CmpInst>(Val: V);
27320 auto *CI2 = cast<CmpInst>(Val: V2);
27321 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
27322 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
27323 return !IsCompatibility;
27324 if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
27325 CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
27326 return false;
27327 if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <
27328 CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
27329 return !IsCompatibility;
27330 if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() >
27331 CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
27332 return false;
27333 CmpInst::Predicate Pred1 = CI1->getPredicate();
27334 CmpInst::Predicate Pred2 = CI2->getPredicate();
27335 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
27336 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
27337 CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
27338 CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
27339 if (BasePred1 < BasePred2)
27340 return !IsCompatibility;
27341 if (BasePred1 > BasePred2)
27342 return false;
27343 // Compare operands.
27344 bool CI1Preds = Pred1 == BasePred1;
27345 bool CI2Preds = Pred2 == BasePred1;
27346 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
27347 auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
27348 auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
27349 if (Op1 == Op2)
27350 continue;
27351 if (Op1->getValueID() < Op2->getValueID())
27352 return !IsCompatibility;
27353 if (Op1->getValueID() > Op2->getValueID())
27354 return false;
27355 if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
27356 if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
27357 if (IsCompatibility) {
27358 if (I1->getParent() != I2->getParent())
27359 return false;
27360 } else {
27361 // Try to compare nodes with same parent.
27362 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
27363 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
27364 if (!NodeI1)
27365 return NodeI2 != nullptr;
27366 if (!NodeI2)
27367 return false;
27368 assert((NodeI1 == NodeI2) ==
27369 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27370 "Different nodes should have different DFS numbers");
27371 if (NodeI1 != NodeI2)
27372 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27373 }
27374 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
27375 if (S && (IsCompatibility || !S.isAltShuffle()))
27376 continue;
27377 if (IsCompatibility)
27378 return false;
27379 if (I1->getOpcode() != I2->getOpcode())
27380 return I1->getOpcode() < I2->getOpcode();
27381 }
27382 }
27383 return IsCompatibility;
27384}
27385
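// Vectorize the given compare instructions: first look for reductions rooted
// at their operands, then try each compare on its own, and finally sort the
// surviving compares and try to vectorize them as lists of compatible
// (same/swapped predicate) instructions.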
27386template <typename ItT>
27387bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
27388 BasicBlock *BB, BoUpSLP &R) {
27389 bool Changed = false;
27390 // Try to find reductions first.
27391 for (CmpInst *I : CmpInsts) {
27392 if (R.isDeleted(I))
27393 continue;
27394 for (Value *Op : I->operands())
27395 if (auto *RootOp = dyn_cast<Instruction>(Val: Op)) {
27396 Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R);
27397 if (R.isDeleted(I))
27398 break;
27399 }
27400 }
27401 // Try to vectorize operands as vector bundles.
27402 for (CmpInst *I : CmpInsts) {
27403 if (R.isDeleted(I))
27404 continue;
27405 Changed |= tryToVectorize(I, R);
27406 }
27407 // Try to vectorize list of compares.
27408 // Sort by type, compare predicate, etc.
27409 auto CompareSorter = [&](Value *V, Value *V2) {
27410 if (V == V2)
27411 return false;
27412 return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
27413 };
27414
27415 auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
27416 if (VL.empty() || VL.back() == V1)
27417 return true;
27418 return compareCmp<true>(V: V1, V2: VL.back(), TLI&: *TLI, DT: *DT);
27419 };
27420
27421 SmallVector<Value *> Vals;
27422 for (Instruction *V : CmpInsts)
27423 if (!R.isDeleted(I: V) && isValidElementType(Ty: getValueType(V)))
27424 Vals.push_back(Elt: V);
27425 if (Vals.size() <= 1)
27426 return Changed;
27427 Changed |= tryToVectorizeSequence<Value>(
27428 Vals, CompareSorter, AreCompatibleCompares,
27429 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27430 // Exclude possible reductions from other blocks.
27431 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
27432 return any_of(V->users(), [V](User *U) {
27433 auto *Select = dyn_cast<SelectInst>(Val: U);
27434 return Select &&
27435 Select->getParent() != cast<Instruction>(Val: V)->getParent();
27436 });
27437 });
27438 if (ArePossiblyReducedInOtherBlock)
27439 return false;
27440 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
27441 },
27442 /*MaxVFOnly=*/true, R);
27443 return Changed;
27444}
27445
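// Process the postponed insertelement/insertvalue instructions in reverse
// order: first match buildvector sequences limited to the maximal VF, then
// reductions feeding the inserts, and finally buildvector sequences of any
// width.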
27446bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
27447 BasicBlock *BB, BoUpSLP &R) {
27448 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
27449 "This function only accepts Insert instructions");
27450 bool OpsChanged = false;
27451 SmallVector<WeakTrackingVH> PostponedInsts;
27452 for (auto *I : reverse(C&: Instructions)) {
27453 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
27454 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
27455 continue;
27456 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
27457 OpsChanged |=
27458 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
27459 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
27460 OpsChanged |=
27461 vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
27462 }
27463 // pass2 - try to vectorize reductions only
27464 if (R.isDeleted(I))
27465 continue;
27466 OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, PostponedInsts);
27467 if (R.isDeleted(I) || isa<CmpInst>(Val: I))
27468 continue;
27469 // pass3 - try to match and vectorize a buildvector sequence.
27470 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
27471 OpsChanged |=
27472 vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
27473 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
27474 OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
27475 /*MaxVFOnly=*/false);
27476 }
27477 }
27478 // Now try to vectorize postponed instructions.
27479 OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);
27480
27481 Instructions.clear();
27482 return OpsChanged;
27483}
27484
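// Scan a single basic block and try to start vectorization from promising
// seeds: phi nodes (grouped by the non-phi values that feed them), reduction
// roots reachable from unused instructions and stores, and postponed
// insertelement/insertvalue and compare instructions.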
bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallPtrSet<Value *, 16> VisitedInstrs;
  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
  // node. This helps to better identify the chains that can be vectorized.
  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
    assert(isValidElementType(V1->getType()) &&
           isValidElementType(V2->getType()) &&
           "Expected vectorizable types only.");
    if (V1 == V2)
      return false;
    // It is fine to compare type IDs here, since we expect only vectorizable
    // types, like ints, floats and pointers; we don't care about other types.
27501 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
27502 return true;
27503 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
27504 return false;
27505 if (V1->getType()->getScalarSizeInBits() <
27506 V2->getType()->getScalarSizeInBits())
27507 return true;
27508 if (V1->getType()->getScalarSizeInBits() >
27509 V2->getType()->getScalarSizeInBits())
27510 return false;
27511 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27512 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27513 if (Opcodes1.size() < Opcodes2.size())
27514 return true;
27515 if (Opcodes1.size() > Opcodes2.size())
27516 return false;
27517 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27518 {
27519 // Instructions come first.
27520 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
27521 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
27522 if (I1 && I2) {
27523 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
27524 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
27525 if (!NodeI1)
27526 return NodeI2 != nullptr;
27527 if (!NodeI2)
27528 return false;
27529 assert((NodeI1 == NodeI2) ==
27530 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27531 "Different nodes should have different DFS numbers");
27532 if (NodeI1 != NodeI2)
27533 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27534 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
27535 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27536 const auto *E1 = dyn_cast<ExtractElementInst>(Val: I1);
27537 const auto *E2 = dyn_cast<ExtractElementInst>(Val: I2);
27538 if (!E1 || !E2)
27539 continue;
27540
27541 // Sort on ExtractElementInsts primarily by vector operands. Prefer
27542 // program order of the vector operands.
27543 const auto *V1 = dyn_cast<Instruction>(Val: E1->getVectorOperand());
27544 const auto *V2 = dyn_cast<Instruction>(Val: E2->getVectorOperand());
27545 if (V1 != V2) {
27546 if (V1 && !V2)
27547 return true;
27548 if (!V1 && V2)
27549 return false;
27550 DomTreeNodeBase<BasicBlock> *NodeI1 =
27551 DT->getNode(BB: V1->getParent());
27552 DomTreeNodeBase<BasicBlock> *NodeI2 =
27553 DT->getNode(BB: V2->getParent());
27554 if (!NodeI1)
27555 return NodeI2 != nullptr;
27556 if (!NodeI2)
27557 return false;
27558 assert((NodeI1 == NodeI2) ==
27559 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27560 "Different nodes should have different DFS numbers");
27561 if (NodeI1 != NodeI2)
27562 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27563 return V1->comesBefore(Other: V2);
27564 }
27565 // If we have the same vector operand, try to sort by constant
27566 // index.
27567 std::optional<unsigned> Id1 = getExtractIndex(E: E1);
27568 std::optional<unsigned> Id2 = getExtractIndex(E: E2);
27569 // Bring constants to the top
27570 if (Id1 && !Id2)
27571 return true;
27572 if (!Id1 && Id2)
27573 return false;
27574 // First elements come first.
27575 if (Id1 && Id2)
27576 return *Id1 < *Id2;
27577
27578 continue;
27579 }
27580 if (I1->getOpcode() == I2->getOpcode())
27581 continue;
27582 return I1->getOpcode() < I2->getOpcode();
27583 }
27584 if (I1)
27585 return true;
27586 if (I2)
27587 return false;
27588 }
27589 {
27590 // Non-undef constants come next.
27591 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
27592 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
27593 if (C1 && C2)
27594 continue;
27595 if (C1)
27596 return true;
27597 if (C2)
27598 return false;
27599 }
27600 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
27601 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
27602 {
27603 // Non-constant non-instructions come next.
27604 if (!U1 && !U2) {
27605 auto ValID1 = Opcodes1[I]->getValueID();
27606 auto ValID2 = Opcodes2[I]->getValueID();
27607 if (ValID1 == ValID2)
27608 continue;
27609 if (ValID1 < ValID2)
27610 return true;
27611 if (ValID1 > ValID2)
27612 return false;
27613 }
27614 if (!U1)
27615 return true;
27616 if (!U2)
27617 return false;
27618 }
27619 // Undefs come last.
27620 assert(U1 && U2 && "The only thing left should be undef & undef.");
27621 }
27622 return false;
27623 };
27624 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
27625 Value *V1) {
27626 if (VL.empty() || V1 == VL.back())
27627 return true;
27628 Value *V2 = VL.back();
27629 if (V1->getType() != V2->getType())
27630 return false;
27631 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27632 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27633 if (Opcodes1.size() != Opcodes2.size())
27634 return false;
27635 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27636 // Undefs are compatible with any other value.
27637 if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
27638 continue;
27639 if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
27640 if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
27641 if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
27642 return false;
27643 if (I1->getParent() != I2->getParent())
27644 return false;
27645 if (getSameOpcode(VL: {I1, I2}, TLI: *TLI))
27646 continue;
27647 return false;
27648 }
27649 if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
27650 continue;
27651 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
27652 return false;
27653 }
27654 return true;
27655 };
27656
27657 bool HaveVectorizedPhiNodes = false;
27658 do {
27659 // Collect the incoming values from the PHIs.
27660 Incoming.clear();
27661 for (Instruction &I : *BB) {
27662 auto *P = dyn_cast<PHINode>(Val: &I);
27663 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
27664 break;
27665
27666 // No need to analyze deleted, vectorized and non-vectorizable
27667 // instructions.
27668 if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
27669 isValidElementType(Ty: P->getType()))
27670 Incoming.push_back(Elt: P);
27671 }
27672
27673 if (Incoming.size() <= 1)
27674 break;
27675
27676 // Find the corresponding non-phi nodes for better matching when trying to
27677 // build the tree.
27678 for (Value *V : Incoming) {
27679 SmallVectorImpl<Value *> &Opcodes =
27680 PHIToOpcodes.try_emplace(Key: V).first->getSecond();
27681 if (!Opcodes.empty())
27682 continue;
27683 SmallVector<Value *, 4> Nodes(1, V);
27684 SmallPtrSet<Value *, 4> Visited;
27685 while (!Nodes.empty()) {
27686 auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
27687 if (!Visited.insert(Ptr: PHI).second)
27688 continue;
27689 for (Value *V : PHI->incoming_values()) {
27690 if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
27691 Nodes.push_back(Elt: PHI1);
27692 continue;
27693 }
27694 Opcodes.emplace_back(Args&: V);
27695 }
27696 }
27697 }
27698
27699 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
27700 Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
27701 TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
27702 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
27703 },
27704 /*MaxVFOnly=*/true, R);
27705 Changed |= HaveVectorizedPhiNodes;
27706 if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
27707 auto *PHI = dyn_cast<PHINode>(P.first);
27708 return !PHI || R.isDeleted(I: PHI);
27709 }))
27710 PHIToOpcodes.clear();
27711 VisitedInstrs.insert_range(R&: Incoming);
27712 } while (HaveVectorizedPhiNodes);
27713
27714 VisitedInstrs.clear();
27715
27716 InstSetVector PostProcessInserts;
27717 SmallSetVector<CmpInst *, 8> PostProcessCmps;
27718 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
27719 // also vectorizes `PostProcessCmps`.
27720 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
27721 bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
27722 if (VectorizeCmps) {
27723 Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
27724 PostProcessCmps.clear();
27725 }
27726 PostProcessInserts.clear();
27727 return Changed;
27728 };
27729 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
27730 auto IsInPostProcessInstrs = [&](Instruction *I) {
27731 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
27732 return PostProcessCmps.contains(key: Cmp);
27733 return isa<InsertElementInst, InsertValueInst>(Val: I) &&
27734 PostProcessInserts.contains(key: I);
27735 };
  // Returns true if `I` is an instruction without users, like a terminator, a
  // store, or a function call with an ignored return value. An instruction is
  // considered unused based on its type, except for CallInst and InvokeInst,
  // which may produce a value that is simply never used.
  auto HasNoUsers = [](Instruction *I) {
    return I->use_empty() &&
           (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
  };
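  // Walk the block. Unused instructions (stores, terminators, ignored calls)
  // are used as roots for horizontal-reduction matching, while inserts and
  // compares are collected and post-processed in bulk once a suitable point
  // (an unused instruction or the terminator) is reached.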
  for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
    // Skip instructions with scalable types. The number of elements is unknown
    // at compile time for scalable types.
    if (isa<ScalableVectorType>(It->getType()))
      continue;

    // Skip instructions marked for deletion.
    if (R.isDeleted(&*It))
      continue;
    // We may go through BB multiple times, so skip the ones we have already
    // checked.
    if (!VisitedInstrs.insert(&*It).second) {
      if (HasNoUsers(&*It) &&
          VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        It = BB->begin();
        E = BB->end();
      }
      continue;
    }
27764
27765 // Try to vectorize reductions that use PHINodes.
27766 if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
27767 // Check that the PHI is a reduction PHI.
27768 if (P->getNumIncomingValues() == 2) {
27769 // Try to match and vectorize a horizontal reduction.
27770 Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
27771 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
27772 Changed = true;
27773 It = BB->begin();
27774 E = BB->end();
27775 continue;
27776 }
27777 }
27778 // Try to vectorize the incoming values of the PHI, to catch reductions
27779 // that feed into PHIs.
27780 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
27781 // Skip if the incoming block is the current BB for now. Also, bypass
27782 // unreachable IR for efficiency and to avoid crashing.
27783 // TODO: Collect the skipped incoming values and try to vectorize them
27784 // after processing BB.
27785 if (BB == P->getIncomingBlock(i: I) ||
27786 !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
27787 continue;
27788
27789 // Postponed instructions should not be vectorized here, delay their
27790 // vectorization.
27791 if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
27792 PI && !IsInPostProcessInstrs(PI)) {
27793 bool Res =
27794 vectorizeRootInstruction(P: nullptr, Root: PI, BB: P->getIncomingBlock(i: I), R);
27795 Changed |= Res;
27796 if (Res && R.isDeleted(I: P)) {
27797 It = BB->begin();
27798 E = BB->end();
27799 break;
27800 }
27801 }
27802 }
27803 continue;
27804 }
27805
    if (HasNoUsers(&*It)) {
      bool OpsChanged = false;
      auto *SI = dyn_cast<StoreInst>(It);
      bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
      if (SI) {
        auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
        // Try to vectorize the chain in the store, if this is the only store
        // to the address in the block.
        // TODO: This is just a temporary solution to save compile time. Need
        // to investigate if we can safely turn on slp-vectorize-hor-store
        // instead to allow lookup for reduction chains in all non-vectorized
        // stores (need to check side effects and compile time).
        TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
                              SI->getValueOperand()->hasOneUse();
      }
27821 if (TryToVectorizeRoot) {
27822 for (auto *V : It->operand_values()) {
27823 // Postponed instructions should not be vectorized here, delay their
27824 // vectorization.
27825 if (auto *VI = dyn_cast<Instruction>(Val: V);
27826 VI && !IsInPostProcessInstrs(VI))
27827 // Try to match and vectorize a horizontal reduction.
27828 OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R);
27829 }
27830 }
27831 // Start vectorization of post-process list of instructions from the
27832 // top-tree instructions to try to vectorize as many instructions as
27833 // possible.
27834 OpsChanged |=
27835 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
27836 if (OpsChanged) {
27837 // We would like to start over since some instructions are deleted
27838 // and the iterator may become invalid value.
27839 Changed = true;
27840 It = BB->begin();
27841 E = BB->end();
27842 continue;
27843 }
27844 }
27845
27846 if (isa<InsertElementInst, InsertValueInst>(Val: It))
27847 PostProcessInserts.insert(X: &*It);
27848 else if (isa<CmpInst>(Val: It))
27849 PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
27850 }
27851
27852 return Changed;
27853}
27854
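// Try to vectorize the index computations of the collected getelementptrs.
// Only GEPs with a single non-constant index are considered, and pairs whose
// addresses differ by a constant are dropped, since one address can be cheaply
// computed from the other.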
27855bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
27856 auto Changed = false;
27857 for (auto &Entry : GEPs) {
27858 // If the getelementptr list has fewer than two elements, there's nothing
27859 // to do.
27860 if (Entry.second.size() < 2)
27861 continue;
27862
27863 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
27864 << Entry.second.size() << ".\n");
27865
27866 // Process the GEP list in chunks suitable for the target's supported
27867 // vector size. If a vector register can't hold 1 element, we are done. We
27868 // are trying to vectorize the index computations, so the maximum number of
27869 // elements is based on the size of the index expression, rather than the
27870 // size of the GEP itself (the target's pointer size).
27871 auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
27872 return !R.isDeleted(I: GEP);
27873 });
27874 if (It == Entry.second.end())
27875 continue;
27876 unsigned MaxVecRegSize = R.getMaxVecRegSize();
27877 unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
27878 if (MaxVecRegSize < EltSize)
27879 continue;
27880
27881 unsigned MaxElts = MaxVecRegSize / EltSize;
27882 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
27883 auto Len = std::min<unsigned>(a: BE - BI, b: MaxElts);
27884 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
27885
      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(llvm::from_range, GEPList);

      // Some of the candidates may have already been vectorized after we
      // initially collected them, or their index may have been optimized to a
      // constant value. If so, they are marked as deleted, so remove them from
      // the set of candidates.
      Candidates.remove_if([&R](Value *I) {
        return R.isDeleted(cast<Instruction>(I)) ||
               isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
      });
27900
27901 // Remove from the set of candidates all pairs of getelementptrs with
27902 // constant differences. Such getelementptrs are likely not good
27903 // candidates for vectorization in a bottom-up phase since one can be
27904 // computed from the other. We also ensure all candidate getelementptr
27905 // indices are unique.
27906 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
27907 auto *GEPI = GEPList[I];
27908 if (!Candidates.count(key: GEPI))
27909 continue;
27910 const SCEV *SCEVI = SE->getSCEV(V: GEPList[I]);
27911 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
27912 auto *GEPJ = GEPList[J];
27913 const SCEV *SCEVJ = SE->getSCEV(V: GEPList[J]);
27914 if (isa<SCEVConstant>(Val: SE->getMinusSCEV(LHS: SCEVI, RHS: SCEVJ))) {
27915 Candidates.remove(X: GEPI);
27916 Candidates.remove(X: GEPJ);
27917 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
27918 Candidates.remove(X: GEPJ);
27919 }
27920 }
27921 }
27922
27923 // We break out of the above computation as soon as we know there are
27924 // fewer than two candidates remaining.
27925 if (Candidates.size() < 2)
27926 continue;
27927
27928 // Add the single, non-constant index of each candidate to the bundle. We
27929 // ensured the indices met these constraints when we originally collected
27930 // the getelementptrs.
27931 SmallVector<Value *, 16> Bundle(Candidates.size());
27932 auto BundleIndex = 0u;
27933 for (auto *V : Candidates) {
27934 auto *GEP = cast<GetElementPtrInst>(Val: V);
27935 auto *GEPIdx = GEP->idx_begin()->get();
27936 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
27937 Bundle[BundleIndex++] = GEPIdx;
27938 }
27939
27940 // Try and vectorize the indices. We are currently only interested in
27941 // gather-like cases of the form:
27942 //
27943 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
27944 //
27945 // where the loads of "a", the loads of "b", and the subtractions can be
27946 // performed in parallel. It's likely that detecting this pattern in a
27947 // bottom-up phase will be simpler and less costly than building a
27948 // full-blown top-down phase beginning at the consecutive loads.
27949 Changed |= tryToVectorizeList(VL: Bundle, R);
27950 }
27951 }
27952 return Changed;
27953}
27954
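// Try to vectorize each group of stores collected per underlying base object.
// Within a group the stores are sorted and split into compatible runs (same
// value type, pointer type and compatible value operands) before the actual
// store-chain vectorization is attempted bottom-to-top.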
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and value operands. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
27961 if (V->getValueOperand()->getType()->getTypeID() <
27962 V2->getValueOperand()->getType()->getTypeID())
27963 return true;
27964 if (V->getValueOperand()->getType()->getTypeID() >
27965 V2->getValueOperand()->getType()->getTypeID())
27966 return false;
27967 if (V->getPointerOperandType()->getTypeID() <
27968 V2->getPointerOperandType()->getTypeID())
27969 return true;
27970 if (V->getPointerOperandType()->getTypeID() >
27971 V2->getPointerOperandType()->getTypeID())
27972 return false;
27973 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
27974 V2->getValueOperand()->getType()->getScalarSizeInBits())
27975 return true;
27976 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
27977 V2->getValueOperand()->getType()->getScalarSizeInBits())
27978 return false;
27979 // UndefValues are compatible with all other values.
27980 auto *I1 = dyn_cast<Instruction>(Val: V->getValueOperand());
27981 auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand());
27982 if (I1 && I2) {
27983 DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
27984 DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
27985 assert(NodeI1 && "Should only process reachable instructions");
27986 assert(NodeI2 && "Should only process reachable instructions");
27987 assert((NodeI1 == NodeI2) ==
27988 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27989 "Different nodes should have different DFS numbers");
27990 if (NodeI1 != NodeI2)
27991 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27992 return I1->getOpcode() < I2->getOpcode();
27993 }
27994 if (I1 && !I2)
27995 return true;
27996 if (!I1 && I2)
27997 return false;
27998 return V->getValueOperand()->getValueID() <
27999 V2->getValueOperand()->getValueID();
28000 };
28001
28002 bool SameParent = true;
28003 auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
28004 if (VL.empty()) {
28005 SameParent = true;
28006 return true;
28007 }
28008 StoreInst *V2 = VL.back();
28009 if (V1 == V2)
28010 return true;
28011 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
28012 return false;
28013 if (V1->getPointerOperandType() != V2->getPointerOperandType())
28014 return false;
28015 // Undefs are compatible with any other value.
28016 if (isa<UndefValue>(Val: V1->getValueOperand()) ||
28017 isa<UndefValue>(Val: V2->getValueOperand()))
28018 return true;
28019 if (isa<Constant>(Val: V1->getValueOperand()) &&
28020 isa<Constant>(Val: V2->getValueOperand()))
28021 return true;
    // Check if the operands of the stores can be vectorized. They can be
    // vectorized if they have compatible operands, or if their operands can be
    // vectorized as copyables.
    auto *I1 = dyn_cast<Instruction>(V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(V2->getValueOperand());
    if (I1 || I2) {
      // Accept only tail-following non-compatible values for now.
      // TODO: investigate if it is possible to vectorize incompatible values,
      // if the copyables are first in the list.
      if (I1 && !I2)
        return false;
      SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
28034 SmallVector<Value *> NewVL(VL.size() + 1);
28035 for (auto [SI, V] : zip(t&: VL, u&: NewVL))
28036 V = SI->getValueOperand();
28037 NewVL.back() = V1->getValueOperand();
28038 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
28039 InstructionsState S = Analysis.buildInstructionsState(
28040 VL: NewVL, R, TryCopyableElementsVectorization: VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
28041 /*SkipSameCodeCheck=*/!SameParent);
28042 if (S)
28043 return true;
28044 if (!SameParent)
28045 return false;
28046 }
28047 return V1->getValueOperand()->getValueID() ==
28048 V2->getValueOperand()->getValueID();
28049 };
28050
28051 // Attempt to sort and vectorize each of the store-groups.
28052 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
28053 for (auto &Pair : Stores) {
28054 if (Pair.second.size() < 2)
28055 continue;
28056
28057 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
28058 << Pair.second.size() << ".\n");
28059
28060 if (!isValidElementType(Ty: Pair.second.front()->getValueOperand()->getType()))
28061 continue;
28062
    // Reverse the stores to do bottom-to-top analysis. This is important if
    // the values are stored to the same address several times; in this case we
    // need to follow the store order (reversed to meet the memory
    // dependencies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        ReversedStores, StoreSorter, AreCompatibleStores,
        [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Candidates, R, Attempted);
        },
        /*MaxVFOnly=*/false, R);
28074 }
28075 return Changed;
28076}
28077