1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
19#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
22#include "llvm/ADT/PriorityQueue.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
25#include "llvm/ADT/SetOperations.h"
26#include "llvm/ADT/SetVector.h"
27#include "llvm/ADT/SmallBitVector.h"
28#include "llvm/ADT/SmallPtrSet.h"
29#include "llvm/ADT/SmallSet.h"
30#include "llvm/ADT/SmallString.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
33#include "llvm/ADT/iterator_range.h"
34#include "llvm/Analysis/AliasAnalysis.h"
35#include "llvm/Analysis/AssumptionCache.h"
36#include "llvm/Analysis/CodeMetrics.h"
37#include "llvm/Analysis/ConstantFolding.h"
38#include "llvm/Analysis/DemandedBits.h"
39#include "llvm/Analysis/GlobalsModRef.h"
40#include "llvm/Analysis/IVDescriptors.h"
41#include "llvm/Analysis/Loads.h"
42#include "llvm/Analysis/LoopAccessAnalysis.h"
43#include "llvm/Analysis/LoopInfo.h"
44#include "llvm/Analysis/MemoryLocation.h"
45#include "llvm/Analysis/OptimizationRemarkEmitter.h"
46#include "llvm/Analysis/ScalarEvolution.h"
47#include "llvm/Analysis/ScalarEvolutionExpressions.h"
48#include "llvm/Analysis/TargetLibraryInfo.h"
49#include "llvm/Analysis/TargetTransformInfo.h"
50#include "llvm/Analysis/ValueTracking.h"
51#include "llvm/Analysis/VectorUtils.h"
52#include "llvm/IR/Attributes.h"
53#include "llvm/IR/BasicBlock.h"
54#include "llvm/IR/Constant.h"
55#include "llvm/IR/Constants.h"
56#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/DerivedTypes.h"
58#include "llvm/IR/Dominators.h"
59#include "llvm/IR/Function.h"
60#include "llvm/IR/IRBuilder.h"
61#include "llvm/IR/InstrTypes.h"
62#include "llvm/IR/Instruction.h"
63#include "llvm/IR/Instructions.h"
64#include "llvm/IR/IntrinsicInst.h"
65#include "llvm/IR/Intrinsics.h"
66#include "llvm/IR/Module.h"
67#include "llvm/IR/Operator.h"
68#include "llvm/IR/PatternMatch.h"
69#include "llvm/IR/Type.h"
70#include "llvm/IR/Use.h"
71#include "llvm/IR/User.h"
72#include "llvm/IR/Value.h"
73#include "llvm/IR/ValueHandle.h"
74#ifdef EXPENSIVE_CHECKS
75#include "llvm/IR/Verifier.h"
76#endif
77#include "llvm/Pass.h"
78#include "llvm/Support/Casting.h"
79#include "llvm/Support/CommandLine.h"
80#include "llvm/Support/Compiler.h"
81#include "llvm/Support/DOTGraphTraits.h"
82#include "llvm/Support/Debug.h"
83#include "llvm/Support/DebugCounter.h"
84#include "llvm/Support/ErrorHandling.h"
85#include "llvm/Support/GraphWriter.h"
86#include "llvm/Support/InstructionCost.h"
87#include "llvm/Support/KnownBits.h"
88#include "llvm/Support/MathExtras.h"
89#include "llvm/Support/raw_ostream.h"
90#include "llvm/Transforms/Utils/InjectTLIMappings.h"
91#include "llvm/Transforms/Utils/Local.h"
92#include "llvm/Transforms/Utils/LoopUtils.h"
93#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
94#include <algorithm>
95#include <cassert>
96#include <cstdint>
97#include <iterator>
98#include <map>
99#include <memory>
100#include <optional>
101#include <set>
102#include <string>
103#include <tuple>
104#include <utility>
105
using namespace llvm;
using namespace llvm::PatternMatch;
using namespace slpvectorizer;
using namespace std::placeholders;

// Pass name used for registration/remarks.
#define SV_NAME "slp-vectorizer"
// Debug type used by the LLVM_DEBUG/STATISTIC machinery in this file.
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

// Debug counter that allows bisecting which SLP graphs get vectorized.
DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
              "Controls which SLP graphs should be vectorized.");
/// Master switch for running the SLP vectorizer.
static cl::opt<bool>
    RunSLPVectorization("vectorize-slp", cl::init(Val: true), cl::Hidden,
                        cl::desc("Run the SLP vectorization passes"));

/// REVEC mode: treat short vectors as "scalars" and combine them into wider
/// vectors.
static cl::opt<bool>
    SLPReVec("slp-revec", cl::init(Val: false), cl::Hidden,
             cl::desc("Enable vectorization for wider vector utilization"));

/// Cost threshold: vectorize only when the estimated gain exceeds this value.
static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(Val: 0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(Val: true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<bool> SplitAlternateInstructions(
    "slp-split-alternate-instructions", cl::init(Val: true), cl::Hidden,
    cl::desc("Improve the code quality by splitting alternate instructions"));

/// Upper bound (in bits) on the vector register size used for vectorization.
static cl::opt<int> MaxVectorRegSizeOption(
    "slp-max-reg-size", cl::init(Val: 128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned>
    MaxVFOption("slp-max-vf", cl::init(Val: 0), cl::Hidden,
                cl::desc("Maximum SLP vectorization factor (0=unlimited)"));

/// Limits the size of scheduling regions in a block.
/// It avoid long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int> ScheduleRegionSizeBudget(
    "slp-schedule-budget", cl::init(Val: 100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

/// Lower bound (in bits) on the vector register size used for vectorization.
static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(Val: 128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(Val: 12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(Val: 3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
static cl::opt<int> LookAheadMaxDepth(
    "slp-max-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead but unlike
// similar limit for operands ordering this is less frequently used, hence
// impact of higher value is less noticeable.
static cl::opt<int> RootLookAheadMaxDepth(
    "slp-max-root-look-ahead-depth", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

static cl::opt<unsigned> MinProfitableStridedLoads(
    "slp-min-strided-loads", cl::init(Val: 2), cl::Hidden,
    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is runtime value"));

static cl::opt<unsigned> MaxProfitableLoadStride(
    "slp-max-stride", cl::init(Val: 8), cl::Hidden,
    cl::desc("The maximum stride, considered to be profitable."));

static cl::opt<bool>
    DisableTreeReorder("slp-disable-tree-reorder", cl::init(Val: false),
                       cl::Hidden,
                       cl::desc("Disable tree reordering even if it is "
                                "profitable. Used for testing only."));

static cl::opt<bool>
    ForceStridedLoads("slp-force-strided-loads", cl::init(Val: false),
                      cl::Hidden,
                      cl::desc("Generate strided loads even if they are not "
                               "profitable. Used for testing only."));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

static cl::opt<bool> VectorizeNonPowerOf2(
    "slp-vectorize-non-power-of-2", cl::init(Val: false), cl::Hidden,
    cl::desc("Try to vectorize with non-power-of-2 number of elements."));

/// Enables vectorization of copyable elements.
static cl::opt<bool> VectorizeCopyableElements(
    "slp-copyable-elements", cl::init(Val: true), cl::Hidden,
    cl::desc("Try to replace values with the idempotent instructions for "
             "better vectorization."));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explode.
static constexpr int UsesLimit = 64;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// Maximum allowed number of operands in the PHI nodes.
static const unsigned MaxPHINumOperands = 128;
240
241/// Predicate for the element types that the SLP vectorizer supports.
242///
243/// The most important thing to filter here are types which are invalid in LLVM
244/// vectors. We also filter target specific types which have absolutely no
245/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
246/// avoids spending time checking the cost model and realizing that they will
247/// be inevitably scalarized.
248static bool isValidElementType(Type *Ty) {
249 // TODO: Support ScalableVectorType.
250 if (SLPReVec && isa<FixedVectorType>(Val: Ty))
251 Ty = Ty->getScalarType();
252 return VectorType::isValidElementType(ElemTy: Ty) && !Ty->isX86_FP80Ty() &&
253 !Ty->isPPC_FP128Ty();
254}
255
256/// Returns the type of the given value/instruction \p V. If it is store,
257/// returns the type of its value operand, for Cmp - the types of the compare
258/// operands and for insertelement - the type os the inserted operand.
259/// Otherwise, just the type of the value is returned.
260static Type *getValueType(Value *V) {
261 if (auto *SI = dyn_cast<StoreInst>(Val: V))
262 return SI->getValueOperand()->getType();
263 if (auto *CI = dyn_cast<CmpInst>(Val: V))
264 return CI->getOperand(i_nocapture: 0)->getType();
265 if (!SLPReVec)
266 if (auto *IE = dyn_cast<InsertElementInst>(Val: V))
267 return IE->getOperand(i_nocapture: 1)->getType();
268 return V->getType();
269}
270
271/// \returns the number of elements for Ty.
272static unsigned getNumElements(Type *Ty) {
273 assert(!isa<ScalableVectorType>(Ty) &&
274 "ScalableVectorType is not supported.");
275 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Ty))
276 return VecTy->getNumElements();
277 return 1;
278}
279
280/// \returns the vector type of ScalarTy based on vectorization factor.
281static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
282 return FixedVectorType::get(ElementType: ScalarTy->getScalarType(),
283 NumElts: VF * getNumElements(Ty: ScalarTy));
284}
285
286/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
287/// which forms type, which splits by \p TTI into whole vector types during
288/// legalization.
289static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
290 Type *Ty, unsigned Sz) {
291 if (!isValidElementType(Ty))
292 return bit_ceil(Value: Sz);
293 // Find the number of elements, which forms full vectors.
294 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
295 if (NumParts == 0 || NumParts >= Sz)
296 return bit_ceil(Value: Sz);
297 return bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts)) * NumParts;
298}
299
300/// Returns the number of elements of the given type \p Ty, not greater than \p
301/// Sz, which forms type, which splits by \p TTI into whole vector types during
302/// legalization.
303static unsigned
304getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
305 unsigned Sz) {
306 if (!isValidElementType(Ty))
307 return bit_floor(Value: Sz);
308 // Find the number of elements, which forms full vectors.
309 unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
310 if (NumParts == 0 || NumParts >= Sz)
311 return bit_floor(Value: Sz);
312 unsigned RegVF = bit_ceil(Value: divideCeil(Numerator: Sz, Denominator: NumParts));
313 if (RegVF > Sz)
314 return bit_floor(Value: Sz);
315 return (Sz / RegVF) * RegVF;
316}
317
318static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
319 SmallVectorImpl<int> &Mask) {
320 // The ShuffleBuilder implementation use shufflevector to splat an "element".
321 // But the element have different meaning for SLP (scalar) and REVEC
322 // (vector). We need to expand Mask into masks which shufflevector can use
323 // directly.
324 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
325 for (unsigned I : seq<unsigned>(Size: Mask.size()))
326 for (auto [J, MaskV] : enumerate(First: MutableArrayRef(NewMask).slice(
327 N: I * VecTyNumElements, M: VecTyNumElements)))
328 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
329 : Mask[I] * VecTyNumElements + J;
330 Mask.swap(RHS&: NewMask);
331}
332
/// \returns the number of groups of shufflevector
/// A group has the following features
/// 1. All of value in a group are shufflevector.
/// 2. The mask of all shufflevector is isExtractSubvectorMask.
/// 3. The mask of all shufflevector uses all of the elements of the source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///    <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///    <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 group
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///    <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (VL.empty())
    return 0;
  // Feature 1: every value must be a shufflevector.
  if (!all_of(Range&: VL, P: IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
  // A group consists of SVNumElements / ShuffleMaskSize consecutive values of
  // VL, together extracting every slice of one source vector.
  unsigned SVNumElements =
      cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(Val: VL[I]);
    Value *Src = SV->getOperand(i_nocapture: 0);
    ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
    // Bit per sub-vector slice of Src; all slices must be extracted exactly
    // once for feature 3 to hold.
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(Range&: Group, P: [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(Val: V);
          // From the same source.
          if (SV->getOperand(i_nocapture: 0) != Src)
            return false;
          int Index;
          // Feature 2: the mask must be an extract-subvector mask.
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
396
397/// \returns a shufflevector mask which is used to vectorize shufflevectors
398/// e.g.,
399/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
400/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
401/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
402/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
403/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
404/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
405/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
406/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
407/// the result is
408/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
409static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
410 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
411 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
412 unsigned SVNumElements =
413 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())->getNumElements();
414 SmallVector<int> Mask;
415 unsigned AccumulateLength = 0;
416 for (Value *V : VL) {
417 auto *SV = cast<ShuffleVectorInst>(Val: V);
418 for (int M : SV->getShuffleMask())
419 Mask.push_back(Elt: M == PoisonMaskElem ? PoisonMaskElem
420 : AccumulateLength + M);
421 AccumulateLength += SVNumElements;
422 }
423 return Mask;
424}
425
426/// \returns True if the value is a constant (but not globals/constant
427/// expressions).
428static bool isConstant(Value *V) {
429 return isa<Constant>(Val: V) && !isa<ConstantExpr, GlobalValue>(Val: V);
430}
431
432/// Checks if \p V is one of vector-like instructions, i.e. undef,
433/// insertelement/extractelement with constant indices for fixed vector type or
434/// extractvalue instruction.
435static bool isVectorLikeInstWithConstOps(Value *V) {
436 if (!isa<InsertElementInst, ExtractElementInst>(Val: V) &&
437 !isa<ExtractValueInst, UndefValue>(Val: V))
438 return false;
439 auto *I = dyn_cast<Instruction>(Val: V);
440 if (!I || isa<ExtractValueInst>(Val: I))
441 return true;
442 if (!isa<FixedVectorType>(Val: I->getOperand(i: 0)->getType()))
443 return false;
444 if (isa<ExtractElementInst>(Val: I))
445 return isConstant(V: I->getOperand(i: 1));
446 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
447 return isConstant(V: I->getOperand(i: 2));
448}
449
450/// Returns power-of-2 number of elements in a single register (part), given the
451/// total number of elements \p Size and number of registers (parts) \p
452/// NumParts.
453static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
454 return std::min<unsigned>(a: Size, b: bit_ceil(Value: divideCeil(Numerator: Size, Denominator: NumParts)));
455}
456
/// Returns correct remaining number of elements, considering total amount \p
/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
/// and current register (part) \p Part.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
                            unsigned Part) {
  // The last part may be only partially filled.
  const unsigned Remaining = Size - Part * PartNumElems;
  return Remaining < PartNumElems ? Remaining : PartNumElems;
}
464
465#if !defined(NDEBUG)
466/// Print a short descriptor of the instruction bundle suitable for debug output.
467static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
468 std::string Result;
469 raw_string_ostream OS(Result);
470 if (Idx >= 0)
471 OS << "Idx: " << Idx << ", ";
472 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
473 return Result;
474}
475#endif
476
477/// \returns true if all of the instructions in \p VL are in the same block or
478/// false otherwise.
479static bool allSameBlock(ArrayRef<Value *> VL) {
480 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
481 if (It == VL.end())
482 return false;
483 Instruction *I0 = cast<Instruction>(Val: *It);
484 if (all_of(Range&: VL, P: isVectorLikeInstWithConstOps))
485 return true;
486
487 BasicBlock *BB = I0->getParent();
488 for (Value *V : iterator_range(It, VL.end())) {
489 if (isa<PoisonValue>(Val: V))
490 continue;
491 auto *II = dyn_cast<Instruction>(Val: V);
492 if (!II)
493 return false;
494
495 if (BB != II->getParent())
496 return false;
497 }
498 return true;
499}
500
501/// \returns True if all of the values in \p VL are constants (but not
502/// globals/constant expressions).
503static bool allConstant(ArrayRef<Value *> VL) {
504 // Constant expressions and globals can't be vectorized like normal integer/FP
505 // constants.
506 return all_of(Range&: VL, P: isConstant);
507}
508
509/// \returns True if all of the values in \p VL are identical or some of them
510/// are UndefValue.
511static bool isSplat(ArrayRef<Value *> VL) {
512 Value *FirstNonUndef = nullptr;
513 for (Value *V : VL) {
514 if (isa<UndefValue>(Val: V))
515 continue;
516 if (!FirstNonUndef) {
517 FirstNonUndef = V;
518 continue;
519 }
520 if (V != FirstNonUndef)
521 return false;
522 }
523 return FirstNonUndef != nullptr;
524}
525
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
/// For BinaryOperator, it also checks if \p InstWithUses is used in specific
/// patterns that make it effectively commutative (like equality comparisons
/// with zero).
/// In most cases, users should not call this function directly (since \p I and
/// \p InstWithUses are the same). However, when analyzing interchangeable
/// instructions, we need to use the converted opcode along with the original
/// uses.
/// \param I The instruction to check for commutativity
/// \param ValWithUses The value whose uses are analyzed for special
/// patterns
/// \param IsCopyable True when \p I models a copyable element; affects the
/// abs(sub ...) special case below.
static bool isCommutative(Instruction *I, Value *ValWithUses,
                          bool IsCopyable = false) {
  if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(Val: I))
    // (f)sub is treated as commutative when swapping its operands only negates
    // the result and every user (limited to UsesLimit uses) is invariant under
    // negation: icmp eq/ne with 0, or abs with the right poison flag.
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            ValWithUses->hasUseList() &&
            !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
            all_of(
                Range: ValWithUses->uses(),
                P: [&](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(V: U.getUser(),
                            P: m_ICmp(Pred, L: m_Specific(V: U.get()), R: m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  auto *I = dyn_cast<BinaryOperator>(Val: U.get());
                  return match(V: U.getUser(),
                               P: m_Intrinsic<Intrinsic::abs>(
                                   Op0: m_Specific(V: U.get()), Op1: m_ConstantInt(CI&: Flag))) &&
                         ((!IsCopyable && I && !I->hasNoSignedWrap()) ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            ValWithUses->hasUseList() &&
            !ValWithUses->hasNUsesOrMore(N: UsesLimit) &&
            // fabs(fsub x, y) == fabs(fsub y, x).
            all_of(Range: ValWithUses->uses(), P: [](const Use &U) {
              return match(V: U.getUser(),
                           P: m_Intrinsic<Intrinsic::fabs>(Op0: m_Specific(V: U.get())));
            }));
  return I->isCommutative();
}
573
/// Checks if the operand is commutative. In commutative operations, not all
/// operands might commutable, e.g. for fmuladd only 2 first operands are
/// commutable.
/// \param I The instruction whose operand \p Op is queried.
/// \param ValWithUses The value whose uses were analyzed to prove
/// commutativity (see the two-parameter isCommutative above).
/// \param Op The operand number being checked.
/// \param IsCopyable True when \p I models a copyable element.
static bool isCommutableOperand(Instruction *I, Value *ValWithUses, unsigned Op,
                                bool IsCopyable = false) {
  assert(::isCommutative(I, ValWithUses, IsCopyable) &&
         "The instruction is not commutative.");
  if (isa<CmpInst>(Val: I))
    return true;
  if (auto *BO = dyn_cast<BinaryOperator>(Val: I)) {
    switch (BO->getOpcode()) {
    // Sub/FSub only reach here via the negation-invariant-users special case
    // in isCommutative, where both operands may be swapped.
    case Instruction::Sub:
    case Instruction::FSub:
      return true;
    default:
      break;
    }
  }
  return I->isCommutableOperand(Op);
}
594
595/// This is a helper function to check whether \p I is commutative.
596/// This is a convenience wrapper that calls the two-parameter version of
597/// isCommutative with the same instruction for both parameters. This is
598/// the common case where the instruction being checked for commutativity
599/// is the same as the instruction whose uses are analyzed for special
600/// patterns (see the two-parameter version above for details).
601/// \param I The instruction to check for commutativity
602/// \returns true if the instruction is commutative, false otherwise
603static bool isCommutative(Instruction *I) { return isCommutative(I, ValWithUses: I); }
604
605/// \returns number of operands of \p I, considering commutativity. Returns 2
606/// for commutative intrinsics.
607/// \param I The instruction to check for commutativity
608static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) {
609 if (isa<IntrinsicInst>(Val: I) && isCommutative(I)) {
610 // IntrinsicInst::isCommutative returns true if swapping the first "two"
611 // arguments to the intrinsic produces the same result.
612 constexpr unsigned IntrinsicNumOperands = 2;
613 return IntrinsicNumOperands;
614 }
615 return I->getNumOperands();
616}
617
/// Extracts the constant lane index of an insertelement/extractelement
/// instruction, biased by \p Offset whole vector widths.
/// \tparam T Either InsertElementInst or ExtractElementInst.
/// \returns Offset * NumElements + lane when \p Inst is a T with an in-range
/// constant index on a fixed vector type; std::nullopt otherwise.
/// NOTE(review): for T = ExtractElementInst, getType() is the scalar result
/// type, so the FixedVectorType check below appears to always bail out for
/// that instantiation -- confirm this is the intended behavior.
template <typename T>
static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
                                                     unsigned Offset) {
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  int Index = Offset;
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    if (!VT)
      return std::nullopt;
    // The index must be a compile-time constant.
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (!CI)
      return std::nullopt;
    // Out-of-range indices produce poison; treat them as unknown.
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
    return Index;
  }
  return std::nullopt;
}
640
641/// \returns inserting or extracting index of InsertElement, ExtractElement or
642/// InsertValue instruction, using Offset as base offset for index.
643/// \returns std::nullopt if the index is not an immediate.
644static std::optional<unsigned> getElementIndex(const Value *Inst,
645 unsigned Offset = 0) {
646 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
647 return Index;
648 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
649 return Index;
650
651 int Index = Offset;
652
653 const auto *IV = dyn_cast<InsertValueInst>(Val: Inst);
654 if (!IV)
655 return std::nullopt;
656
657 Type *CurrentType = IV->getType();
658 for (unsigned I : IV->indices()) {
659 if (const auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
660 Index *= ST->getNumElements();
661 CurrentType = ST->getElementType(N: I);
662 } else if (const auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
663 Index *= AT->getNumElements();
664 CurrentType = AT->getElementType();
665 } else {
666 return std::nullopt;
667 }
668 Index += I;
669 }
670 return Index;
671}
672
673/// \returns true if all of the values in \p VL use the same opcode.
674/// For comparison instructions, also checks if predicates match.
675/// PoisonValues are considered matching.
676/// Interchangeable instructions are not considered.
677static bool allSameOpcode(ArrayRef<Value *> VL) {
678 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
679 if (It == VL.end())
680 return true;
681 Instruction *MainOp = cast<Instruction>(Val: *It);
682 unsigned Opcode = MainOp->getOpcode();
683 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
684 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
685 : CmpInst::BAD_ICMP_PREDICATE;
686 return std::all_of(first: It, last: VL.end(), pred: [&](Value *V) {
687 if (auto *CI = dyn_cast<CmpInst>(Val: V))
688 return BasePred == CI->getPredicate();
689 if (auto *I = dyn_cast<Instruction>(Val: V))
690 return I->getOpcode() == Opcode;
691 return isa<PoisonValue>(Val: V);
692 });
693}
694
namespace {
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask. Used by buildUseMask below.
enum class UseMask {
  FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
            ///< check for the mask elements for the first argument (mask
            ///< indices are in range [0:VF)).
  SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
             ///< for the mask elements for the second argument (mask indices
             ///< are in range [VF:2*VF))
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};
} // namespace
711
712/// Prepares a use bitset for the given mask either for the first argument or
713/// for the second.
714static SmallBitVector buildUseMask(int VF, ArrayRef<int> Mask,
715 UseMask MaskArg) {
716 SmallBitVector UseMask(VF, true);
717 for (auto [Idx, Value] : enumerate(First&: Mask)) {
718 if (Value == PoisonMaskElem) {
719 if (MaskArg == UseMask::UndefsAsMask)
720 UseMask.reset(Idx);
721 continue;
722 }
723 if (MaskArg == UseMask::FirstArg && Value < VF)
724 UseMask.reset(Idx: Value);
725 else if (MaskArg == UseMask::SecondArg && Value >= VF)
726 UseMask.reset(Idx: Value - VF);
727 }
728 return UseMask;
729}
730
/// Checks if the given value is actually an undefined constant vector.
/// Also, if the \p UseMask is not empty, tries to check if the non-masked
/// elements actually mask the insertelement buildvector, if any.
/// \tparam IsPoisonOnly When true, only poison (not undef) elements qualify.
/// \returns a bit vector (sized like \p UseMask, or a single bit when it is
/// empty) in which a set bit means the corresponding element is undef/poison.
template <bool IsPoisonOnly = false>
static SmallBitVector isUndefVector(const Value *V,
                                    const SmallBitVector &UseMask = {}) {
  SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
  if (isa<T>(V))
    return Res;
  auto *VecTy = dyn_cast<FixedVectorType>(Val: V->getType());
  if (!VecTy)
    return Res.reset();
  auto *C = dyn_cast<Constant>(Val: V);
  if (!C) {
    if (!UseMask.empty()) {
      // Non-constant value: walk an insertelement chain down to its base
      // vector, clearing bits for lanes overwritten with non-undef values
      // (unless the lane is masked out by UseMask).
      const Value *Base = V;
      while (auto *II = dyn_cast<InsertElementInst>(Val: Base)) {
        Base = II->getOperand(i_nocapture: 0);
        if (isa<T>(II->getOperand(i_nocapture: 1)))
          continue;
        std::optional<unsigned> Idx = getElementIndex(Inst: II);
        if (!Idx) {
          // Unknown insertion lane: conservatively report nothing as undef.
          Res.reset();
          return Res;
        }
        if (*Idx < UseMask.size() && !UseMask.test(Idx: *Idx))
          Res.reset(Idx: *Idx);
      }
      // TODO: Add analysis for shuffles here too.
      if (V == Base) {
        Res.reset();
      } else {
        // Recurse into the base vector with an all-false mask, so only truly
        // undef base lanes survive the intersection.
        SmallBitVector SubMask(UseMask.size(), false);
        Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
      }
    } else {
      Res.reset();
    }
    return Res;
  }
  // Constant vector: inspect every aggregate element directly.
  for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
    if (Constant *Elem = C->getAggregateElement(Elt: I))
      if (!isa<T>(Elem) &&
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(Idx: I))))
        Res.reset(Idx: I);
  }
  return Res;
}
780
/// Checks if the vector of instructions can be represented as a shuffle, like:
/// %x0 = extractelement <4 x i8> %x, i32 0
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
/// %x0x0 = mul i8 %x0, %x0
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
/// ret <4 x i8> %ins4
/// can be transformed into:
/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
/// i32 6>
/// %2 = mul <4 x i8> %1, %1
/// ret <4 x i8> %2
/// Mask will return the Shuffle Mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// ShuffleVectorInst/getShuffleCost?
static std::optional<TargetTransformInfo::ShuffleKind>
isFixedVectorShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
                     AssumptionCache *AC) {
  // Bail out early if there is nothing to extract from.
  const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst>);
  if (It == VL.end())
    return std::nullopt;
  // Size is the widest element count among all fixed source vectors; indices
  // >= Size are UB for extractelement and are skipped below.
  unsigned Size =
      std::accumulate(first: VL.begin(), last: VL.end(), init: 0u, binary_op: [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(Val: V);
        if (!EI)
          return S;
        auto *VTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
        if (!VTy)
          return S;
        return std::max(a: S, b: VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;
  // True if at least one source vector is known not to be poison.
  bool HasNonUndefVec = any_of(Range&: VL, P: [&](Value *V) {
    auto *EE = dyn_cast<ExtractElementInst>(Val: V);
    if (!EE)
      return false;
    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Val: Vec))
      return false;
    return isGuaranteedNotToBePoison(V: Vec, AC);
  });
  enum ShuffleMode { Unknown, Select, Permute };
  ShuffleMode CommonShuffleMode = Unknown;
  Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    // Undef can be represented as an undef element in a vector.
    if (isa<UndefValue>(Val: VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(Val: VL[I]);
    if (isa<ScalableVectorType>(Val: EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    // We can extractelement from undef or poison vector.
    if (isUndefVector</*isPoisonOnly=*/true>(V: Vec).all())
      continue;
    // All vector operands must have the same number of vector elements.
    if (isa<UndefValue>(Val: Vec)) {
      Mask[I] = I;
    } else {
      if (isa<UndefValue>(Val: EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(Val: EI->getIndexOperand());
      if (!Idx)
        return std::nullopt;
      // Undefined behavior if Idx is negative or >= Size.
      if (Idx->getValue().uge(RHS: Size))
        continue;
      unsigned IntIdx = Idx->getValue().getZExtValue();
      Mask[I] = IntIdx;
    }
    // An undef source does not constrain Vec1/Vec2 when some other source is
    // known non-poison.
    if (isUndefVector(V: Vec).all() && HasNonUndefVec)
      continue;
    // For correct shuffling we have to have at most 2 different vector operands
    // in all extractelement instructions.
    if (!Vec1 || Vec1 == Vec) {
      Vec1 = Vec;
    } else if (!Vec2 || Vec2 == Vec) {
      Vec2 = Vec;
      // Second-source lanes are encoded with indices offset by Size.
      Mask[I] += Size;
    } else {
      return std::nullopt;
    }
    if (CommonShuffleMode == Permute)
      continue;
    // If the extract index is not the same as the operation number, it is a
    // permutation.
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  // If we're not crossing lanes in different vectors, consider it as blending.
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
  // If Vec2 was never used, we have a permutation of a single vector, otherwise
  // we have permutation of 2 vectors.
  return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
              : TargetTransformInfo::SK_PermuteSingleSrc;
}
890
891/// \returns True if Extract{Value,Element} instruction extracts element Idx.
892static std::optional<unsigned> getExtractIndex(const Instruction *E) {
893 unsigned Opcode = E->getOpcode();
894 assert((Opcode == Instruction::ExtractElement ||
895 Opcode == Instruction::ExtractValue) &&
896 "Expected extractelement or extractvalue instruction.");
897 if (Opcode == Instruction::ExtractElement) {
898 auto *CI = dyn_cast<ConstantInt>(Val: E->getOperand(i: 1));
899 if (!CI)
900 return std::nullopt;
901 // Check if the index is out of bound - we can get the source vector from
902 // operand 0
903 unsigned Idx = CI->getZExtValue();
904 auto *EE = cast<ExtractElementInst>(Val: E);
905 const unsigned VF = ::getNumElements(Ty: EE->getVectorOperandType());
906 if (Idx >= VF)
907 return std::nullopt;
908 return Idx;
909 }
910 auto *EI = cast<ExtractValueInst>(Val: E);
911 if (EI->getNumIndices() != 1)
912 return std::nullopt;
913 return *EI->idx_begin();
914}
915
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all operands are either not instructions
/// or phi nodes or instructions from different blocks.
static bool areAllOperandsNonInsts(Value *V);
/// Checks if the provided value does not require scheduling. It does not
/// require scheduling if this is not an instruction or it is an instruction
/// that does not read/write memory and all users are phi nodes or instructions
/// from the different blocks.
static bool isUsedOutsideBlock(Value *V);
/// Checks if the specified value does not require scheduling. It does not
/// require scheduling if all operands and all users do not need to be scheduled
/// in the current basic block (i.e. both of the predicates above hold).
static bool doesNotNeedToBeScheduled(Value *V);
930
931/// \returns true if \p Opcode is allowed as part of the main/alternate
932/// instruction for SLP vectorization.
933///
934/// Example of unsupported opcode is SDIV that can potentially cause UB if the
935/// "shuffled out" lane would result in division by zero.
936static bool isValidForAlternation(unsigned Opcode) {
937 return !Instruction::isIntDivRem(Opcode);
938}
939
940namespace {
941
/// Helper class that determines VL can use the same opcode.
/// Alternate instruction is supported. In addition, it supports interchangeable
/// instruction. An interchangeable instruction is an instruction that can be
/// converted to another instruction with same semantics. For example, x << 1 is
/// equal to x * 2. x * 1 is equal to x | 0.
class BinOpSameOpcodeHelper {
  using MaskType = std::uint_fast16_t;
  /// Sort SupportedOp because it is used by binary_search.
  constexpr static std::initializer_list<unsigned> SupportedOp = {
      Instruction::Add, Instruction::Sub, Instruction::Mul, Instruction::Shl,
      Instruction::AShr, Instruction::And, Instruction::Or, Instruction::Xor};
  static_assert(llvm::is_sorted_constexpr(Range: SupportedOp) &&
                "SupportedOp is not sorted.");
  // One bit per supported opcode, plus MainOpBIT standing for "exactly the
  // original MainOp opcode" (preferred when still possible).
  enum : MaskType {
    ShlBIT = 0b1,
    AShrBIT = 0b10,
    MulBIT = 0b100,
    AddBIT = 0b1000,
    SubBIT = 0b10000,
    AndBIT = 0b100000,
    OrBIT = 0b1000000,
    XorBIT = 0b10000000,
    MainOpBIT = 0b100000000,
    LLVM_MARK_AS_BITMASK_ENUM(MainOpBIT)
  };
  /// Return a non-nullptr if either operand of I is a ConstantInt.
  /// The second return value represents the operand position. We check the
  /// right-hand side first (1). If the right hand side is not a ConstantInt and
  /// the instruction is neither Sub, Shl, nor AShr, we then check the left hand
  /// side (0).
  static std::pair<ConstantInt *, unsigned>
  isBinOpWithConstantInt(const Instruction *I) {
    unsigned Opcode = I->getOpcode();
    assert(binary_search(SupportedOp, Opcode) && "Unsupported opcode.");
    (void)SupportedOp;
    auto *BinOp = cast<BinaryOperator>(Val: I);
    if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 1)))
      return {CI, 1};
    // Sub/Shl/AShr are non-commutative: a constant LHS cannot be used.
    if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
        Opcode == Instruction::AShr)
      return {nullptr, 0};
    if (auto *CI = dyn_cast<ConstantInt>(Val: BinOp->getOperand(i_nocapture: 0)))
      return {CI, 0};
    return {nullptr, 0};
  }
  /// Tracks, for one reference instruction, the set of opcodes it may still be
  /// converted to as more instructions are added.
  struct InterchangeableInfo {
    const Instruction *I = nullptr;
    /// The bit it sets represents whether MainOp can be converted to.
    MaskType Mask = MainOpBIT | XorBIT | OrBIT | AndBIT | SubBIT | AddBIT |
                    MulBIT | AShrBIT | ShlBIT;
    /// We cannot create an interchangeable instruction that does not exist in
    /// VL. For example, VL [x + 0, y * 1] can be converted to [x << 0, y << 0],
    /// but << does not exist in VL. In the end, we convert VL to [x * 1, y *
    /// 1]. SeenBefore is used to know what operations have been seen before.
    MaskType SeenBefore = 0;
    InterchangeableInfo(const Instruction *I) : I(I) {}
    /// Return false allows BinOpSameOpcodeHelper to find an alternate
    /// instruction. Directly setting the mask will destroy the mask state,
    /// preventing us from determining which instruction it should convert to.
    bool trySet(MaskType OpcodeInMaskForm, MaskType InterchangeableMask) {
      if (Mask & InterchangeableMask) {
        SeenBefore |= OpcodeInMaskForm;
        // Narrow the candidate set to the intersection.
        Mask &= InterchangeableMask;
        return true;
      }
      return false;
    }
    // Accepts \p Opcode only if it matches I's own opcode exactly.
    bool equal(unsigned Opcode) {
      return Opcode == I->getOpcode() && trySet(OpcodeInMaskForm: MainOpBIT, InterchangeableMask: MainOpBIT);
    }
    // Picks the final opcode: prefer MainOp's own opcode, otherwise the first
    // candidate in the fixed priority order Shl, AShr, Mul, Add, Sub, And, Or,
    // Xor that was actually seen in VL.
    unsigned getOpcode() const {
      MaskType Candidate = Mask & SeenBefore;
      if (Candidate & MainOpBIT)
        return I->getOpcode();
      if (Candidate & ShlBIT)
        return Instruction::Shl;
      if (Candidate & AShrBIT)
        return Instruction::AShr;
      if (Candidate & MulBIT)
        return Instruction::Mul;
      if (Candidate & AddBIT)
        return Instruction::Add;
      if (Candidate & SubBIT)
        return Instruction::Sub;
      if (Candidate & AndBIT)
        return Instruction::And;
      if (Candidate & OrBIT)
        return Instruction::Or;
      if (Candidate & XorBIT)
        return Instruction::Xor;
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    /// Return true if the instruction can be converted to \p Opcode.
    bool hasCandidateOpcode(unsigned Opcode) const {
      MaskType Candidate = Mask & SeenBefore;
      switch (Opcode) {
      case Instruction::Shl:
        return Candidate & ShlBIT;
      case Instruction::AShr:
        return Candidate & AShrBIT;
      case Instruction::Mul:
        return Candidate & MulBIT;
      case Instruction::Add:
        return Candidate & AddBIT;
      case Instruction::Sub:
        return Candidate & SubBIT;
      case Instruction::And:
        return Candidate & AndBIT;
      case Instruction::Or:
        return Candidate & OrBIT;
      case Instruction::Xor:
        return Candidate & XorBIT;
      // Opcodes that are binary operators but never interchangeable here.
      case Instruction::LShr:
      case Instruction::FAdd:
      case Instruction::FSub:
      case Instruction::FMul:
      case Instruction::SDiv:
      case Instruction::UDiv:
      case Instruction::FDiv:
      case Instruction::SRem:
      case Instruction::URem:
      case Instruction::FRem:
        return false;
      default:
        break;
      }
      llvm_unreachable("Cannot find interchangeable instruction.");
    }

    // Builds the operand list I would have if rewritten with opcode \p To,
    // rewriting the constant operand accordingly (e.g. `x << 1` -> `x * 2`).
    SmallVector<Value *> getOperand(const Instruction *To) const {
      unsigned ToOpcode = To->getOpcode();
      unsigned FromOpcode = I->getOpcode();
      if (FromOpcode == ToOpcode)
        return SmallVector<Value *>(I->operands());
      assert(binary_search(SupportedOp, ToOpcode) && "Unsupported opcode.");
      auto [CI, Pos] = isBinOpWithConstantInt(I);
      // NOTE(review): CI is assumed non-null here (conversion is only
      // requested when a constant operand made I interchangeable) - verify at
      // call sites; a null CI would crash on the next line.
      const APInt &FromCIValue = CI->getValue();
      unsigned FromCIValueBitWidth = FromCIValue.getBitWidth();
      Type *RHSType = I->getOperand(i: Pos)->getType();
      Constant *RHS;
      switch (FromOpcode) {
      case Instruction::Shl:
        if (ToOpcode == Instruction::Mul) {
          // x << c  ==  x * (1 << c)
          RHS = ConstantInt::get(
              Ty: RHSType, V: APInt::getOneBitSet(numBits: FromCIValueBitWidth,
                                          BitNo: FromCIValue.getZExtValue()));
        } else {
          assert(FromCIValue.isZero() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Mul:
        assert(FromCIValue.isPowerOf2() && "Cannot convert the instruction.");
        if (ToOpcode == Instruction::Shl) {
          // x * (2^c)  ==  x << c
          RHS = ConstantInt::get(
              Ty: RHSType, V: APInt(FromCIValueBitWidth, FromCIValue.logBase2()));
        } else {
          assert(FromCIValue.isOne() && "Cannot convert the instruction.");
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        }
        break;
      case Instruction::Add:
      case Instruction::Sub:
        if (FromCIValue.isZero()) {
          RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                               /*AllowRHSConstant=*/true);
        } else {
          assert(is_contained({Instruction::Add, Instruction::Sub}, ToOpcode) &&
                 "Cannot convert the instruction.");
          // x + c  ==  x - (-c)  and vice versa.
          APInt NegatedVal = APInt(FromCIValue);
          NegatedVal.negate();
          RHS = ConstantInt::get(Ty: RHSType, V: NegatedVal);
        }
        break;
      case Instruction::And:
        assert(FromCIValue.isAllOnes() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      default:
        assert(FromCIValue.isZero() && "Cannot convert the instruction.");
        RHS = ConstantExpr::getBinOpIdentity(Opcode: ToOpcode, Ty: RHSType,
                                             /*AllowRHSConstant=*/true);
        break;
      }
      Value *LHS = I->getOperand(i: 1 - Pos);
      // If the target opcode is non-commutative (e.g., shl, sub),
      // force the variable to the left and the constant to the right.
      if (Pos == 1 || !Instruction::isCommutative(Opcode: ToOpcode))
        return SmallVector<Value *>({LHS, RHS});

      return SmallVector<Value *>({RHS, LHS});
    }
  };
  InterchangeableInfo MainOp;
  InterchangeableInfo AltOp;
  // Both the current main opcode and the candidate must be alternation-safe.
  bool isValidForAlternation(const Instruction *I) const {
    return ::isValidForAlternation(Opcode: MainOp.I->getOpcode()) &&
           ::isValidForAlternation(Opcode: I->getOpcode());
  }
  // Lazily adopts \p I as the alternate reference instruction.
  bool initializeAltOp(const Instruction *I) {
    if (AltOp.I)
      return true;
    if (!isValidForAlternation(I))
      return false;
    AltOp.I = I;
    return true;
  }

public:
  BinOpSameOpcodeHelper(const Instruction *MainOp,
                        const Instruction *AltOp = nullptr)
      : MainOp(MainOp), AltOp(AltOp) {}
  /// Folds \p I into the tracked main/alt state. Returns false if \p I is
  /// compatible with neither the main nor the alternate opcode set.
  bool add(const Instruction *I) {
    assert(isa<BinaryOperator>(I) &&
           "BinOpSameOpcodeHelper only accepts BinaryOperator.");
    unsigned Opcode = I->getOpcode();
    MaskType OpcodeInMaskForm;
    // Prefer Shl, AShr, Mul, Add, Sub, And, Or and Xor over MainOp.
    switch (Opcode) {
    case Instruction::Shl:
      OpcodeInMaskForm = ShlBIT;
      break;
    case Instruction::AShr:
      OpcodeInMaskForm = AShrBIT;
      break;
    case Instruction::Mul:
      OpcodeInMaskForm = MulBIT;
      break;
    case Instruction::Add:
      OpcodeInMaskForm = AddBIT;
      break;
    case Instruction::Sub:
      OpcodeInMaskForm = SubBIT;
      break;
    case Instruction::And:
      OpcodeInMaskForm = AndBIT;
      break;
    case Instruction::Or:
      OpcodeInMaskForm = OrBIT;
      break;
    case Instruction::Xor:
      OpcodeInMaskForm = XorBIT;
      break;
    default:
      // Unsupported binop: only an exact opcode match can succeed.
      return MainOp.equal(Opcode) ||
             (initializeAltOp(I) && AltOp.equal(Opcode));
    }
    // Widen the interchangeable set based on the constant operand, if any
    // (e.g. `x + 0` can become any of the supported opcodes).
    MaskType InterchangeableMask = OpcodeInMaskForm;
    ConstantInt *CI = isBinOpWithConstantInt(I).first;
    if (CI) {
      constexpr MaskType CanBeAll =
          XorBIT | OrBIT | AndBIT | SubBIT | AddBIT | MulBIT | AShrBIT | ShlBIT;
      const APInt &CIValue = CI->getValue();
      switch (Opcode) {
      case Instruction::Shl:
        if (CIValue.ult(RHS: CIValue.getBitWidth()))
          InterchangeableMask = CIValue.isZero() ? CanBeAll : MulBIT | ShlBIT;
        break;
      case Instruction::Mul:
        if (CIValue.isOne()) {
          InterchangeableMask = CanBeAll;
          break;
        }
        if (CIValue.isPowerOf2())
          InterchangeableMask = MulBIT | ShlBIT;
        break;
      case Instruction::Add:
      case Instruction::Sub:
        InterchangeableMask = CIValue.isZero() ? CanBeAll : SubBIT | AddBIT;
        break;
      case Instruction::And:
        if (CIValue.isAllOnes())
          InterchangeableMask = CanBeAll;
        break;
      case Instruction::Xor:
        if (CIValue.isZero())
          InterchangeableMask = XorBIT | OrBIT | SubBIT | AddBIT;
        break;
      default:
        if (CIValue.isZero())
          InterchangeableMask = CanBeAll;
        break;
      }
    }
    return MainOp.trySet(OpcodeInMaskForm, InterchangeableMask) ||
           (initializeAltOp(I) &&
            AltOp.trySet(OpcodeInMaskForm, InterchangeableMask));
  }
  unsigned getMainOpcode() const { return MainOp.getOpcode(); }
  /// Checks if the list of potential opcodes includes \p Opcode.
  bool hasCandidateOpcode(unsigned Opcode) const {
    return MainOp.hasCandidateOpcode(Opcode);
  }
  bool hasAltOp() const { return AltOp.I; }
  unsigned getAltOpcode() const {
    return hasAltOp() ? AltOp.getOpcode() : getMainOpcode();
  }
  SmallVector<Value *> getOperand(const Instruction *I) const {
    return MainOp.getOperand(To: I);
  }
};
1247
/// Main data required for vectorization of instructions.
class InstructionsState {
  /// MainOp and AltOp are primarily determined by getSameOpcode. Currently,
  /// only BinaryOperator, CastInst, and CmpInst support alternate instructions
  /// (i.e., AltOp is not equal to MainOp; this can be checked using
  /// isAltShuffle).
  /// A rare exception is TrySplitNode, where the InstructionsState is derived
  /// from getMainAltOpsNoStateVL.
  /// For those InstructionsState that use alternate instructions, the resulting
  /// vectorized output ultimately comes from a shufflevector. For example,
  /// given a vector list (VL):
  /// VL[0] = add i32 a, e
  /// VL[1] = sub i32 b, f
  /// VL[2] = add i32 c, g
  /// VL[3] = sub i32 d, h
  /// The vectorized result would be:
  /// intermediated_0 = add <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// intermediated_1 = sub <4 x i32> <a, b, c, d>, <e, f, g, h>
  /// result = shufflevector <4 x i32> intermediated_0,
  ///                        <4 x i32> intermediated_1,
  ///                        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  /// Since shufflevector is used in the final result, when calculating the cost
  /// (getEntryCost), we must account for the usage of shufflevector in
  /// GetVectorCost.
  Instruction *MainOp = nullptr;
  Instruction *AltOp = nullptr;
  /// Whether the instruction state represents copyable instructions.
  bool HasCopyables = false;

public:
  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  /// The main/alternate opcodes for the list of instructions.
  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  /// Some of the instructions in the list have alternate opcodes.
  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  /// Checks if the instruction matches either the main or alternate opcode.
  /// \returns
  /// - MainOp if \param I matches MainOp's opcode directly or can be converted
  /// to it
  /// - AltOp if \param I matches AltOp's opcode directly or can be converted to
  /// it
  /// - nullptr if \param I cannot be matched or converted to either opcode
  Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
    assert(MainOp && "MainOp cannot be nullptr.");
    if (I->getOpcode() == MainOp->getOpcode())
      return MainOp;
    // Special case: a zext feeding a non-alternate select state is treated as
    // matching the main opcode.
    if (MainOp->getOpcode() == Instruction::Select &&
        I->getOpcode() == Instruction::ZExt && !isAltShuffle())
      return MainOp;
    // Prefer AltOp instead of interchangeable instruction of MainOp.
    assert(AltOp && "AltOp cannot be nullptr.");
    if (I->getOpcode() == AltOp->getOpcode())
      return AltOp;
    // Only binary operators can be converted to other opcodes.
    if (!I->isBinaryOp())
      return nullptr;
    BinOpSameOpcodeHelper Converter(MainOp);
    if (!Converter.add(I) || !Converter.add(I: MainOp))
      return nullptr;
    if (isAltShuffle() && !Converter.hasCandidateOpcode(Opcode: MainOp->getOpcode())) {
      // I is not convertible to MainOp's opcode; try the alternate side.
      BinOpSameOpcodeHelper AltConverter(AltOp);
      if (AltConverter.add(I) && AltConverter.add(I: AltOp) &&
          AltConverter.hasCandidateOpcode(Opcode: AltOp->getOpcode()))
        return AltOp;
    }
    if (Converter.hasAltOp() && !isAltShuffle())
      return nullptr;
    return Converter.hasAltOp() ? AltOp : MainOp;
  }

  /// Checks if main/alt instructions are shift operations.
  bool isShiftOp() const {
    return getMainOp()->isShift() && getAltOp()->isShift();
  }

  /// Checks if main/alt instructions are bitwise logic operations.
  bool isBitwiseLogicOp() const {
    return getMainOp()->isBitwiseLogicOp() && getAltOp()->isBitwiseLogicOp();
  }

  /// Checks if main/alt instructions are mul/div/rem/fmul/fdiv/frem operations.
  bool isMulDivLikeOp() const {
    constexpr std::array<unsigned, 8> MulDiv = {
        Instruction::Mul, Instruction::FMul, Instruction::SDiv,
        Instruction::UDiv, Instruction::FDiv, Instruction::SRem,
        Instruction::URem, Instruction::FRem};
    return is_contained(Range: MulDiv, Element: getOpcode()) &&
           is_contained(Range: MulDiv, Element: getAltOpcode());
  }

  /// Checks if main/alt instructions are add/sub/fadd/fsub operations.
  bool isAddSubLikeOp() const {
    constexpr std::array<unsigned, 4> AddSub = {
        Instruction::Add, Instruction::Sub, Instruction::FAdd,
        Instruction::FSub};
    return is_contained(Range: AddSub, Element: getOpcode()) &&
           is_contained(Range: AddSub, Element: getAltOpcode());
  }

  /// Checks if main/alt instructions are cmp operations.
  bool isCmpOp() const {
    return (getOpcode() == Instruction::ICmp ||
            getOpcode() == Instruction::FCmp) &&
           getAltOpcode() == getOpcode();
  }

  /// Checks if the current state is valid, i.e. has non-null MainOp
  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp,
                    bool HasCopyables = false)
      : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }

  /// Checks if the value is a copyable element.
  bool isCopyableElement(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    if (!HasCopyables)
      return false;
    // Copyables are not modeled for alt shuffles or GEP states.
    if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr)
      return false;
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      return !isa<PoisonValue>(Val: V);
    if (I->getParent() != MainOp->getParent() &&
        (!isVectorLikeInstWithConstOps(V: I) ||
         !isVectorLikeInstWithConstOps(V: MainOp)))
      return true;
    if (I->getOpcode() == MainOp->getOpcode())
      return false;
    if (!I->isBinaryOp())
      return true;
    // A binary op that is convertible to the main opcode is not a copyable.
    BinOpSameOpcodeHelper Converter(MainOp);
    return !Converter.add(I) || !Converter.add(I: MainOp) ||
           Converter.hasAltOp() || !Converter.hasCandidateOpcode(Opcode: getOpcode());
  }

  /// Checks if the value is non-schedulable.
  bool isNonSchedulable(Value *V) const {
    assert(valid() && "InstructionsState is invalid.");
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!HasCopyables)
      return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
             doesNotNeedToBeScheduled(V);
    // MainOp for copyables always schedulable to correctly identify
    // non-schedulable copyables.
    if (getMainOp() == V)
      return false;
    if (isCopyableElement(V)) {
      auto IsNonSchedulableCopyableElement = [this](Value *V) {
        auto *I = dyn_cast<Instruction>(Val: V);
        return !I || isa<PHINode>(Val: I) || I->getParent() != MainOp->getParent() ||
               (doesNotNeedToBeScheduled(V: I) &&
                // If the copyable instruction comes after MainOp
                // (non-schedulable, but used in the block) - cannot vectorize
                // it, will possibly generate use before def.
                !MainOp->comesBefore(Other: I));
      };

      return IsNonSchedulableCopyableElement(V);
    }
    return !I || isa<PHINode>(Val: I) || isVectorLikeInstWithConstOps(V: I) ||
           doesNotNeedToBeScheduled(V);
  }

  /// Checks if the state represents copyable instructions.
  bool areInstructionsWithCopyableElements() const {
    assert(valid() && "InstructionsState is invalid.");
    return HasCopyables;
  }
};
1434
1435std::pair<Instruction *, SmallVector<Value *>>
1436convertTo(Instruction *I, const InstructionsState &S) {
1437 Instruction *SelectedOp = S.getMatchingMainOpOrAltOp(I);
1438 assert(SelectedOp && "Cannot convert the instruction.");
1439 if (I->isBinaryOp()) {
1440 BinOpSameOpcodeHelper Converter(I);
1441 return std::make_pair(x&: SelectedOp, y: Converter.getOperand(I: SelectedOp));
1442 }
1443 return std::make_pair(x&: SelectedOp, y: SmallVector<Value *>(I->operands()));
1444}
1445
1446} // end anonymous namespace
1447
1448static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1449 const TargetLibraryInfo &TLI);
1450
1451/// Find an instruction with a specific opcode in VL.
1452/// \param VL Array of values to search through. Must contain only Instructions
1453/// and PoisonValues.
1454/// \param Opcode The instruction opcode to search for
1455/// \returns
1456/// - The first instruction found with matching opcode
1457/// - nullptr if no matching instruction is found
1458static Instruction *findInstructionWithOpcode(ArrayRef<Value *> VL,
1459 unsigned Opcode) {
1460 for (Value *V : VL) {
1461 if (isa<PoisonValue>(Val: V))
1462 continue;
1463 assert(isa<Instruction>(V) && "Only accepts PoisonValue and Instruction.");
1464 auto *Inst = cast<Instruction>(Val: V);
1465 if (Inst->getOpcode() == Opcode)
1466 return Inst;
1467 }
1468 return nullptr;
1469}
1470
1471/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
1472/// compatible instructions or constants, or just some other regular values.
1473static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
1474 Value *Op1, const TargetLibraryInfo &TLI) {
1475 return (isConstant(V: BaseOp0) && isConstant(V: Op0)) ||
1476 (isConstant(V: BaseOp1) && isConstant(V: Op1)) ||
1477 (!isa<Instruction>(Val: BaseOp0) && !isa<Instruction>(Val: Op0) &&
1478 !isa<Instruction>(Val: BaseOp1) && !isa<Instruction>(Val: Op1)) ||
1479 BaseOp0 == Op0 || BaseOp1 == Op1 ||
1480 getSameOpcode(VL: {BaseOp0, Op0}, TLI) ||
1481 getSameOpcode(VL: {BaseOp1, Op1}, TLI);
1482}
1483
1484/// \returns true if a compare instruction \p CI has similar "look" and
1485/// same predicate as \p BaseCI, "as is" or with its operands and predicate
1486/// swapped, false otherwise.
1487static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
1488 const TargetLibraryInfo &TLI) {
1489 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
1490 "Assessing comparisons of different types?");
1491 CmpInst::Predicate BasePred = BaseCI->getPredicate();
1492 CmpInst::Predicate Pred = CI->getPredicate();
1493 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(pred: Pred);
1494
1495 Value *BaseOp0 = BaseCI->getOperand(i_nocapture: 0);
1496 Value *BaseOp1 = BaseCI->getOperand(i_nocapture: 1);
1497 Value *Op0 = CI->getOperand(i_nocapture: 0);
1498 Value *Op1 = CI->getOperand(i_nocapture: 1);
1499
1500 return (BasePred == Pred &&
1501 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
1502 (BasePred == SwappedPred &&
1503 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0: Op1, Op1: Op0, TLI));
1504}
1505
1506/// \returns analysis of the Instructions in \p VL described in
1507/// InstructionsState, the Opcode that we suppose the whole list
1508/// could be vectorized even if its structure is diverse.
1509static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
1510 const TargetLibraryInfo &TLI) {
1511 // Make sure these are all Instructions.
1512 if (!all_of(Range&: VL, P: IsaPred<Instruction, PoisonValue>))
1513 return InstructionsState::invalid();
1514
1515 auto *It = find_if(Range&: VL, P: IsaPred<Instruction>);
1516 if (It == VL.end())
1517 return InstructionsState::invalid();
1518
1519 Instruction *MainOp = cast<Instruction>(Val: *It);
1520 unsigned InstCnt = std::count_if(first: It, last: VL.end(), pred: IsaPred<Instruction>);
1521 if ((VL.size() > 2 && !isa<PHINode>(Val: MainOp) && InstCnt < VL.size() / 2) ||
1522 (VL.size() == 2 && InstCnt < 2))
1523 return InstructionsState::invalid();
1524
1525 bool IsCastOp = isa<CastInst>(Val: MainOp);
1526 bool IsBinOp = isa<BinaryOperator>(Val: MainOp);
1527 bool IsCmpOp = isa<CmpInst>(Val: MainOp);
1528 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(Val: MainOp)->getPredicate()
1529 : CmpInst::BAD_ICMP_PREDICATE;
1530 Instruction *AltOp = MainOp;
1531 unsigned Opcode = MainOp->getOpcode();
1532 unsigned AltOpcode = Opcode;
1533
1534 BinOpSameOpcodeHelper BinOpHelper(MainOp);
1535 bool SwappedPredsCompatible = IsCmpOp && [&]() {
1536 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
1537 UniquePreds.insert(X: BasePred);
1538 UniqueNonSwappedPreds.insert(X: BasePred);
1539 for (Value *V : VL) {
1540 auto *I = dyn_cast<CmpInst>(Val: V);
1541 if (!I)
1542 return false;
1543 CmpInst::Predicate CurrentPred = I->getPredicate();
1544 CmpInst::Predicate SwappedCurrentPred =
1545 CmpInst::getSwappedPredicate(pred: CurrentPred);
1546 UniqueNonSwappedPreds.insert(X: CurrentPred);
1547 if (!UniquePreds.contains(key: CurrentPred) &&
1548 !UniquePreds.contains(key: SwappedCurrentPred))
1549 UniquePreds.insert(X: CurrentPred);
1550 }
1551 // Total number of predicates > 2, but if consider swapped predicates
1552 // compatible only 2, consider swappable predicates as compatible opcodes,
1553 // not alternate.
1554 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
1555 }();
1556 // Check for one alternate opcode from another BinaryOperator.
1557 // TODO - generalize to support all operators (types, calls etc.).
1558 Intrinsic::ID BaseID = 0;
1559 SmallVector<VFInfo> BaseMappings;
1560 if (auto *CallBase = dyn_cast<CallInst>(Val: MainOp)) {
1561 BaseID = getVectorIntrinsicIDForCall(CI: CallBase, TLI: &TLI);
1562 BaseMappings = VFDatabase(*CallBase).getMappings(CI: *CallBase);
1563 if (!isTriviallyVectorizable(ID: BaseID) && BaseMappings.empty())
1564 return InstructionsState::invalid();
1565 }
1566 bool AnyPoison = InstCnt != VL.size();
1567 // Check MainOp too to be sure that it matches the requirements for the
1568 // instructions.
1569 for (Value *V : iterator_range(It, VL.end())) {
1570 auto *I = dyn_cast<Instruction>(Val: V);
1571 if (!I)
1572 continue;
1573
1574 // Cannot combine poison and divisions.
1575 // TODO: do some smart analysis of the CallInsts to exclude divide-like
1576 // intrinsics/functions only.
1577 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
1578 return InstructionsState::invalid();
1579 unsigned InstOpcode = I->getOpcode();
1580 if (IsBinOp && isa<BinaryOperator>(Val: I)) {
1581 if (BinOpHelper.add(I))
1582 continue;
1583 } else if (IsCastOp && isa<CastInst>(Val: I)) {
1584 Value *Op0 = MainOp->getOperand(i: 0);
1585 Type *Ty0 = Op0->getType();
1586 Value *Op1 = I->getOperand(i: 0);
1587 Type *Ty1 = Op1->getType();
1588 if (Ty0 == Ty1) {
1589 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
1590 continue;
1591 if (Opcode == AltOpcode) {
1592 assert(isValidForAlternation(Opcode) &&
1593 isValidForAlternation(InstOpcode) &&
1594 "Cast isn't safe for alternation, logic needs to be updated!");
1595 AltOpcode = InstOpcode;
1596 AltOp = I;
1597 continue;
1598 }
1599 }
1600 } else if (auto *Inst = dyn_cast<CmpInst>(Val: I); Inst && IsCmpOp) {
1601 auto *BaseInst = cast<CmpInst>(Val: MainOp);
1602 Type *Ty0 = BaseInst->getOperand(i_nocapture: 0)->getType();
1603 Type *Ty1 = Inst->getOperand(i_nocapture: 0)->getType();
1604 if (Ty0 == Ty1) {
1605 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1606 assert(InstOpcode == AltOpcode &&
1607 "Alternate instructions are only supported by BinaryOperator "
1608 "and CastInst.");
1609 // Check for compatible operands. If the corresponding operands are not
1610 // compatible - need to perform alternate vectorization.
1611 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1612 CmpInst::Predicate SwappedCurrentPred =
1613 CmpInst::getSwappedPredicate(pred: CurrentPred);
1614
1615 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1616 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1617 continue;
1618
1619 if (isCmpSameOrSwapped(BaseCI: BaseInst, CI: Inst, TLI))
1620 continue;
1621 auto *AltInst = cast<CmpInst>(Val: AltOp);
1622 if (MainOp != AltOp) {
1623 if (isCmpSameOrSwapped(BaseCI: AltInst, CI: Inst, TLI))
1624 continue;
1625 } else if (BasePred != CurrentPred) {
1626 assert(
1627 isValidForAlternation(InstOpcode) &&
1628 "CmpInst isn't safe for alternation, logic needs to be updated!");
1629 AltOp = I;
1630 continue;
1631 }
1632 CmpInst::Predicate AltPred = AltInst->getPredicate();
1633 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1634 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1635 continue;
1636 }
1637 } else if (InstOpcode == Opcode) {
1638 assert(InstOpcode == AltOpcode &&
1639 "Alternate instructions are only supported by BinaryOperator and "
1640 "CastInst.");
1641 if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
1642 if (Gep->getNumOperands() != 2 ||
1643 Gep->getOperand(i_nocapture: 0)->getType() != MainOp->getOperand(i: 0)->getType())
1644 return InstructionsState::invalid();
1645 } else if (auto *EI = dyn_cast<ExtractElementInst>(Val: I)) {
1646 if (!isVectorLikeInstWithConstOps(V: EI))
1647 return InstructionsState::invalid();
1648 } else if (auto *LI = dyn_cast<LoadInst>(Val: I)) {
1649 auto *BaseLI = cast<LoadInst>(Val: MainOp);
1650 if (!LI->isSimple() || !BaseLI->isSimple())
1651 return InstructionsState::invalid();
1652 } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
1653 auto *CallBase = cast<CallInst>(Val: MainOp);
1654 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1655 return InstructionsState::invalid();
1656 if (Call->hasOperandBundles() &&
1657 (!CallBase->hasOperandBundles() ||
1658 !std::equal(first1: Call->op_begin() + Call->getBundleOperandsStartIndex(),
1659 last1: Call->op_begin() + Call->getBundleOperandsEndIndex(),
1660 first2: CallBase->op_begin() +
1661 CallBase->getBundleOperandsStartIndex())))
1662 return InstructionsState::invalid();
1663 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI: &TLI);
1664 if (ID != BaseID)
1665 return InstructionsState::invalid();
1666 if (!ID) {
1667 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(CI: *Call);
1668 if (Mappings.size() != BaseMappings.size() ||
1669 Mappings.front().ISA != BaseMappings.front().ISA ||
1670 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1671 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1672 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1673 Mappings.front().Shape.Parameters !=
1674 BaseMappings.front().Shape.Parameters)
1675 return InstructionsState::invalid();
1676 }
1677 }
1678 continue;
1679 }
1680 return InstructionsState::invalid();
1681 }
1682
1683 if (IsBinOp) {
1684 MainOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getMainOpcode());
1685 assert(MainOp && "Cannot find MainOp with Opcode from BinOpHelper.");
1686 AltOp = findInstructionWithOpcode(VL, Opcode: BinOpHelper.getAltOpcode());
1687 assert(AltOp && "Cannot find AltOp with Opcode from BinOpHelper.");
1688 }
1689 assert((MainOp == AltOp || !allSameOpcode(VL)) &&
1690 "Incorrect implementation of allSameOpcode.");
1691 InstructionsState S(MainOp, AltOp);
1692 assert(all_of(VL,
1693 [&](Value *V) {
1694 return isa<PoisonValue>(V) ||
1695 S.getMatchingMainOpOrAltOp(cast<Instruction>(V));
1696 }) &&
1697 "Invalid InstructionsState.");
1698 return S;
1699}
1700
1701/// \returns true if all of the values in \p VL have the same type or false
1702/// otherwise.
1703static bool allSameType(ArrayRef<Value *> VL) {
1704 Type *Ty = VL.consume_front()->getType();
1705 return all_of(Range&: VL, P: [&](Value *V) { return V->getType() == Ty; });
1706}
1707
1708/// \returns True if in-tree use also needs extract. This refers to
1709/// possible scalar operand in vectorized instruction.
1710static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1711 TargetLibraryInfo *TLI,
1712 const TargetTransformInfo *TTI) {
1713 if (!UserInst)
1714 return false;
1715 unsigned Opcode = UserInst->getOpcode();
1716 switch (Opcode) {
1717 case Instruction::Load: {
1718 LoadInst *LI = cast<LoadInst>(Val: UserInst);
1719 return (LI->getPointerOperand() == Scalar);
1720 }
1721 case Instruction::Store: {
1722 StoreInst *SI = cast<StoreInst>(Val: UserInst);
1723 return (SI->getPointerOperand() == Scalar);
1724 }
1725 case Instruction::Call: {
1726 CallInst *CI = cast<CallInst>(Val: UserInst);
1727 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
1728 return any_of(Range: enumerate(First: CI->args()), P: [&](auto &&Arg) {
1729 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1730 Arg.value().get() == Scalar;
1731 });
1732 }
1733 default:
1734 return false;
1735 }
1736}
1737
1738/// \returns the AA location that is being access by the instruction.
1739static MemoryLocation getLocation(Instruction *I) {
1740 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1741 return MemoryLocation::get(SI);
1742 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1743 return MemoryLocation::get(LI);
1744 return MemoryLocation();
1745}
1746
1747/// \returns True if the instruction is not a volatile or atomic load/store.
1748static bool isSimple(Instruction *I) {
1749 if (LoadInst *LI = dyn_cast<LoadInst>(Val: I))
1750 return LI->isSimple();
1751 if (StoreInst *SI = dyn_cast<StoreInst>(Val: I))
1752 return SI->isSimple();
1753 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Val: I))
1754 return !MI->isVolatile();
1755 return true;
1756}
1757
1758/// Shuffles \p Mask in accordance with the given \p SubMask.
1759/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1760/// one but two input vectors.
1761static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1762 bool ExtendingManyInputs = false) {
1763 if (SubMask.empty())
1764 return;
1765 assert(
1766 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1767 // Check if input scalars were extended to match the size of other node.
1768 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1769 "SubMask with many inputs support must be larger than the mask.");
1770 if (Mask.empty()) {
1771 Mask.append(in_start: SubMask.begin(), in_end: SubMask.end());
1772 return;
1773 }
1774 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1775 int TermValue = std::min(a: Mask.size(), b: SubMask.size());
1776 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1777 if (SubMask[I] == PoisonMaskElem ||
1778 (!ExtendingManyInputs &&
1779 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1780 continue;
1781 NewMask[I] = Mask[SubMask[I]];
1782 }
1783 Mask.swap(RHS&: NewMask);
1784}
1785
1786/// Order may have elements assigned special value (size) which is out of
1787/// bounds. Such indices only appear on places which correspond to undef values
1788/// (see canReuseExtract for details) and used in order to avoid undef values
1789/// have effect on operands ordering.
1790/// The first loop below simply finds all unused indices and then the next loop
1791/// nest assigns these indices for undef values positions.
1792/// As an example below Order has two undef positions and they have assigned
1793/// values 3 and 7 respectively:
1794/// before: 6 9 5 4 9 2 1 0
1795/// after: 6 3 5 4 7 2 1 0
1796static void fixupOrderingIndices(MutableArrayRef<unsigned> Order) {
1797 const size_t Sz = Order.size();
1798 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1799 SmallBitVector MaskedIndices(Sz);
1800 for (unsigned I = 0; I < Sz; ++I) {
1801 if (Order[I] < Sz)
1802 UnusedIndices.reset(Idx: Order[I]);
1803 else
1804 MaskedIndices.set(I);
1805 }
1806 if (MaskedIndices.none())
1807 return;
1808 assert(UnusedIndices.count() == MaskedIndices.count() &&
1809 "Non-synced masked/available indices.");
1810 int Idx = UnusedIndices.find_first();
1811 int MIdx = MaskedIndices.find_first();
1812 while (MIdx >= 0) {
1813 assert(Idx >= 0 && "Indices must be synced.");
1814 Order[MIdx] = Idx;
1815 Idx = UnusedIndices.find_next(Prev: Idx);
1816 MIdx = MaskedIndices.find_next(Prev: MIdx);
1817 }
1818}
1819
1820/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1821/// Opcode1.
1822static SmallBitVector getAltInstrMask(ArrayRef<Value *> VL, Type *ScalarTy,
1823 unsigned Opcode0, unsigned Opcode1) {
1824 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
1825 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1826 for (unsigned Lane : seq<unsigned>(Size: VL.size())) {
1827 if (isa<PoisonValue>(Val: VL[Lane]))
1828 continue;
1829 if (cast<Instruction>(Val: VL[Lane])->getOpcode() == Opcode1)
1830 OpcodeMask.set(I: Lane * ScalarTyNumElements,
1831 E: Lane * ScalarTyNumElements + ScalarTyNumElements);
1832 }
1833 return OpcodeMask;
1834}
1835
1836/// Replicates the given \p Val \p VF times.
1837static SmallVector<Constant *> replicateMask(ArrayRef<Constant *> Val,
1838 unsigned VF) {
1839 assert(none_of(Val, [](Constant *C) { return C->getType()->isVectorTy(); }) &&
1840 "Expected scalar constants.");
1841 SmallVector<Constant *> NewVal(Val.size() * VF);
1842 for (auto [I, V] : enumerate(First&: Val))
1843 std::fill_n(first: NewVal.begin() + I * VF, n: VF, value: V);
1844 return NewVal;
1845}
1846
1847static void inversePermutation(ArrayRef<unsigned> Indices,
1848 SmallVectorImpl<int> &Mask) {
1849 Mask.clear();
1850 const unsigned E = Indices.size();
1851 Mask.resize(N: E, NV: PoisonMaskElem);
1852 for (unsigned I = 0; I < E; ++I)
1853 Mask[Indices[I]] = I;
1854}
1855
1856/// Reorders the list of scalars in accordance with the given \p Mask.
1857static void reorderScalars(SmallVectorImpl<Value *> &Scalars,
1858 ArrayRef<int> Mask) {
1859 assert(!Mask.empty() && "Expected non-empty mask.");
1860 SmallVector<Value *> Prev(Scalars.size(),
1861 PoisonValue::get(T: Scalars.front()->getType()));
1862 Prev.swap(RHS&: Scalars);
1863 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1864 if (Mask[I] != PoisonMaskElem)
1865 Scalars[Mask[I]] = Prev[I];
1866}
1867
1868/// Checks if the provided value does not require scheduling. It does not
1869/// require scheduling if this is not an instruction or it is an instruction
1870/// that does not read/write memory and all operands are either not instructions
1871/// or phi nodes or instructions from different blocks.
1872static bool areAllOperandsNonInsts(Value *V) {
1873 auto *I = dyn_cast<Instruction>(Val: V);
1874 if (!I)
1875 return true;
1876 return !mayHaveNonDefUseDependency(I: *I) &&
1877 all_of(Range: I->operands(), P: [I](Value *V) {
1878 auto *IO = dyn_cast<Instruction>(Val: V);
1879 if (!IO)
1880 return true;
1881 return isa<PHINode>(Val: IO) || IO->getParent() != I->getParent();
1882 });
1883}
1884
1885/// Checks if the provided value does not require scheduling. It does not
1886/// require scheduling if this is not an instruction or it is an instruction
1887/// that does not read/write memory and all users are phi nodes or instructions
1888/// from the different blocks.
1889static bool isUsedOutsideBlock(Value *V) {
1890 auto *I = dyn_cast<Instruction>(Val: V);
1891 if (!I)
1892 return true;
1893 // Limits the number of uses to save compile time.
1894 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(N: UsesLimit) &&
1895 all_of(Range: I->users(), P: [I](User *U) {
1896 auto *IU = dyn_cast<Instruction>(Val: U);
1897 if (!IU)
1898 return true;
1899 return IU->getParent() != I->getParent() || isa<PHINode>(Val: IU);
1900 });
1901}
1902
1903/// Checks if the specified value does not require scheduling. It does not
1904/// require scheduling if all operands and all users do not need to be scheduled
1905/// in the current basic block.
1906static bool doesNotNeedToBeScheduled(Value *V) {
1907 return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V);
1908}
1909
/// Checks if the specified array of instructions does not require scheduling.
/// It is so if either all instructions have operands that do not require
/// scheduling, or all their users do not require scheduling since they are
/// phis or in other basic blocks.
1914static bool doesNotNeedToSchedule(ArrayRef<Value *> VL) {
1915 return !VL.empty() &&
1916 (all_of(Range&: VL, P: isUsedOutsideBlock) || all_of(Range&: VL, P: areAllOperandsNonInsts));
1917}
1918
1919/// Returns true if widened type of \p Ty elements with size \p Sz represents
1920/// full vector type, i.e. adding extra element results in extra parts upon type
1921/// legalization.
1922static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty,
1923 unsigned Sz) {
1924 if (Sz <= 1)
1925 return false;
1926 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Val: Ty))
1927 return false;
1928 if (has_single_bit(Value: Sz))
1929 return true;
1930 const unsigned NumParts = TTI.getNumberOfParts(Tp: getWidenedType(ScalarTy: Ty, VF: Sz));
1931 return NumParts > 0 && NumParts < Sz && has_single_bit(Value: Sz / NumParts) &&
1932 Sz % NumParts == 0;
1933}
1934
/// Returns the number of parts the type \p VecTy will be split into at the
/// codegen phase. If the type is going to be scalarized or does not use whole
/// registers, returns 1.
1938static unsigned
1939getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy,
1940 const unsigned Limit = std::numeric_limits<unsigned>::max()) {
1941 unsigned NumParts = TTI.getNumberOfParts(Tp: VecTy);
1942 if (NumParts == 0 || NumParts >= Limit)
1943 return 1;
1944 unsigned Sz = getNumElements(Ty: VecTy);
1945 if (NumParts >= Sz || Sz % NumParts != 0 ||
1946 !hasFullVectorsOrPowerOf2(TTI, Ty: VecTy->getElementType(), Sz: Sz / NumParts))
1947 return 1;
1948 return NumParts;
1949}
1950
1951/// Bottom Up SLP Vectorizer.
1952class slpvectorizer::BoUpSLP {
1953 class TreeEntry;
1954 class ScheduleEntity;
1955 class ScheduleData;
1956 class ScheduleCopyableData;
1957 class ScheduleBundle;
1958 class ShuffleCostEstimator;
1959 class ShuffleInstructionBuilder;
1960
  /// If we decide to generate strided load / store, this struct contains all
  /// the necessary info. Its fields are calculated by analyzeRtStrideCandidate
  /// and analyzeConstantStrideCandidate. Note that Stride can be given either
  /// as a SCEV or as a Value if it already exists. To get the stride in bytes,
  /// StrideVal (or the value obtained from StrideSCEV) has to be multiplied by
  /// the element size of the FixedVectorType.
  struct StridedPtrInfo {
    // Stride as an already-existing IR value; null if only known as a SCEV.
    Value *StrideVal = nullptr;
    // Stride as a SCEV expression; used when no IR value is available yet.
    const SCEV *StrideSCEV = nullptr;
    // Vector type of the strided load to be generated.
    FixedVectorType *Ty = nullptr;
  };
1972 SmallDenseMap<TreeEntry *, StridedPtrInfo> TreeEntryToStridedPtrInfoMap;
1973
1974public:
1975 /// Tracks the state we can represent the loads in the given sequence.
1976 enum class LoadsState {
1977 Gather,
1978 Vectorize,
1979 ScatterVectorize,
1980 StridedVectorize,
1981 CompressVectorize
1982 };
1983
1984 using ValueList = SmallVector<Value *, 8>;
1985 using InstrList = SmallVector<Instruction *, 16>;
1986 using ValueSet = SmallPtrSet<Value *, 16>;
1987 using StoreList = SmallVector<StoreInst *, 8>;
1988 using ExtraValueToDebugLocsMap = SmallDenseSet<Value *, 4>;
1989 using OrdersType = SmallVector<unsigned, 4>;
1990
  /// Constructs the vectorizer over function \p Func, caching all analyses
  /// and deriving the min/max vector register sizes from the target (or from
  /// command-line overrides).
  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
        Builder(Se->getContext(), TargetFolder(*DL)) {
    // Ephemeral values (only feeding assumes) must not be vectorized.
    CodeMetrics::collectEphemeralValues(L: F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize =
          TTI->getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedValue();

    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }
2017
2018 /// Vectorize the tree that starts with the elements in \p VL.
2019 /// Returns the vectorized root.
2020 Value *vectorizeTree();
2021
  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this set can be replaced by the
  /// generated extractvalue instructions.
2025 Value *
2026 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
2027 Instruction *ReductionRoot = nullptr,
2028 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
2029 VectorValuesAndScales = {});
2030
2031 /// \returns the cost incurred by unwanted spills and fills, caused by
2032 /// holding live values over call sites.
2033 InstructionCost getSpillCost();
2034
2035 /// Calculates the cost of the subtrees, trims non-profitable ones and returns
2036 /// final cost.
2037 InstructionCost
2038 calculateTreeCostAndTrimNonProfitable(ArrayRef<Value *> VectorizedVals = {});
2039
2040 /// \returns the vectorization cost of the subtree that starts at \p VL.
2041 /// A negative number means that this is profitable.
2042 InstructionCost getTreeCost(InstructionCost TreeCost,
2043 ArrayRef<Value *> VectorizedVals = {},
2044 InstructionCost ReductionCost = TTI::TCC_Free);
2045
2046 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
2047 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
2048 void buildTree(ArrayRef<Value *> Roots,
2049 const SmallDenseSet<Value *> &UserIgnoreLst);
2050
2051 /// Construct a vectorizable tree that starts at \p Roots.
2052 void buildTree(ArrayRef<Value *> Roots);
2053
2054 /// Return the scalars of the root node.
2055 ArrayRef<Value *> getRootNodeScalars() const {
2056 assert(!VectorizableTree.empty() && "No graph to get the first node from");
2057 return VectorizableTree.front()->Scalars;
2058 }
2059
2060 /// Returns the type/is-signed info for the root node in the graph without
2061 /// casting.
2062 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
2063 const TreeEntry &Root = *VectorizableTree.front();
2064 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
2065 !Root.Scalars.front()->getType()->isIntegerTy())
2066 return std::nullopt;
2067 auto It = MinBWs.find(Val: &Root);
2068 if (It != MinBWs.end())
2069 return std::make_pair(x: IntegerType::get(C&: Root.Scalars.front()->getContext(),
2070 NumBits: It->second.first),
2071 y: It->second.second);
2072 if (Root.getOpcode() == Instruction::ZExt ||
2073 Root.getOpcode() == Instruction::SExt)
2074 return std::make_pair(x: cast<CastInst>(Val: Root.getMainOp())->getSrcTy(),
2075 y: Root.getOpcode() == Instruction::SExt);
2076 return std::nullopt;
2077 }
2078
2079 /// Checks if the root graph node can be emitted with narrower bitwidth at
2080 /// codegen and returns it signedness, if so.
2081 bool isSignedMinBitwidthRootNode() const {
2082 return MinBWs.at(Val: VectorizableTree.front().get()).second;
2083 }
2084
2085 /// Returns reduction type after minbitdth analysis.
2086 FixedVectorType *getReductionType() const {
2087 if (ReductionBitWidth == 0 ||
2088 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
2089 ReductionBitWidth >=
2090 DL->getTypeSizeInBits(
2091 Ty: VectorizableTree.front()->Scalars.front()->getType()))
2092 return getWidenedType(
2093 ScalarTy: VectorizableTree.front()->Scalars.front()->getType(),
2094 VF: VectorizableTree.front()->getVectorFactor());
2095 return getWidenedType(
2096 ScalarTy: IntegerType::get(
2097 C&: VectorizableTree.front()->Scalars.front()->getContext(),
2098 NumBits: ReductionBitWidth),
2099 VF: VectorizableTree.front()->getVectorFactor());
2100 }
2101
2102 /// Returns true if the tree results in one of the reduced bitcasts variants.
2103 bool isReducedBitcastRoot() const {
2104 return VectorizableTree.front()->hasState() &&
2105 (VectorizableTree.front()->CombinedOp == TreeEntry::ReducedBitcast ||
2106 VectorizableTree.front()->CombinedOp ==
2107 TreeEntry::ReducedBitcastBSwap ||
2108 VectorizableTree.front()->CombinedOp ==
2109 TreeEntry::ReducedBitcastLoads ||
2110 VectorizableTree.front()->CombinedOp ==
2111 TreeEntry::ReducedBitcastBSwapLoads) &&
2112 VectorizableTree.front()->State == TreeEntry::Vectorize;
2113 }
2114
2115 /// Returns true if the tree results in the reduced cmp bitcast root.
2116 bool isReducedCmpBitcastRoot() const {
2117 return VectorizableTree.front()->hasState() &&
2118 VectorizableTree.front()->CombinedOp ==
2119 TreeEntry::ReducedCmpBitcast &&
2120 VectorizableTree.front()->State == TreeEntry::Vectorize;
2121 }
2122
2123 /// Builds external uses of the vectorized scalars, i.e. the list of
2124 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
2125 /// ExternallyUsedValues contains additional list of external uses to handle
2126 /// vectorization of reductions.
2127 void
2128 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
2129
2130 /// Transforms graph nodes to target specific representations, if profitable.
2131 void transformNodes();
2132
2133 /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    // Tree structure and scalar-to-entry maps.
    VectorizableTree.clear();
    ScalarToTreeEntries.clear();
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    OperandsToTreeEntry.clear();
    ScalarsInSplitNodes.clear();
    MustGather.clear();
    NonScheduledFirst.clear();
    // Cached codegen positions.
    EntryToLastInstruction.clear();
    LastInstructionToPos.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    CompressEntryToData.clear();
    // External-use bookkeeping.
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    ExternalUsesWithNonUsers.clear();
    // Per-block scheduling regions are reset, not destroyed, so they can be
    // reused by the next buildTree invocation.
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    // Minimum-bitwidth analysis results.
    MinBWs.clear();
    ReductionBitWidth = 0;
    BaseGraphSize = 1;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    TreeEntryToStridedPtrInfoMap.clear();
  }
2167
  /// \returns the number of entries in the vectorizable tree.
  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// Returns the base graph size, before any transformations.
  unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
2172
2173 /// Perform LICM and CSE on the newly generated gather sequences.
2174 void optimizeGatherSequence();
2175
2176 /// Does this non-empty order represent an identity order? Identity
2177 /// should be represented as an empty order, so this is used to
2178 /// decide if we can canonicalize a computed order. Undef elements
2179 /// (represented as size) are ignored.
2180 static bool isIdentityOrder(ArrayRef<unsigned> Order) {
2181 assert(!Order.empty() && "expected non-empty order");
2182 const unsigned Sz = Order.size();
2183 return all_of(Range: enumerate(First&: Order), P: [&](const auto &P) {
2184 return P.value() == P.index() || P.value() == Sz;
2185 });
2186 }
2187
2188 /// Checks if the specified gather tree entry \p TE can be represented as a
2189 /// shuffled vector entry + (possibly) permutation with other gathers. It
2190 /// implements the checks only for possibly ordered scalars (Loads,
2191 /// ExtractElement, ExtractValue), which can be part of the graph.
2192 /// \param TopToBottom If true, used for the whole tree rotation, false - for
2193 /// sub-tree rotations. \param IgnoreReorder true, if the order of the root
2194 /// node might be ignored.
2195 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE,
2196 bool TopToBottom,
2197 bool IgnoreReorder);
2198
2199 /// Sort loads into increasing pointers offsets to allow greater clustering.
2200 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
2201
2202 /// Gets reordering data for the given tree entry. If the entry is vectorized
2203 /// - just return ReorderIndices, otherwise check if the scalars can be
2204 /// reordered and return the most optimal order.
2205 /// \return std::nullopt if ordering is not important, empty order, if
2206 /// identity order is important, or the actual order.
2207 /// \param TopToBottom If true, include the order of vectorized stores and
2208 /// insertelement nodes, otherwise skip them.
2209 /// \param IgnoreReorder true, if the root node order can be ignored.
2210 std::optional<OrdersType>
2211 getReorderingData(const TreeEntry &TE, bool TopToBottom, bool IgnoreReorder);
2212
2213 /// Checks if it is profitable to reorder the current tree.
2214 /// If the tree does not contain many profitable reordable nodes, better to
2215 /// skip it to save compile time.
2216 bool isProfitableToReorder() const;
2217
2218 /// Reorders the current graph to the most profitable order starting from the
2219 /// root node to the leaf nodes. The best order is chosen only from the nodes
2220 /// of the same size (vectorization factor). Smaller nodes are considered
2221 /// parts of subgraph with smaller VF and they are reordered independently. We
2222 /// can make it because we still need to extend smaller nodes to the wider VF
2223 /// and we can merge reordering shuffles with the widening shuffles.
2224 void reorderTopToBottom();
2225
2226 /// Reorders the current graph to the most profitable order starting from
2227 /// leaves to the root. It allows to rotate small subgraphs and reduce the
2228 /// number of reshuffles if the leaf nodes use the same order. In this case we
2229 /// can merge the orders and just shuffle user node instead of shuffling its
2230 /// operands. Plus, even the leaf nodes have different orders, it allows to
2231 /// sink reordering in the graph closer to the root node and merge it later
2232 /// during analysis.
2233 void reorderBottomToTop(bool IgnoreReorder = false);
2234
2235 /// \return The vector element size in bits to use when vectorizing the
2236 /// expression tree ending at \p V. If V is a store, the size is the width of
2237 /// the stored value. Otherwise, the size is the width of the largest loaded
2238 /// value reaching V. This method is used by the vectorizer to calculate
2239 /// vectorization factors.
2240 unsigned getVectorElementSize(Value *V);
2241
2242 /// Compute the minimum type sizes required to represent the entries in a
2243 /// vectorizable tree.
2244 void computeMinimumValueSizes();
2245
  // \returns the maximum vector register size, as set by TTI or overridden by
  // the MaxVectorRegSizeOption command-line flag (see the constructor).
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns the minimum vector register size, as set by TTI or overridden by
  // the MinVectorRegSizeOption command-line flag (see the constructor).
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }
2255
2256 unsigned getMinVF(unsigned Sz) const {
2257 return std::max(a: 2U, b: getMinVecRegSize() / Sz);
2258 }
2259
2260 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
2261 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
2262 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
2263 return MaxVF ? MaxVF : UINT_MAX;
2264 }
2265
2266 /// Check if homogeneous aggregate is isomorphic to some VectorType.
2267 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
2268 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
2269 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
2270 ///
2271 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
2272 unsigned canMapToVector(Type *T) const;
2273
2274 /// \returns True if the VectorizableTree is both tiny and not fully
2275 /// vectorizable. We do not vectorize such trees.
2276 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
2277
2278 /// Checks if the graph and all its subgraphs cannot be better vectorized.
2279 /// It may happen, if all gather nodes are loads and they cannot be
2280 /// "clusterized". In this case even subgraphs cannot be vectorized more
2281 /// effectively than the base graph.
2282 bool isTreeNotExtendable() const;
2283
2284 bool isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2285 Align Alignment, const int64_t Diff,
2286 const size_t Sz) const;
2287
2288 /// Return true if an array of scalar loads can be replaced with a strided
2289 /// load (with constant stride).
2290 ///
2291 /// It is possible that the load gets "widened". Suppose that originally each
2292 /// load loads `k` bytes and `PointerOps` can be arranged as follows (`%s` is
2293 /// constant): %b + 0 * %s + 0 %b + 0 * %s + 1 %b + 0 * %s + 2
2294 /// ...
2295 /// %b + 0 * %s + (w - 1)
2296 ///
2297 /// %b + 1 * %s + 0
2298 /// %b + 1 * %s + 1
2299 /// %b + 1 * %s + 2
2300 /// ...
2301 /// %b + 1 * %s + (w - 1)
2302 /// ...
2303 ///
2304 /// %b + (n - 1) * %s + 0
2305 /// %b + (n - 1) * %s + 1
2306 /// %b + (n - 1) * %s + 2
2307 /// ...
2308 /// %b + (n - 1) * %s + (w - 1)
2309 ///
2310 /// In this case we will generate a strided load of type `<n x (k * w)>`.
2311 ///
2312 /// \param PointerOps list of pointer arguments of loads.
2313 /// \param ElemTy original scalar type of loads.
2314 /// \param Alignment alignment of the first load.
2315 /// \param SortedIndices is the order of PointerOps as returned by
2316 /// `sortPtrAccesses`
  /// \param Diff Pointer difference between the lowest and the highest pointer
  /// in `PointerOps` as returned by `getPointersDiff`.
2319 /// \param Ptr0 first pointer in `PointersOps`.
2320 /// \param PtrN last pointer in `PointersOps`.
2321 /// \param SPtrInfo If the function return `true`, it also sets all the fields
2322 /// of `SPtrInfo` necessary to generate the strided load later.
2323 bool analyzeConstantStrideCandidate(
2324 const ArrayRef<Value *> PointerOps, Type *ElemTy, Align Alignment,
2325 const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
2326 Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const;
2327
2328 /// Return true if an array of scalar loads can be replaced with a strided
2329 /// load (with run-time stride).
  /// \param PointerOps list of pointer arguments of loads.
  /// \param ScalarTy type of loads.
  /// \param CommonAlignment common alignment of loads as computed by
  /// `computeCommonAlignment<LoadInst>`.
  /// \param SortedIndices is a list of indices computed by this function such
  /// that the sequence `PointerOps[SortedIndices[0]],
  /// PointerOps[SortedIndices[1]], ..., PointerOps[SortedIndices[n]]` is
  /// ordered by the coefficient of the stride. For example, if PointerOps is
  /// `%base + %stride, %base, %base + 2 * stride` the `SortedIndices` will be
  /// `[1, 0, 2]`. We follow the convention that if `SortedIndices` would be
  /// `0, 1, 2, 3, ...` we return an empty vector for `SortedIndices`.
  /// \param SPtrInfo If the function returns `true`, it also sets all the
  /// fields of `SPtrInfo` necessary to generate the strided load later.
2343 bool analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps, Type *ScalarTy,
2344 Align CommonAlignment,
2345 SmallVectorImpl<unsigned> &SortedIndices,
2346 StridedPtrInfo &SPtrInfo) const;
2347
  /// Checks if the given array of loads can be represented as a vectorized,
  /// scatter or just simple gather.
  /// \param VL list of loads.
  /// \param VL0 main load value.
  /// \param Order returned order of load instructions.
  /// \param PointerOps returned list of pointer operands.
  /// \param SPtrInfo used to pass the strided-load parameters discovered
  /// during the check (see `analyzeRtStrideCandidate` /
  /// `analyzeConstantStrideCandidate`).
  /// \param BestVF return best vector factor, if recursive check found better
  /// vectorization sequences rather than masked gather.
  /// \param TryRecursiveCheck used to check if long masked gather can be
  /// represented as a series of loads/insert subvector, if profitable.
  LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
                               SmallVectorImpl<unsigned> &Order,
                               SmallVectorImpl<Value *> &PointerOps,
                               StridedPtrInfo &SPtrInfo,
                               unsigned *BestVF = nullptr,
                               bool TryRecursiveCheck = true) const;
2364
  /// Registers the sequence of loads \p VL as known non-vectorizable, so that
  /// later attempts on the same sequence can bail out early. The sequence is
  /// remembered by its hash only.
  template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
    ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
  }
2369
  /// Checks if the given loads sequence was previously registered as not
  /// vectorizable (compared by the hash of the sequence; see
  /// `registerNonVectorizableLoads`).
  template <typename T>
  bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
    return ListOfKnonwnNonVectorizableLoads.contains(V: hash_value(VL));
  }
2375
2376 OptimizationRemarkEmitter *getORE() { return ORE; }
2377
  /// This structure holds any data we need about the edges being traversed
  /// during buildTreeRec(). We keep track of:
  /// (i) the user TreeEntry index, and
  /// (ii) the index of the edge.
  struct EdgeInfo {
    EdgeInfo() = default;
    EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
        : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
    /// The user TreeEntry.
    TreeEntry *UserTE = nullptr;
    /// The operand index of the use.
    unsigned EdgeIdx = UINT_MAX;
#ifndef NDEBUG
    friend inline raw_ostream &operator<<(raw_ostream &OS,
                                          const BoUpSLP::EdgeInfo &EI) {
      EI.dump(OS);
      return OS;
    }
    /// Debug print.
    void dump(raw_ostream &OS) const {
      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";
    }
    LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
#endif
    /// Two edges are equal iff they denote the same operand of the same user
    /// tree entry.
    bool operator == (const EdgeInfo &Other) const {
      return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
    }

    /// \returns true if this edge points at an actual user tree entry.
    operator bool() const { return UserTE != nullptr; }
  };
  /// Allow DenseMapInfo to hash/compare EdgeInfo keys.
  friend struct DenseMapInfo<EdgeInfo>;
2410
  /// A helper class used for scoring candidates for two consecutive lanes.
  class LookAheadHeuristics {
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    int NumLanes; // Total number of lanes (aka vectorization factor).
    int MaxLevel; // The maximum recursion depth for accumulating score.

  public:
    LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
                        ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
                        int MaxLevel)
        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}

    // The hard-coded scores listed here are not very important, though it shall
    // be higher for better matches to improve the resulting cost. When
    // computing the scores of matching one sub-tree with another, we are
    // basically counting the number of values that are matching. So even if all
    // scores are set to 1, we would still get a decent matching result.
    // However, sometimes we have to break ties. For example we may have to
    // choose between matching loads vs matching opcodes. This is what these
    // scores are helping us with: they provide the order of preference. Also,
    // this is important if the scalar is externally used or used in another
    // tree entry node in the different lane.

    /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
    static const int ScoreConsecutiveLoads = 4;
    /// The same load multiple times. This should have a better score than
    /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
    /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
    /// a vector load and 1.0 for a broadcast.
    static const int ScoreSplatLoads = 3;
    /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
    static const int ScoreReversedLoads = 3;
    /// A load candidate for masked gather.
    static const int ScoreMaskedGatherCandidate = 1;
    /// ExtractElementInst from same vector and consecutive indexes.
    static const int ScoreConsecutiveExtracts = 4;
    /// ExtractElementInst from same vector and reversed indices.
    static const int ScoreReversedExtracts = 3;
    /// Constants.
    static const int ScoreConstants = 2;
    /// Instructions with the same opcode.
    static const int ScoreSameOpcode = 2;
    /// Instructions with alt opcodes (e.g, add + sub).
    static const int ScoreAltOpcodes = 1;
    /// Identical instructions (a.k.a. splat or broadcast).
    static const int ScoreSplat = 1;
    /// Matching with an undef is preferable to failing.
    static const int ScoreUndef = 1;
    /// Score for failing to find a decent match.
    static const int ScoreFail = 0;
    /// Score if all users are vectorized.
    static const int ScoreAllUserVectorized = 1;

    /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
    /// \p U1 and \p U2 are the users of \p V1 and \p V2.
    /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
    /// MainAltOps.
    int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
                        ArrayRef<Value *> MainAltOps) const {
      if (!isValidElementType(Ty: V1->getType()) ||
          !isValidElementType(Ty: V2->getType()))
        return LookAheadHeuristics::ScoreFail;

      if (V1 == V2) {
        if (isa<LoadInst>(Val: V1)) {
          // Returns true if the users of V1 and V2 won't need to be extracted.
          auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
            // Bail out if we have too many uses to save compilation time.
            if (V1->hasNUsesOrMore(N: UsesLimit) || V2->hasNUsesOrMore(N: UsesLimit))
              return false;

            auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return llvm::all_of(Range: V->users(), P: [U1, U2, this](Value *U) {
                return U == U1 || U == U2 || R.isVectorized(V: U);
              });
            };
            return AllUsersVectorized(V1) && AllUsersVectorized(V2);
          };
          // A broadcast of a load can be cheaper on some targets.
          if (R.TTI->isLegalBroadcastLoad(ElementTy: V1->getType(),
                                          NumElements: ElementCount::getFixed(MinVal: NumLanes)) &&
              ((int)V1->getNumUses() == NumLanes ||
               AllUsersAreInternal(V1, V2)))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreSplat;
      }

      // If V1 and V2 already belong to a common tree entry, placing them in
      // consecutive lanes is as good as a splat of loads; otherwise fail.
      auto CheckSameEntryOrFail = [&]() {
        if (ArrayRef<TreeEntry *> TEs1 = R.getTreeEntries(V: V1); !TEs1.empty()) {
          SmallPtrSet<TreeEntry *, 4> Set(llvm::from_range, TEs1);
          if (ArrayRef<TreeEntry *> TEs2 = R.getTreeEntries(V: V2);
              !TEs2.empty() &&
              any_of(Range&: TEs2, P: [&](TreeEntry *E) { return Set.contains(Ptr: E); }))
            return LookAheadHeuristics::ScoreSplatLoads;
        }
        return LookAheadHeuristics::ScoreFail;
      };

      auto *LI1 = dyn_cast<LoadInst>(Val: V1);
      auto *LI2 = dyn_cast<LoadInst>(Val: V2);
      if (LI1 && LI2) {
        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
            !LI2->isSimple())
          return CheckSameEntryOrFail();

        std::optional<int64_t> Dist = getPointersDiff(
            ElemTyA: LI1->getType(), PtrA: LI1->getPointerOperand(), ElemTyB: LI2->getType(),
            PtrB: LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
        if (!Dist || *Dist == 0) {
          if (getUnderlyingObject(V: LI1->getPointerOperand()) ==
                  getUnderlyingObject(V: LI2->getPointerOperand()) &&
              R.TTI->isLegalMaskedGather(
                  DataType: getWidenedType(ScalarTy: LI1->getType(), VF: NumLanes), Alignment: LI1->getAlign()))
            return LookAheadHeuristics::ScoreMaskedGatherCandidate;
          return CheckSameEntryOrFail();
        }
        // The distance is too large - still may be profitable to use masked
        // loads/gathers.
        if (std::abs(i: *Dist) > NumLanes / 2)
          return LookAheadHeuristics::ScoreMaskedGatherCandidate;
        // This still will detect consecutive loads, but we might have "holes"
        // in some cases. It is ok for non-power-2 vectorization and may produce
        // better results. It should not affect current vectorization.
        return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
                           : LookAheadHeuristics::ScoreReversedLoads;
      }

      auto *C1 = dyn_cast<Constant>(Val: V1);
      auto *C2 = dyn_cast<Constant>(Val: V2);
      if (C1 && C2)
        return LookAheadHeuristics::ScoreConstants;

      // Consider constants and buildvector compatible.
      if ((C1 && isa<InsertElementInst>(Val: V2)) ||
          (C2 && isa<InsertElementInst>(Val: V1)))
        return LookAheadHeuristics::ScoreConstants;

      // Extracts from consecutive indexes of the same vector better score as
      // the extracts could be optimized away.
      Value *EV1;
      ConstantInt *Ex1Idx;
      if (match(V: V1, P: m_ExtractElt(Val: m_Value(V&: EV1), Idx: m_ConstantInt(CI&: Ex1Idx)))) {
        // Undefs are always profitable for extractelements.
        // Compiler can easily combine poison and extractelement <non-poison> or
        // undef and extractelement <poison>. But combining undef +
        // extractelement <non-poison-but-may-produce-poison> requires some
        // extra operations.
        if (isa<UndefValue>(Val: V2))
          return (isa<PoisonValue>(Val: V2) || isUndefVector(V: EV1).all())
                     ? LookAheadHeuristics::ScoreConsecutiveExtracts
                     : LookAheadHeuristics::ScoreSameOpcode;
        Value *EV2 = nullptr;
        ConstantInt *Ex2Idx = nullptr;
        if (match(V: V2,
                  P: m_ExtractElt(Val: m_Value(V&: EV2), Idx: m_CombineOr(L: m_ConstantInt(CI&: Ex2Idx),
                                                         R: m_Undef())))) {
          // Undefs are always profitable for extractelements.
          if (!Ex2Idx)
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (isUndefVector(V: EV2).all() && EV2->getType() == EV1->getType())
            return LookAheadHeuristics::ScoreConsecutiveExtracts;
          if (EV2 == EV1) {
            int Idx1 = Ex1Idx->getZExtValue();
            int Idx2 = Ex2Idx->getZExtValue();
            int Dist = Idx2 - Idx1;
            // The distance is too large - still may be profitable to use
            // shuffles.
            if (std::abs(x: Dist) == 0)
              return LookAheadHeuristics::ScoreSplat;
            if (std::abs(x: Dist) > NumLanes / 2)
              return LookAheadHeuristics::ScoreSameOpcode;
            return (Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveExtracts
                              : LookAheadHeuristics::ScoreReversedExtracts;
          }
          return LookAheadHeuristics::ScoreAltOpcodes;
        }
        return CheckSameEntryOrFail();
      }

      auto *I1 = dyn_cast<Instruction>(Val: V1);
      auto *I2 = dyn_cast<Instruction>(Val: V2);
      if (I1 && I2) {
        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();
        SmallVector<Value *, 4> Ops(MainAltOps);
        Ops.push_back(Elt: I1);
        Ops.push_back(Elt: I2);
        InstructionsState S = getSameOpcode(VL: Ops, TLI);
        // Note: Only consider instructions with <= 2 operands to avoid
        // complexity explosion.
        if (S &&
            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
            all_of(Range&: Ops, P: [&S](Value *V) {
              return isa<PoisonValue>(Val: V) ||
                     cast<Instruction>(Val: V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();
            }))
          return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
                                  : LookAheadHeuristics::ScoreSameOpcode;
      }

      if (I1 && isa<PoisonValue>(Val: V2))
        return LookAheadHeuristics::ScoreSameOpcode;

      if (isa<UndefValue>(Val: V2))
        return LookAheadHeuristics::ScoreUndef;

      return CheckSameEntryOrFail();
    }

    /// Go through the operands of \p LHS and \p RHS recursively until
    /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
    /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
    /// of \p U1 and \p U2), except at the beginning of the recursion where
    /// these are set to nullptr.
    ///
    /// For example:
    /// \verbatim
    ///  A[0]  B[0]  A[1]  B[1]  C[0] D[0]  B[1] A[1]
    ///     \ /         \ /         \ /        \ /
    ///      +           +           +          +
    ///     G1          G2          G3         G4
    /// \endverbatim
    /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
    /// each level recursively, accumulating the score. It starts from matching
    /// the additions at level 0, then moves on to the loads (level 1). The
    /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
    /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
    /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
    /// Please note that the order of the operands does not matter, as we
    /// evaluate the score of all profitable combinations of operands. In
    /// other words the score of G1 and G4 is the same as G1 and G2. This
    /// heuristic is based on ideas described in:
    ///   Look-ahead SLP: Auto-vectorization in the presence of commutative
    ///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
    ///   Luís F. W. Góes
    int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
                           Instruction *U2, int CurrLevel,
                           ArrayRef<Value *> MainAltOps) const {

      // Get the shallow score of V1 and V2.
      int ShallowScoreAtThisLevel =
          getShallowScore(V1: LHS, V2: RHS, U1, U2, MainAltOps);

      // If reached MaxLevel,
      // or if V1 and V2 are not instructions,
      // or if they are SPLAT,
      // or if they are not consecutive,
      // or if profitable to vectorize loads or extractelements, early return
      // the current cost.
      auto *I1 = dyn_cast<Instruction>(Val: LHS);
      auto *I2 = dyn_cast<Instruction>(Val: RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
          (((isa<LoadInst>(Val: I1) && isa<LoadInst>(Val: I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(Val: I1) && isa<ExtractElementInst>(Val: I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      // Contains the I2 operand indexes that got matched with I1 operands.
      SmallSet<unsigned, 4> Op2Used;

      // Recursion towards the operands of I1 and I2. We are trying all possible
      // operand pairs, and keeping track of the best score.
      if (I1->getNumOperands() != I2->getNumOperands())
        return LookAheadHeuristics::ScoreSameOpcode;
      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        // Try to pair op1I with the best operand of I2.
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;
        // If I2 is commutative try all combinations.
        unsigned FromIdx = isCommutative(I: I2) ? 0 : OpIdx1;
        unsigned ToIdx = isCommutative(I: I2)
                             ? I2->getNumOperands()
                             : std::min(a: I2->getNumOperands(), b: OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          // Skip operands already paired with OpIdx1.
          if (Op2Used.count(V: OpIdx2))
            continue;
          // Recursively calculate the cost at each level
          int TmpScore =
              getScoreAtLevelRec(LHS: I1->getOperand(i: OpIdx1), RHS: I2->getOperand(i: OpIdx2),
                                 U1: I1, U2: I2, CurrLevel: CurrLevel + 1, MainAltOps: {});
          // Look for the best score.
          if (TmpScore > LookAheadHeuristics::ScoreFail &&
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
            MaxOpIdx2 = OpIdx2;
            FoundBest = true;
          }
        }
        if (FoundBest) {
          // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
          Op2Used.insert(V: MaxOpIdx2);
          ShallowScoreAtThisLevel += MaxTmpScore;
        }
      }
      return ShallowScoreAtThisLevel;
    }
  };
  /// A helper data structure to hold the operands of a vector of instructions.
  /// This supports a fixed vector length for all operand vectors.
  class VLOperands {
    /// For each operand we need (i) the value, and (ii) the opcode that it
    /// would be attached to if the expression was in a left-linearized form.
    /// This is required to avoid illegal operand reordering.
    /// For example:
    /// \verbatim
    ///                         0 Op1
    ///                         |/
    /// Op1 Op2   Linearized    + Op2
    ///   \ /     ---------->   |/
    ///    -                    -
    ///
    /// Op1 - Op2            (0 + Op1) - Op2
    /// \endverbatim
    ///
    /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
    ///
    /// Another way to think of this is to track all the operations across the
    /// path from the operand all the way to the root of the tree and to
    /// calculate the operation that corresponds to this path. For example, the
    /// path from Op2 to the root crosses the RHS of the '-', therefore the
    /// corresponding operation is a '-' (which matches the one in the
    /// linearized tree, as shown above).
    ///
    /// For lack of a better term, we refer to this operation as Accumulated
    /// Path Operation (APO).
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}
      /// The operand value.
      Value *V = nullptr;
      /// TreeEntries only allow a single opcode, or an alternate sequence of
      /// them (e.g, +, -). Therefore, we can safely use a boolean value for the
      /// APO. It is set to 'true' if 'V' is attached to an inverse operation
      /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
      /// (e.g., Add/Mul)
      bool APO = false;
      /// Helper data for the reordering function.
      bool IsUsed = false;
    };
2765
    /// During operand reordering, we are trying to select the operand at lane
    /// that matches best with the operand at the neighboring lane. Our
    /// selection is based on the type of value we are looking for. For example,
    /// if the neighboring lane has a load, we need to look for a load that is
    /// accessing a consecutive address. These strategies are summarized in the
    /// 'ReorderingMode' enumerator.
    enum class ReorderingMode {
      Load,     ///< Matching loads to consecutive memory addresses
      Opcode,   ///< Matching instructions based on opcode (same or alternate)
      Constant, ///< Matching constants
      Splat,    ///< Matching the same instruction multiple times (broadcast)
      Failed,   ///< We failed to create a vectorizable group
    };

    /// One row of operands: an OperandData per lane.
    using OperandDataVec = SmallVector<OperandData, 2>;

    /// A vector of operand vectors.
    SmallVector<OperandDataVec, 4> OpsVec;
    /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
    /// is not IntrinsicInst, ArgSize is User::getNumOperands.
    unsigned ArgSize = 0;

    // Analyses and context used while scoring candidate reorderings.
    const TargetLibraryInfo &TLI;
    const DataLayout &DL;
    ScalarEvolution &SE;
    const BoUpSLP &R;
    /// Enclosing loop, if any; loop-invariant values are treated like
    /// constants when matching in Constant mode (see getBestOperand).
    const Loop *L = nullptr;
2793
    /// \returns the operand data at \p OpIdx and \p Lane
    /// (i.e. OpsVec[OpIdx][Lane]).
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    /// \returns the operand data at \p OpIdx and \p Lane. Const version.
    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }
2803
2804 /// Clears the used flag for all entries.
2805 void clearUsed() {
2806 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2807 OpIdx != NumOperands; ++OpIdx)
2808 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2809 ++Lane)
2810 OpsVec[OpIdx][Lane].IsUsed = false;
2811 }
2812
2813 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2814 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2815 std::swap(a&: OpsVec[OpIdx1][Lane], b&: OpsVec[OpIdx2][Lane]);
2816 }
2817
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score due to possible broadcasting of the
    /// elements in the lane. It is more profitable to have power-of-2 unique
    /// elements in the lane, it will be vectorized with higher probability
    /// after removing duplicates. Currently the SLP vectorizer supports only
    /// vectorization of the power-of-2 number of unique scalars.
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
                      const SmallBitVector &UsedLanes) const {
      Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
      if (!isa<Instruction>(Val: IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(Val: IdxLaneV))
        return 0;
      // Collect the values currently at OpIdx in all other lanes, remembering
      // the first lane each unique value appears in.
      SmallDenseMap<Value *, unsigned, 4> Uniques;
      for (unsigned Ln : seq<unsigned>(Size: getNumLanes())) {
        if (Ln == Lane)
          continue;
        Value *OpIdxLnV = getData(OpIdx, Lane: Ln).V;
        if (!isa<Instruction>(Val: OpIdxLnV))
          return 0;
        Uniques.try_emplace(Key: OpIdxLnV, Args&: Ln);
      }
      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(Val: IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(Val: OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
        return 0;
      // Score by distance of each resulting unique-value count from the
      // nearest power-of-2 boundary (bit_floor/bit_ceil), so that candidates
      // keeping the number of unique scalars closer to a power of 2 win.
      return std::min(a: bit_ceil(Value: UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      b: UniquesCntWithOpIdxLaneV -
                          bit_floor(Value: UniquesCntWithOpIdxLaneV)) -
             ((IdxIt != Uniques.end() && UsedLanes.test(Idx: IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(Value: UniquesCntWithIdxLaneV)
                  : bit_ceil(Value: UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    }
2860
    /// \param Lane lane of the operands under analysis.
    /// \param OpIdx operand index in \p Lane lane we're looking the best
    /// candidate for.
    /// \param Idx operand index of the current candidate value.
    /// \returns The additional score for the scalar whose users are all
    /// vectorized.
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(OpIdx: Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      // Do not care about number of uses for vector-like instructions
      // (extractelement/extractvalue with constant indices), they are extracts
      // themselves and already externally used. Vectorization of such
      // instructions does not add extra extractelement instruction, just may
      // remove it.
      if (isVectorLikeInstWithConstOps(V: IdxLaneV) &&
          isVectorLikeInstWithConstOps(V: OpIdxLaneV))
        return LookAheadHeuristics::ScoreAllUserVectorized;
      auto *IdxLaneI = dyn_cast<Instruction>(Val: IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(Val: OpIdxLaneV))
        return 0;
      return R.areAllUsersVectorized(I: IdxLaneI)
                 ? LookAheadHeuristics::ScoreAllUserVectorized
                 : 0;
    }
2885
    /// Score scaling factor for fully compatible instructions but with
    /// different number of external uses. Allows better selection of the
    /// instructions with fewer external uses.
    static const int ScoreScaleFactor = 10;
2890
    /// \Returns the look-ahead score, which tells us how much the sub-trees
    /// rooted at \p LHS and \p RHS match, the more they match the higher the
    /// score. This helps break ties in an informed way when we cannot decide on
    /// the order of the operands by just considering the immediate
    /// predecessors. Sets \p IsUsed when a non-zero score is produced so the
    /// candidate gets marked as consumed by the caller.
    int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
                          int Lane, unsigned OpIdx, unsigned Idx,
                          bool &IsUsed, const SmallBitVector &UsedLanes) {
      LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
                                    LookAheadMaxDepth);
      // Keep track of the instruction stack as we recurse into the operands
      // during the look-ahead score exploration.
      int Score =
          LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
                                       /*CurrLevel=*/1, MainAltOps);
      if (Score) {
        int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
        if (Score <= -SplatScore) {
          // Failed score.
          Score = 0;
        } else {
          Score += SplatScore;
          // Scale score to see the difference between different operands
          // and similar operands but all vectorized/not all vectorized
          // uses. It does not affect actual selection of the best
          // compatible operand in general, just allows to select the
          // operand with all vectorized uses.
          Score *= ScoreScaleFactor;
          Score += getExternalUseScore(Lane, OpIdx, Idx);
          IsUsed = true;
        }
      }
      return Score;
    }
2925
    /// Best defined scores per lanes between the passes. Used to choose the
    /// best operand (with the highest score) between the passes.
    /// The key - {Operand Index, Lane}.
    /// The value - the best score between the passes for the lane and the
    /// operand.
    SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
        BestScoresPerLanes;
2933
    // Search all operands in Ops[*][Lane] for the one that matches best
    // Ops[OpIdx][LastLane] and return its operand index.
    // If no good match can be found, return std::nullopt.
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                   ArrayRef<ReorderingMode> ReorderingModes,
                   ArrayRef<Value *> MainAltOps,
                   const SmallBitVector &UsedLanes) {
      unsigned NumOperands = getNumOperands();

      // The operand of the previous lane at OpIdx.
      Value *OpLastLane = getData(OpIdx, Lane: LastLane).V;

      // Our strategy mode for OpIdx.
      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      // The linearized opcode of the operand at OpIdx, Lane.
      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      // The best operand index and its score.
      // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
      // are using the score to differentiate between the two.
      struct BestOpData {
        std::optional<unsigned> Idx;
        unsigned Score = 0;
      } BestOp;
      // Seed the score with the best score recorded for this slot on a
      // previous pass, so we only accept strictly better candidates.
      BestOp.Score =
          BestScoresPerLanes.try_emplace(Key: std::make_pair(x&: OpIdx, y&: Lane), Args: 0)
              .first->second;

      // Track if the operand must be marked as used. If the operand is set to
      // Score 1 explicitly (because of non power-of-2 unique scalars, we may
      // want to reestimate the operands again on the following iterations).
      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;
      // Iterate through all unused operands and look for the best.
      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        // Get the operand at Idx and Lane.
        OperandData &OpData = getData(OpIdx: Idx, Lane);
        Value *Op = OpData.V;
        bool OpAPO = OpData.APO;

        // Skip already selected operands.
        if (OpData.IsUsed)
          continue;

        // Skip if we are trying to move the operand to a position with a
        // different opcode in the linearized tree form. This would break the
        // semantics.
        if (OpAPO != OpIdxAPO)
          continue;

        // Look for an operand that matches the current mode.
        switch (RMode) {
        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(LHS: OpLeft, RHS: OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          // On a tie, prefer keeping the operand at its original position
          // (Idx == OpIdx).
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
               Idx == OpIdx)) {
            BestOp.Idx = Idx;
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] = Score;
          }
          break;
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Val: Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(V: Op))) {
            BestOp.Idx = Idx;
            if (isa<Constant>(Val: Op)) {
              BestOp.Score = LookAheadHeuristics::ScoreConstants;
              BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
                  LookAheadHeuristics::ScoreConstants;
            }
            if (isa<UndefValue>(Val: Op) || !isa<Constant>(Val: Op))
              IsUsed = false;
          }
          break;
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Val: Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestOp.Score = LookAheadHeuristics::ScoreSplat;
              BestScoresPerLanes[std::make_pair(x&: OpIdx, y&: Lane)] =
                  LookAheadHeuristics::ScoreSplat;
            }
            BestOp.Idx = Idx;
          }
          break;
        case ReorderingMode::Failed:
          llvm_unreachable("Not expected Failed reordering mode.");
        }
      }

      if (BestOp.Idx) {
        getData(OpIdx: *BestOp.Idx, Lane).IsUsed = IsUsed;
        return BestOp.Idx;
      }
      // If we could not find a good match return std::nullopt.
      return std::nullopt;
    }
3043
    /// Helper for reorderOperandVecs.
    /// \returns the lane that we should start reordering from. This is the one
    /// which has the least number of operands that can freely move about, or
    /// is least profitable because it already has the most optimal set of
    /// operands.
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;
      // std::pair<unsigned, unsigned> is used to implement a simple voting
      // algorithm and choose the lane with the least number of operands that
      // can freely move about or less profitable because it already has the
      // most optimal set of operands. The first unsigned is a counter for
      // voting, the second unsigned is the counter of lanes with instructions
      // with same/alternate opcodes and same parent basic block.
      MapVector<unsigned, std::pair<unsigned, unsigned>> HashMap;
      // Try to be closer to the original results, if we have multiple lanes
      // with same cost. If 2 lanes have the same cost, use the one with the
      // highest index.
      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);
        // Compare the number of operands that can move and choose the one with
        // the least number.
        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap.clear();
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
          // Select the most optimal lane in terms of number of operands that
          // should be moved around.
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(x: 1, y&: Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          // Same cost as the current best: cast a vote for this ordering hash.
          auto [It, Inserted] =
              HashMap.try_emplace(Key: NumFreeOpsHash.Hash, Args: 1, Args&: Lane);
          if (!Inserted)
            ++It->second.first;
        }
      }
      // Select the lane with the minimum counter.
      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;
      for (const auto &Data : reverse(C&: HashMap)) {
        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
      }
      return BestLane;
    }
3097
/// Data structure that helps to reorder operands.
/// A default-constructed instance (NumOfAPOs == UINT_MAX, zero counters) is
/// returned by getMaxNumOperandsThatCanBeReordered for an all-undef lane.
struct OperandsOrderData {
  /// The best number of operands with the same APOs, which can be
  /// reordered.
  unsigned NumOfAPOs = UINT_MAX;
  /// Number of operands with the same/alternate instruction opcode and
  /// parent.
  unsigned NumOpsWithSameOpcodeParent = 0;
  /// Hash for the actual operands ordering.
  /// Used to count operands, actually their position id and opcode
  /// value. It is used in the voting mechanism to find the lane with the
  /// least number of operands that can freely move about or less profitable
  /// because it already has the most optimal set of operands. Can be
  /// replaced with SmallVector<unsigned> instead but hash code is faster
  /// and requires less memory.
  unsigned Hash = 0;
};
3115 /// \returns the maximum number of operands that are allowed to be reordered
3116 /// for \p Lane and the number of compatible instructions(with the same
3117 /// parent/opcode). This is used as a heuristic for selecting the first lane
3118 /// to start operand reordering.
3119 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
3120 unsigned CntTrue = 0;
3121 unsigned NumOperands = getNumOperands();
3122 // Operands with the same APO can be reordered. We therefore need to count
3123 // how many of them we have for each APO, like this: Cnt[APO] = x.
3124 // Since we only have two APOs, namely true and false, we can avoid using
3125 // a map. Instead we can simply count the number of operands that
3126 // correspond to one of them (in this case the 'true' APO), and calculate
3127 // the other by subtracting it from the total number of operands.
3128 // Operands with the same instruction opcode and parent are more
3129 // profitable since we don't need to move them in many cases, with a high
3130 // probability such lane already can be vectorized effectively.
3131 bool AllUndefs = true;
3132 unsigned NumOpsWithSameOpcodeParent = 0;
3133 Instruction *OpcodeI = nullptr;
3134 BasicBlock *Parent = nullptr;
3135 unsigned Hash = 0;
3136 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
3137 const OperandData &OpData = getData(OpIdx, Lane);
3138 if (OpData.APO)
3139 ++CntTrue;
3140 // Use Boyer-Moore majority voting for finding the majority opcode and
3141 // the number of times it occurs.
3142 if (auto *I = dyn_cast<Instruction>(Val: OpData.V)) {
3143 if (!OpcodeI || !getSameOpcode(VL: {OpcodeI, I}, TLI) ||
3144 I->getParent() != Parent) {
3145 if (NumOpsWithSameOpcodeParent == 0) {
3146 NumOpsWithSameOpcodeParent = 1;
3147 OpcodeI = I;
3148 Parent = I->getParent();
3149 } else {
3150 --NumOpsWithSameOpcodeParent;
3151 }
3152 } else {
3153 ++NumOpsWithSameOpcodeParent;
3154 }
3155 }
3156 Hash = hash_combine(
3157 args: Hash, args: hash_value(value: (OpIdx + 1) * (OpData.V->getValueID() + 1)));
3158 AllUndefs = AllUndefs && isa<UndefValue>(Val: OpData.V);
3159 }
3160 if (AllUndefs)
3161 return {};
3162 OperandsOrderData Data;
3163 Data.NumOfAPOs = std::max(a: CntTrue, b: NumOperands - CntTrue);
3164 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
3165 Data.Hash = Hash;
3166 return Data;
3167 }
3168
3169 /// Go through the instructions in VL and append their operands.
3170 void appendOperands(ArrayRef<Value *> VL, ArrayRef<ValueList> Operands,
3171 const InstructionsState &S) {
3172 assert(!Operands.empty() && !VL.empty() && "Bad list of operands");
3173 assert((empty() || all_of(Operands,
3174 [this](const ValueList &VL) {
3175 return VL.size() == getNumLanes();
3176 })) &&
3177 "Expected same number of lanes");
3178 assert(S.valid() && "InstructionsState is invalid.");
3179 // IntrinsicInst::isCommutative returns true if swapping the first "two"
3180 // arguments to the intrinsic produces the same result.
3181 Instruction *MainOp = S.getMainOp();
3182 unsigned NumOperands = MainOp->getNumOperands();
3183 ArgSize = ::getNumberOfPotentiallyCommutativeOps(I: MainOp);
3184 OpsVec.resize(N: ArgSize);
3185 unsigned NumLanes = VL.size();
3186 for (OperandDataVec &Ops : OpsVec)
3187 Ops.resize(N: NumLanes);
3188 for (unsigned Lane : seq<unsigned>(Size: NumLanes)) {
3189 // Our tree has just 3 nodes: the root and two operands.
3190 // It is therefore trivial to get the APO. We only need to check the
3191 // opcode of V and whether the operand at OpIdx is the LHS or RHS
3192 // operand. The LHS operand of both add and sub is never attached to an
3193 // inversese operation in the linearized form, therefore its APO is
3194 // false. The RHS is true only if V is an inverse operation.
3195
3196 // Since operand reordering is performed on groups of commutative
3197 // operations or alternating sequences (e.g., +, -), we can safely tell
3198 // the inverse operations by checking commutativity.
3199 auto *I = dyn_cast<Instruction>(Val: VL[Lane]);
3200 if (!I && isa<PoisonValue>(Val: VL[Lane])) {
3201 for (unsigned OpIdx : seq<unsigned>(Size: NumOperands))
3202 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false};
3203 continue;
3204 }
3205 bool IsInverseOperation = false;
3206 if (S.isCopyableElement(V: VL[Lane])) {
3207 // The value is a copyable element.
3208 IsInverseOperation =
3209 !isCommutative(I: MainOp, ValWithUses: VL[Lane], /*IsCopyable=*/true);
3210 } else {
3211 assert(I && "Expected instruction");
3212 auto [SelectedOp, Ops] = convertTo(I, S);
3213 // We cannot check commutativity by the converted instruction
3214 // (SelectedOp) because isCommutative also examines def-use
3215 // relationships.
3216 IsInverseOperation = !isCommutative(I: SelectedOp, ValWithUses: I);
3217 }
3218 for (unsigned OpIdx : seq<unsigned>(Size: ArgSize)) {
3219 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
3220 OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false};
3221 }
3222 }
3223 }
3224
/// \returns the number of operands (the potentially commutative leading
/// operands tracked in OpsVec).
unsigned getNumOperands() const { return ArgSize; }

/// \returns the number of lanes.
/// NOTE(review): reads OpsVec[0], so OpsVec must be non-empty -- call only
/// after appendOperands has populated the operand vectors.
unsigned getNumLanes() const { return OpsVec[0].size(); }

/// \returns the operand value at \p OpIdx and \p Lane.
Value *getValue(unsigned OpIdx, unsigned Lane) const {
  return getData(OpIdx, Lane).V;
}

/// \returns true if the data structure is empty.
bool empty() const { return OpsVec.empty(); }

/// Clears the data.
void clear() { OpsVec.clear(); }
3241
/// \Returns true if there are enough operands identical to \p Op to fill
/// the whole vector (it is mixed with constants or loop invariant values).
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
  assert(Op == getValue(OpIdx, Lane) &&
         "Op is expected to be getValue(OpIdx, Lane).");
  // Small number of loads - try load matching.
  if (isa<LoadInst>(Val: Op) && getNumLanes() == 2 && getNumOperands() == 2)
    return false;
  bool OpAPO = getData(OpIdx, Lane).APO;
  bool IsInvariant = L && L->isLoopInvariant(V: Op);
  // Number of other lanes where Op itself (not a stand-in) was found.
  unsigned Cnt = 0;
  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
    if (Ln == Lane)
      continue;
    // This is set to true if we found a candidate for broadcast at Lane.
    bool FoundCandidate = false;
    for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
      OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
      // Only operands with matching APO that are still available qualify.
      if (Data.APO != OpAPO || Data.IsUsed)
        continue;
      Value *OpILane = getValue(OpIdx: OpI, Lane);
      bool IsConstantOp = isa<Constant>(Val: OpILane);
      // Consider the broadcast candidate if:
      // 1. Same value is found in one of the operands.
      if (Data.V == Op ||
          // 2. The operand in the given lane is not constant but there is a
          // constant operand in another lane (which can be moved to the
          // given lane). In this case we can represent it as a simple
          // permutation of constant and broadcast.
          (!IsConstantOp &&
           ((Lns > 2 && isa<Constant>(Val: Data.V)) ||
            // 2.1. If we have only 2 lanes, need to check that value in the
            // next lane does not build same opcode sequence.
            (Lns == 2 &&
             !getSameOpcode(VL: {Op, getValue(OpIdx: (OpI + 1) % OpE, Lane: Ln)}, TLI) &&
             isa<Constant>(Val: Data.V)))) ||
          // 3. The operand in the current lane is loop invariant (can be
          // hoisted out) and another operand is also a loop invariant
          // (though not a constant). In this case the whole vector can be
          // hoisted out.
          // FIXME: need to teach the cost model about this case for better
          // estimation.
          (IsInvariant && !isa<Constant>(Val: Data.V) &&
           !getSameOpcode(VL: {Op, Data.V}, TLI) &&
           L->isLoopInvariant(V: Data.V))) {
        FoundCandidate = true;
        // Consume the slot only when it is Op itself; constant/invariant
        // stand-ins remain available for other lanes.
        Data.IsUsed = Data.V == Op;
        if (Data.V == Op)
          ++Cnt;
        break;
      }
    }
    if (!FoundCandidate)
      return false;
  }
  // With 2 lanes a single match suffices; otherwise Op must appear in more
  // than one other lane to be worth broadcasting.
  return getNumLanes() == 2 || Cnt > 1;
}
3300
/// Checks if there is at least single compatible operand in lanes other
/// than \p Lane, compatible with the operand \p Op.
bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
  assert(Op == getValue(OpIdx, Lane) &&
         "Op is expected to be getValue(OpIdx, Lane).");
  bool OpAPO = getData(OpIdx, Lane).APO;
  // Every other lane must contribute at least one "compatible" operand.
  for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
    if (Ln == Lane)
      continue;
    if (any_of(Range: seq<unsigned>(Size: getNumOperands()), P: [&](unsigned OpI) {
          const OperandData &Data = getData(OpIdx: OpI, Lane: Ln);
          // NOTE(review): an APO mismatch or already-used operand makes
          // this lane count as compatible immediately (any_of succeeds) --
          // confirm this is intended rather than `return false` to merely
          // skip the operand.
          if (Data.APO != OpAPO || Data.IsUsed)
            return true;
          Value *OpILn = getValue(OpIdx: OpI, Lane: Ln);
          // Compatible if loop-invariant, or same opcode in the same block.
          return (L && L->isLoopInvariant(V: OpILn)) ||
                 (getSameOpcode(VL: {Op, OpILn}, TLI) &&
                  allSameBlock(VL: {Op, OpILn}));
        }))
      return true;
  }
  return false;
}
3323
3324 public:
/// Initialize with all the operands of the instruction vector \p RootVL.
/// Also caches the loop enclosing the main instruction (if any) so the
/// reordering heuristics can query loop invariance.
VLOperands(ArrayRef<Value *> RootVL, ArrayRef<ValueList> Operands,
           const InstructionsState &S, const BoUpSLP &R)
    : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
      L(R.LI->getLoopFor(BB: S.getMainOp()->getParent())) {
  // Append all the operands of RootVL.
  appendOperands(VL: RootVL, Operands, S);
}
3333
3334 /// \Returns a value vector with the operands across all lanes for the
3335 /// opearnd at \p OpIdx.
3336 ValueList getVL(unsigned OpIdx) const {
3337 ValueList OpVL(OpsVec[OpIdx].size());
3338 assert(OpsVec[OpIdx].size() == getNumLanes() &&
3339 "Expected same num of lanes across all operands");
3340 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
3341 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
3342 return OpVL;
3343 }
3344
// Performs operand reordering for 2 or more operands.
// The original operands are in OrigOps[OpIdx][Lane].
// The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
void reorder() {
  unsigned NumOperands = getNumOperands();
  unsigned NumLanes = getNumLanes();
  // Each operand has its own mode. We are using this mode to help us select
  // the instructions for each lane, so that they match best with the ones
  // we have selected so far.
  SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

  // This is a greedy single-pass algorithm. We are going over each lane
  // once and deciding on the best order right away with no back-tracking.
  // However, in order to increase its effectiveness, we start with the lane
  // that has operands that can move the least. For example, given the
  // following lanes:
  //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
  //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
  //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
  //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
  // we will start at Lane 1, since the operands of the subtraction cannot
  // be reordered. Then we will visit the rest of the lanes in a circular
  // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

  // Find the first lane that we will start our search from.
  unsigned FirstLane = getBestLaneToStartReordering();

  // Initialize the modes based on what the first lane's operands look like.
  for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
    Value *OpLane0 = getValue(OpIdx, Lane: FirstLane);
    // Keep track if we have instructions with all the same opcode on one
    // side.
    if (auto *OpILane0 = dyn_cast<Instruction>(Val: OpLane0)) {
      // Check if OpLane0 should be broadcast.
      if (shouldBroadcast(Op: OpLane0, OpIdx, Lane: FirstLane) ||
          !canBeVectorized(Op: OpILane0, OpIdx, Lane: FirstLane))
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
      else if (isa<LoadInst>(Val: OpILane0))
        ReorderingModes[OpIdx] = ReorderingMode::Load;
      else
        ReorderingModes[OpIdx] = ReorderingMode::Opcode;
    } else if (isa<Constant>(Val: OpLane0)) {
      ReorderingModes[OpIdx] = ReorderingMode::Constant;
    } else if (isa<Argument>(Val: OpLane0)) {
      // Our best hope is a Splat. It may save some cost in some cases.
      ReorderingModes[OpIdx] = ReorderingMode::Splat;
    } else {
      llvm_unreachable("Unexpected value kind.");
    }
  }

  // Check that we don't have same operands. No need to reorder if operands
  // are just perfect diamond or shuffled diamond match. Do not do it only
  // for possible broadcasts or non-power of 2 number of scalars (just for
  // now).
  auto &&SkipReordering = [this]() {
    SmallPtrSet<Value *, 4> UniqueValues;
    ArrayRef<OperandData> Op0 = OpsVec.front();
    for (const OperandData &Data : Op0)
      UniqueValues.insert(Ptr: Data.V);
    // Every other operand list must be a (possibly shuffled) copy of the
    // first one's value set for reordering to be skippable.
    for (ArrayRef<OperandData> Op :
         ArrayRef(OpsVec).slice(N: 1, M: getNumOperands() - 1)) {
      if (any_of(Range&: Op, P: [&UniqueValues](const OperandData &Data) {
            return !UniqueValues.contains(Ptr: Data.V);
          }))
        return false;
    }
    // TODO: Check if we can remove a check for non-power-2 number of
    // scalars after full support of non-power-2 vectorization.
    return UniqueValues.size() != 2 &&
           hasFullVectorsOrPowerOf2(TTI: *R.TTI, Ty: Op0.front().V->getType(),
                                    Sz: UniqueValues.size());
  };

  // If the initial strategy fails for any of the operand indexes, then we
  // perform reordering again in a second pass. This helps avoid assigning
  // high priority to the failed strategy, and should improve reordering for
  // the non-failed operand indexes.
  for (int Pass = 0; Pass != 2; ++Pass) {
    // Check if no need to reorder operands since they are perfect or
    // shuffled diamond match.
    // Need to do it to avoid extra external use cost counting for
    // shuffled matches, which may cause regressions.
    if (SkipReordering())
      break;
    // Skip the second pass if the first pass did not fail.
    bool StrategyFailed = false;
    // Mark all operand data as free to use.
    clearUsed();
    // We keep the original operand order for the FirstLane, so reorder the
    // rest of the lanes. We are visiting the nodes in a circular fashion,
    // using FirstLane as the center point and increasing the radius
    // distance.
    SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
    for (unsigned I = 0; I < NumOperands; ++I)
      MainAltOps[I].push_back(Elt: getData(OpIdx: I, Lane: FirstLane).V);

    SmallBitVector UsedLanes(NumLanes);
    UsedLanes.set(FirstLane);
    for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
      // Visit the lane on the right and then the lane on the left.
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
          continue;
        UsedLanes.set(Lane);
        // The previously visited lane in this direction is the reference
        // we try to match against.
        int LastLane = Lane - Direction;
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&
               "Out of bounds");
        // Look for a good match for each operand.
        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          // Search for the operand that matches SortedOps[OpIdx][Lane-1].
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps: MainAltOps[OpIdx], UsedLanes);
          // By not selecting a value, we allow the operands that follow to
          // select a better matching value. We will get a non-null value in
          // the next run of getBestOperand().
          if (BestIdx) {
            // Swap the current operand with the one returned by
            // getBestOperand().
            swap(OpIdx1: OpIdx, OpIdx2: *BestIdx, Lane);
          } else {
            // Enable the second pass.
            StrategyFailed = true;
          }
          // Try to get the alternate opcode and follow it during analysis.
          if (MainAltOps[OpIdx].size() != 2) {
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
                getSameOpcode(VL: {MainAltOps[OpIdx].front(), AltOp.V}, TLI);
            if (OpS && OpS.isAltShuffle())
              MainAltOps[OpIdx].push_back(Elt: AltOp.V);
          }
        }
      }
    }
    // Skip second pass if the strategy did not fail.
    if (!StrategyFailed)
      break;
  }
}
3487
3488#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3489 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
3490 switch (RMode) {
3491 case ReorderingMode::Load:
3492 return "Load";
3493 case ReorderingMode::Opcode:
3494 return "Opcode";
3495 case ReorderingMode::Constant:
3496 return "Constant";
3497 case ReorderingMode::Splat:
3498 return "Splat";
3499 case ReorderingMode::Failed:
3500 return "Failed";
3501 }
3502 llvm_unreachable("Unimplemented Reordering Type");
3503 }
3504
/// Prints the textual name of \p RMode to \p OS; returns \p OS for chaining.
LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
                                               raw_ostream &OS) {
  return OS << getModeStr(RMode);
}

/// Debug print to dbgs().
LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
  printMode(RMode, dbgs());
}

/// Stream operator so a ReorderingMode can be printed directly.
friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
  return printMode(RMode, OS);
}
3518
3519 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
3520 const unsigned Indent = 2;
3521 unsigned Cnt = 0;
3522 for (const OperandDataVec &OpDataVec : OpsVec) {
3523 OS << "Operand " << Cnt++ << "\n";
3524 for (const OperandData &OpData : OpDataVec) {
3525 OS.indent(Indent) << "{";
3526 if (Value *V = OpData.V)
3527 OS << *V;
3528 else
3529 OS << "null";
3530 OS << ", APO:" << OpData.APO << "}\n";
3531 }
3532 OS << "\n";
3533 }
3534 return OS;
3535 }
3536
/// Debug print to dbgs().
LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
3540 };
3541
3542 /// Evaluate each pair in \p Candidates and return index into \p Candidates
3543 /// for a pair which have highest score deemed to have best chance to form
3544 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
3545 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
3546 /// of the cost, considered to be good enough score.
3547 std::optional<int>
3548 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
3549 int Limit = LookAheadHeuristics::ScoreFail) const {
3550 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
3551 RootLookAheadMaxDepth);
3552 int BestScore = Limit;
3553 std::optional<int> Index;
3554 for (int I : seq<int>(Begin: 0, End: Candidates.size())) {
3555 int Score = LookAhead.getScoreAtLevelRec(LHS: Candidates[I].first,
3556 RHS: Candidates[I].second,
3557 /*U1=*/nullptr, /*U2=*/nullptr,
3558 /*CurrLevel=*/1, MainAltOps: {});
3559 if (Score > BestScore) {
3560 BestScore = Score;
3561 Index = I;
3562 }
3563 }
3564 return Index;
3565 }
3566
/// Checks if the instruction is marked for deletion.
bool isDeleted(Instruction *I) const { return DeletedInstructions.count(V: I); }

/// Removes an instruction from its block and eventually deletes it.
/// It's like Instruction::eraseFromParent() except that the actual deletion
/// is delayed until BoUpSLP is destructed.
void eraseInstruction(Instruction *I) {
  DeletedInstructions.insert(V: I);
}
3576
/// Remove instructions from the parent function and clear the operands of \p
/// DeadVals instructions, marking for deletion trivially dead operands.
template <typename T>
void removeInstructionsAndOperands(
    ArrayRef<T *> DeadVals,
    ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
        VectorValuesAndScales) {
  SmallVector<WeakTrackingVH> DeadInsts;
  // Step 1: mark every dead value as deleted up front so the checks below
  // treat the whole set consistently.
  for (T *V : DeadVals) {
    auto *I = cast<Instruction>(V);
    eraseInstruction(I);
  }
  // Step 2: queue trivially dead single-user operands for later cleanup and
  // drop the dead values' references.
  DenseSet<Value *> Processed;
  for (T *V : DeadVals) {
    if (!V || !Processed.insert(V).second)
      continue;
    auto *I = cast<Instruction>(V);
    salvageDebugInfo(*I);
    ArrayRef<TreeEntry *> Entries = getTreeEntries(V: I);
    for (Use &U : I->operands()) {
      // Skip operands that serve as a tree entry's vectorized value --
      // those must stay alive.
      if (auto *OpI = dyn_cast_if_present<Instruction>(Val: U.get());
          OpI && !DeletedInstructions.contains(V: OpI) && OpI->hasOneUser() &&
          wouldInstructionBeTriviallyDead(I: OpI, TLI) &&
          (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
            return Entry->VectorizedValue == OpI;
          })))
        DeadInsts.push_back(Elt: OpI);
    }
    I->dropAllReferences();
  }
  // Step 3: detach the dead values from their basic blocks (actual memory
  // deletion is deferred, see eraseInstruction).
  for (T *V : DeadVals) {
    auto *I = cast<Instruction>(V);
    if (!I->getParent())
      continue;
    assert((I->use_empty() || all_of(I->uses(),
                                     [&](Use &U) {
                                       return isDeleted(
                                           cast<Instruction>(U.getUser()));
                                     })) &&
           "trying to erase instruction with users.");
    I->removeFromParent();
    SE->forgetValue(V: I);
  }
  // Process the dead instruction list until empty.
  while (!DeadInsts.empty()) {
    Value *V = DeadInsts.pop_back_val();
    Instruction *VI = cast_or_null<Instruction>(Val: V);
    if (!VI || !VI->getParent())
      continue;
    assert(isInstructionTriviallyDead(VI, TLI) &&
           "Live instruction found in dead worklist!");
    assert(VI->use_empty() && "Instructions with uses are not dead.");

    // Don't lose the debug info while deleting the instructions.
    salvageDebugInfo(I&: *VI);

    // Null out all of the instruction's operands to see if any operand
    // becomes dead as we go.
    for (Use &OpU : VI->operands()) {
      Value *OpV = OpU.get();
      if (!OpV)
        continue;
      OpU.set(nullptr);

      if (!OpV->use_empty())
        continue;

      // If the operand is an instruction that became dead as we nulled out
      // the operand, and if it is 'trivially' dead, delete it in a future
      // loop iteration. Vector values referenced by VectorValuesAndScales
      // are kept alive.
      if (auto *OpI = dyn_cast<Instruction>(Val: OpV))
        if (!DeletedInstructions.contains(V: OpI) &&
            (!OpI->getType()->isVectorTy() ||
             none_of(
                 VectorValuesAndScales,
                 [&](const std::tuple<WeakTrackingVH, unsigned, bool, bool>
                         &V) { return std::get<0>(t: V) == OpI; })) &&
            isInstructionTriviallyDead(I: OpI, TLI))
          DeadInsts.push_back(Elt: OpI);
    }

    VI->removeFromParent();
    eraseInstruction(I: VI);
    SE->forgetValue(V: VI);
  }
}
3663
/// Checks if the instruction was already analyzed for being possible
/// reduction root.
bool isAnalyzedReductionRoot(Instruction *I) const {
  return AnalyzedReductionsRoots.count(Ptr: I);
}
/// Register given instruction as already analyzed for being possible
/// reduction root.
void analyzedReductionRoot(Instruction *I) {
  AnalyzedReductionsRoots.insert(Ptr: I);
}
/// Checks if the provided list of reduced values was checked already for
/// vectorization.
/// Keyed by the hash of the value list, so order matters and (rare) hash
/// collisions may yield false positives -- acceptable for a cache.
bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
  return AnalyzedReductionVals.contains(V: hash_value(S: VL));
}
/// Adds the list of reduced values to list of already checked values for the
/// vectorization.
void analyzedReductionVals(ArrayRef<Value *> VL) {
  AnalyzedReductionVals.insert(V: hash_value(S: VL));
}
/// Clear the list of the analyzed reduction root instructions.
void clearReductionData() {
  AnalyzedReductionsRoots.clear();
  AnalyzedReductionVals.clear();
  AnalyzedMinBWVals.clear();
}
/// Checks if any of the given values is gathered in one of the nodes.
bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
  return any_of(Range: MustGather, P: [&](Value *V) { return Vals.contains(V); });
}
/// Checks if the given value is gathered in one of the nodes.
bool isGathered(const Value *V) const {
  return MustGather.contains(Ptr: V);
}
/// Checks if the specified value was not scheduled.
bool isNotScheduled(const Value *V) const {
  return NonScheduledFirst.contains(Ptr: V);
}

/// Check if the value is vectorized in the tree.
/// A value counts as vectorized only if at least one containing tree entry
/// is still alive (not deleted and not transformed back into a gather).
bool isVectorized(const Value *V) const {
  assert(V && "V cannot be nullptr.");
  ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
  return any_of(Range&: Entries, P: [&](const TreeEntry *E) {
    return !DeletedNodes.contains(Ptr: E) && !TransformedToGatherNodes.contains(Val: E);
  });
}
3711
3712 ~BoUpSLP();
3713
3714private:
3715 /// Determine if a node \p E in can be demoted to a smaller type with a
3716 /// truncation. We collect the entries that will be demoted in ToDemote.
3717 /// \param E Node for analysis
3718 /// \param ToDemote indices of the nodes to be demoted.
3719 bool collectValuesToDemote(
3720 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
3721 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
3722 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
3723 bool &IsProfitableToDemote, bool IsTruncRoot) const;
3724
3725 /// Builds the list of reorderable operands on the edges \p Edges of the \p
3726 /// UserTE, which allow reordering (i.e. the operands can be reordered because
/// they have only one user and are reorderable).
3728 /// \param ReorderableGathers List of all gather nodes that require reordering
3729 /// (e.g., gather of extractlements or partially vectorizable loads).
3730 /// \param GatherOps List of gather operand nodes for \p UserTE that require
3731 /// reordering, subset of \p NonVectorized.
3732 void buildReorderableOperands(
3733 TreeEntry *UserTE,
3734 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
3735 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
3736 SmallVectorImpl<TreeEntry *> &GatherOps);
3737
3738 /// Checks if the given \p TE is a gather node with clustered reused scalars
3739 /// and reorders it per given \p Mask.
3740 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
3741
3742 /// Checks if all users of \p I are the part of the vectorization tree.
3743 bool areAllUsersVectorized(
3744 Instruction *I,
3745 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3746
3747 /// Return information about the vector formed for the specified index
3748 /// of a vector of (the same) instruction.
3749 TargetTransformInfo::OperandValueInfo
3750 getOperandInfo(ArrayRef<Value *> Ops) const;
3751
3752 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3753 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3754 TreeEntry *getOperandEntry(TreeEntry *E, unsigned Idx) {
3755 return const_cast<TreeEntry *>(
3756 getOperandEntry(E: const_cast<const TreeEntry *>(E), Idx));
3757 }
3758
3759 /// Gets the root instruction for the given node. If the node is a strided
3760 /// load/store node with the reverse order, the root instruction is the last
3761 /// one.
3762 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3763
3764 /// \returns Cast context for the given graph node.
3765 TargetTransformInfo::CastContextHint
3766 getCastContextHint(const TreeEntry &TE) const;
3767
3768 /// \returns the cost of the vectorizable entry.
3769 InstructionCost getEntryCost(const TreeEntry *E,
3770 ArrayRef<Value *> VectorizedVals,
3771 SmallPtrSetImpl<Value *> &CheckedExtracts);
3772
3773 /// Checks if it is legal and profitable to build SplitVectorize node for the
3774 /// given \p VL.
3775 /// \param Op1 first homogeneous scalars.
3776 /// \param Op2 second homogeneous scalars.
3777 /// \param ReorderIndices indices to reorder the scalars.
3778 /// \returns true if the node was successfully built.
3779 bool canBuildSplitNode(ArrayRef<Value *> VL,
3780 const InstructionsState &LocalState,
3781 SmallVectorImpl<Value *> &Op1,
3782 SmallVectorImpl<Value *> &Op2,
3783 OrdersType &ReorderIndices) const;
3784
3785 /// This is the recursive part of buildTree.
3786 void buildTreeRec(ArrayRef<Value *> Roots, unsigned Depth, const EdgeInfo &EI,
3787 unsigned InterleaveFactor = 0);
3788
3789 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3790 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3791 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3792 /// returns false, setting \p CurrentOrder to either an empty vector or a
3793 /// non-identity permutation that allows to reuse extract instructions.
3794 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3795 /// extract order.
3796 bool canReuseExtract(ArrayRef<Value *> VL,
3797 SmallVectorImpl<unsigned> &CurrentOrder,
3798 bool ResizeAllowed = false) const;
3799
3800 /// Vectorize a single entry in the tree.
3801 Value *vectorizeTree(TreeEntry *E);
3802
3803 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3804 /// \p E.
3805 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx);
3806
3807 /// Create a new vector from a list of scalar values. Produces a sequence
3808 /// which exploits values reused across lanes, and arranges the inserts
3809 /// for ease of later optimization.
3810 template <typename BVTy, typename ResTy, typename... Args>
3811 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3812
3813 /// Create a new vector from a list of scalar values. Produces a sequence
3814 /// which exploits values reused across lanes, and arranges the inserts
3815 /// for ease of later optimization.
3816 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
3817
3818 /// Returns the instruction in the bundle, which can be used as a base point
3819 /// for scheduling. Usually it is the last instruction in the bundle, except
3820 /// for the case when all operands are external (in this case, it is the first
3821 /// instruction in the list).
3822 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3823
3824 /// Tries to find extractelement instructions with constant indices from fixed
3825 /// vector type and gather such instructions into a bunch, which highly likely
3826 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3827 /// was successful, the matched scalars are replaced by poison values in \p VL
3828 /// for future analysis.
3829 std::optional<TargetTransformInfo::ShuffleKind>
3830 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3831 SmallVectorImpl<int> &Mask) const;
3832
3833 /// Tries to find extractelement instructions with constant indices from fixed
3834 /// vector type and gather such instructions into a bunch, which highly likely
3835 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
3836 /// was successful, the matched scalars are replaced by poison values in \p VL
3837 /// for future analysis.
3838 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3839 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3840 SmallVectorImpl<int> &Mask,
3841 unsigned NumParts) const;
3842
3843 /// Checks if the gathered \p VL can be represented as a single register
3844 /// shuffle(s) of previous tree entries.
3845 /// \param TE Tree entry checked for permutation.
3846 /// \param VL List of scalars (a subset of the TE scalar), checked for
3847 /// permutations. Must form single-register vector.
3848 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3849 /// commands to build the mask using the original vector value, without
3850 /// relying on the potential reordering.
3851 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3852 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3853 std::optional<TargetTransformInfo::ShuffleKind>
3854 isGatherShuffledSingleRegisterEntry(
3855 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3856 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3857 bool ForOrder);
3858
3859 /// Checks if the gathered \p VL can be represented as multi-register
3860 /// shuffle(s) of previous tree entries.
3861 /// \param TE Tree entry checked for permutation.
3862 /// \param VL List of scalars (a subset of the TE scalar), checked for
3863 /// permutations.
3864 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3865 /// commands to build the mask using the original vector value, without
3866 /// relying on the potential reordering.
3867 /// \returns per-register series of ShuffleKind, if gathered values can be
3868 /// represented as shuffles of previous tree entries. \p Mask is filled with
3869 /// the shuffle mask (also on per-register base).
3870 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
3871 isGatherShuffledEntry(
3872 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3873 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
3874 unsigned NumParts, bool ForOrder = false);
3875
3876 /// \returns the cost of gathering (inserting) the values in \p VL into a
3877 /// vector.
3878 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3879 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3880 Type *ScalarTy) const;
3881
3882 /// Set the Builder insert point to one after the last instruction in
3883 /// the bundle
3884 void setInsertPointAfterBundle(const TreeEntry *E);
3885
3886 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
3887 /// specified, the starting vector value is poison.
3888 Value *
3889 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3890 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3891
3892 /// \returns whether the VectorizableTree is fully vectorizable and will
3893 /// be beneficial even the tree height is tiny.
3894 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3895
3896 /// Run through the list of all gathered loads in the graph and try to find
3897 /// vector loads/masked gathers instead of regular gathers. Later these loads
3898 /// are reshufled to build final gathered nodes.
3899 void tryToVectorizeGatheredLoads(
3900 const SmallMapVector<
3901 std::tuple<BasicBlock *, Value *, Type *>,
3902 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
3903 &GatheredLoads);
3904
3905 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3906 /// users of \p TE and collects the stores. It returns the map from the store
3907 /// pointers to the collected stores.
3908 SmallVector<SmallVector<StoreInst *>>
3909 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3910
3911 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3912 /// stores in \p StoresVec can form a vector instruction. If so it returns
3913 /// true and populates \p ReorderIndices with the shuffle indices of the
3914 /// stores when compared to the sorted vector.
3915 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3916 OrdersType &ReorderIndices) const;
3917
3918 /// Iterates through the users of \p TE, looking for scalar stores that can be
3919 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3920 /// their order and builds an order index vector for each store bundle. It
3921 /// returns all these order vectors found.
3922 /// We run this after the tree has formed, otherwise we may come across user
3923 /// instructions that are not yet in the tree.
3924 SmallVector<OrdersType, 1>
3925 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3926
3927 /// Tries to reorder the gathering node for better vectorization
3928 /// opportunities.
3929 void reorderGatherNode(TreeEntry &TE);
3930
3931 /// Checks if the tree represents disjoint or reduction of shl(zext, (0, 8,
3932 /// .., 56))-like pattern.
3933 /// If the int shifts unique, also strided, but not ordered, sets \p Order.
3934 /// If the node can be represented as a bitcast + bswap, sets \p IsBSwap.
3935 /// If the root nodes are loads, sets \p ForLoads to true.
3936 bool matchesShlZExt(const TreeEntry &TE, OrdersType &Order, bool &IsBSwap,
3937 bool &ForLoads) const;
3938
3939 /// Checks if the \p SelectTE matches zext+selects, which can be inversed for
3940 /// better codegen in case like zext (icmp ne), select (icmp eq), ....
3941 bool matchesInversedZExtSelect(
3942 const TreeEntry &SelectTE,
3943 SmallVectorImpl<unsigned> &InversedCmpsIndices) const;
3944
3945 /// Checks if the tree is reduction or of bit selects, like select %cmp, <1,
3946 /// 2, 4, 8, ..>, zeroinitializer, which can be reduced just to a bitcast %cmp
3947 /// to in.
3948 bool matchesSelectOfBits(const TreeEntry &SelectTE) const;
3949
  /// Represents a single node of the vectorizable tree: the bundle of
  /// scalars to be vectorized together, its vectorization state, its
  /// operands, and the reorder/reuse masks that describe how the scalars
  /// map onto the final vector value.
  class TreeEntry {
  public:
    using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

    /// \returns Common mask for reorder indices and reused scalars.
    SmallVector<int> getCommonMask() const {
      // Split nodes do not carry a combined reorder/reuse mask.
      if (State == TreeEntry::SplitVectorize)
        return {};
      SmallVector<int> Mask;
      inversePermutation(Indices: ReorderIndices, Mask);
      ::addMask(Mask, SubMask: ReuseShuffleIndices);
      return Mask;
    }

    /// \returns The mask for split nodes.
    SmallVector<int> getSplitMask() const {
      assert(State == TreeEntry::SplitVectorize && !ReorderIndices.empty() &&
             "Expected only split vectorize node.");
      // The common VF is the larger of the two sub-node sizes; the offset of
      // the second sub-node within the combined vector is recorded in
      // CombinedEntriesWithIndices.back().second.
      unsigned CommonVF = std::max<unsigned>(
          a: CombinedEntriesWithIndices.back().second,
          b: Scalars.size() - CombinedEntriesWithIndices.back().second);
      // Scale accounts for scalars that are themselves vectors (number of
      // elements per scalar).
      const unsigned Scale = getNumElements(Ty: Scalars.front()->getType());
      CommonVF *= Scale;
      SmallVector<int> Mask(getVectorFactor() * Scale, PoisonMaskElem);
      for (auto [Idx, I] : enumerate(First: ReorderIndices)) {
        for (unsigned K : seq<unsigned>(Size: Scale)) {
          // Lanes belonging to the second sub-node are shifted past the end
          // of the first sub-node's lanes within the combined vector.
          Mask[Scale * I + K] =
              Scale * Idx + K +
              (Idx >= CombinedEntriesWithIndices.back().second
                   ? CommonVF - CombinedEntriesWithIndices.back().second * Scale
                   : 0);
        }
      }
      return Mask;
    }

    /// Updates (reorders) SplitVectorize node according to the given mask \p
    /// Mask and order \p MaskOrder.
    void reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
                          ArrayRef<int> MaskOrder);

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      // Compares VL against Scalars, optionally remapped through Mask.
      // Undef values in VL are allowed to match poison mask lanes.
      auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
        if (Mask.size() != VL.size() && VL.size() == Scalars.size())
          return std::equal(first1: VL.begin(), last1: VL.end(), first2: Scalars.begin());
        return VL.size() == Mask.size() &&
               std::equal(first1: VL.begin(), last1: VL.end(), first2: Mask.begin(),
                          binary_pred: [Scalars](Value *V, int Idx) {
                            return (isa<UndefValue>(Val: V) &&
                                    Idx == PoisonMaskElem) ||
                                   (Idx != PoisonMaskElem && V == Scalars[Idx]);
                          });
      };
      if (!ReorderIndices.empty()) {
        // TODO: implement matching if the nodes are just reordered, still can
        // treat the vector as the same if the list of scalars matches VL
        // directly, without reordering.
        SmallVector<int> Mask;
        inversePermutation(Indices: ReorderIndices, Mask);
        if (VL.size() == Scalars.size())
          return IsSame(Scalars, Mask);
        if (VL.size() == ReuseShuffleIndices.size()) {
          ::addMask(Mask, SubMask: ReuseShuffleIndices);
          return IsSame(Scalars, Mask);
        }
        return false;
      }
      return IsSame(Scalars, ReuseShuffleIndices);
    }

    /// \returns true if current entry has same operands as \p TE.
    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      // Greedy matching of operands in any order: each operand of TE must be
      // matched by a distinct, not-yet-used operand of this entry.
      SmallBitVector Used(getNumOperands());
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (Used.test(Idx: K))
            continue;
          if (getOperand(OpIdx: K) == TE.getOperand(OpIdx: I)) {
            Used.set(K);
            break;
          }
        }
        // Check if we actually found the matching operand.
        if (PrevCount == Used.count())
          return false;
      }
      return true;
    }

    /// \return Final vectorization factor for the node. Defined by the total
    /// number of vectorized scalars, including those, used several times in the
    /// entry and counted in the \a ReuseShuffleIndices, if any.
    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    };

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    WeakTrackingVH VectorizedValue = nullptr;

    /// Do we need to gather this sequence or vectorize it
    /// (either with vector instruction or with scatter/gather
    /// intrinsics for store/load)?
    enum EntryState {
      Vectorize,         ///< The node is regularly vectorized.
      ScatterVectorize,  ///< Masked scatter/gather node.
      StridedVectorize,  ///< Strided loads (and stores)
      CompressVectorize, ///< (Masked) load with compress.
      NeedToGather,      ///< Gather/buildvector node.
      CombinedVectorize, ///< Vectorized node, combined with its user into more
                         ///< complex node like select/cmp to minmax, mul/add to
                         ///< fma, etc. Must be used for the following nodes in
                         ///< the pattern, not the very first one.
      SplitVectorize,    ///< Splits the node into 2 subnodes, vectorizes them
                         ///< independently and then combines back.
    };
    EntryState State;

    /// List of combined opcodes supported by the vectorizer.
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
      FMulAdd,
      ReducedBitcast,
      ReducedBitcastBSwap,
      ReducedBitcastLoads,
      ReducedBitcastBSwapLoads,
      ReducedCmpBitcast,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    /// Does this sequence require some shuffling?
    SmallVector<int, 4> ReuseShuffleIndices;

    /// Does this entry require reordering?
    SmallVector<unsigned, 4> ReorderIndices;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
    /// to be a pointer and needs to be able to initialize the child iterator.
    /// Thus we need a reference back to the container to translate the indices
    /// to entries.
    VecTreeTy &Container;

    /// The TreeEntry index containing the user of this entry.
    EdgeInfo UserTreeIndex;

    /// The index of this treeEntry in VectorizableTree.
    unsigned Idx = 0;

    /// For gather/buildvector/alt opcode nodes, which are combined from
    /// other nodes as a series of insertvector instructions.
    SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;

  private:
    /// The operands of each instruction in each lane Operands[op_index][lane].
    /// Note: This helps avoid the replication of the code that performs the
    /// reordering of operands during buildTreeRec() and vectorizeTree().
    SmallVector<ValueList, 2> Operands;

    /// Copyable elements of the entry node.
    SmallPtrSet<const Value *, 4> CopyableElements;

    /// MainOp and AltOp are recorded inside. S should be obtained from
    /// newTreeEntry.
    InstructionsState S = InstructionsState::invalid();

    /// Interleaving factor for interleaved loads Vectorize nodes.
    unsigned InterleaveFactor = 0;

    /// True if the node does not require scheduling.
    bool DoesNotNeedToSchedule = false;

    /// Set this bundle's \p OpIdx'th operand to \p OpVL.
    void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
      if (Operands.size() < OpIdx + 1)
        Operands.resize(N: OpIdx + 1);
      assert(Operands[OpIdx].empty() && "Already resized?");
      assert(OpVL.size() <= Scalars.size() &&
             "Number of operands is greater than the number of scalars.");
      Operands[OpIdx].resize(N: OpVL.size());
      copy(Range&: OpVL, Out: Operands[OpIdx].begin());
    }

  public:
    /// Returns interleave factor for interleave nodes.
    unsigned getInterleaveFactor() const { return InterleaveFactor; }
    /// Sets interleaving factor for the interleaving nodes.
    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

    /// Marks the node as one that does not require scheduling.
    void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; }
    /// Returns true if the node is marked as one that does not require
    /// scheduling.
    bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; }

    /// Set this bundle's operands from \p Operands.
    void setOperands(ArrayRef<ValueList> Operands) {
      for (unsigned I : seq<unsigned>(Size: Operands.size()))
        setOperand(OpIdx: I, OpVL: Operands[I]);
    }

    /// Reorders operands of the node to the given mask \p Mask.
    void reorderOperands(ArrayRef<int> Mask) {
      for (ValueList &Operand : Operands)
        reorderScalars(Scalars&: Operand, Mask);
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ValueList &getOperand(unsigned OpIdx) {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the \p OpIdx operand of this TreeEntry.
    ArrayRef<Value *> getOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      return Operands[OpIdx];
    }

    /// \returns the number of operands.
    unsigned getNumOperands() const { return Operands.size(); }

    /// \return the single \p OpIdx operand.
    Value *getSingleOperand(unsigned OpIdx) const {
      assert(OpIdx < Operands.size() && "Off bounds");
      assert(!Operands[OpIdx].empty() && "No operand available");
      return Operands[OpIdx][0];
    }

    /// Some of the instructions in the list have alternate opcodes.
    bool isAltShuffle() const { return S.isAltShuffle(); }

    Instruction *getMatchingMainOpOrAltOp(Instruction *I) const {
      return S.getMatchingMainOpOrAltOp(I);
    }

    /// Chooses the correct key for scheduling data. If \p Op has the same (or
    /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
    /// \p OpValue.
    Value *isOneOf(Value *Op) const {
      auto *I = dyn_cast<Instruction>(Val: Op);
      if (I && getMatchingMainOpOrAltOp(I))
        return Op;
      return S.getMainOp();
    }

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
      this->S = S;
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    /// The main/alternate opcodes for the list of instructions.
    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }

    /// Add \p V to the list of copyable elements.
    void addCopyableElement(Value *V) {
      assert(S.isCopyableElement(V) && "Not a copyable element.");
      CopyableElements.insert(Ptr: V);
    }

    /// Returns true if \p V is a copyable element.
    bool isCopyableElement(Value *V) const {
      return CopyableElements.contains(Ptr: V);
    }

    /// Returns true if any scalar in the list is a copyable element.
    bool hasCopyableElements() const { return !CopyableElements.empty(); }

    /// Returns the state of the operations.
    const InstructionsState &getOperations() const { return S; }

    /// When ReuseReorderShuffleIndices is empty it just returns position of \p
    /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
    unsigned findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      // Scan all occurrences of V in Scalars: the first one that survives
      // remapping through ReorderIndices/ReuseShuffleIndices wins.
      for (auto *It = find(Range: Scalars, Val: V), *End = Scalars.end(); It != End;
           std::advance(i&: It, n: 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(first: Scalars.begin(), last: It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(Range: ReuseShuffleIndices, Val: FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(first: ReuseShuffleIndices.begin(), last: RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }

    /// Build a shuffle mask for graph entry which represents a merge of main
    /// and alternate operations.
    void
    buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
                          SmallVectorImpl<int> &Mask,
                          SmallVectorImpl<Value *> *OpScalars = nullptr,
                          SmallVectorImpl<Value *> *AltScalars = nullptr) const;

    /// Return true if this is a non-power-of-2 node.
    bool isNonPowOf2Vec() const {
      bool IsNonPowerOf2 = !has_single_bit(Value: Scalars.size());
      return IsNonPowerOf2;
    }

    /// Return true if this is a node, which tries to vectorize number of
    /// elements, forming whole vectors.
    bool
    hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
      bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
          TTI, Ty: getValueType(V: Scalars.front()), Sz: Scalars.size());
      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;
    }

    /// \returns the scalar at position \p Idx after applying the reorder
    /// permutation (or the raw scalar when no reordering is recorded).
    Value *getOrdered(unsigned Idx) const {
      if (ReorderIndices.empty())
        return Scalars[Idx];
      SmallVector<int> Mask;
      inversePermutation(Indices: ReorderIndices, Mask);
      return Scalars[Mask[Idx]];
    }

#ifndef NDEBUG
    /// Debug printer.
    LLVM_DUMP_METHOD void dump() const {
      dbgs() << Idx << ".\n";
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
          dbgs().indent(2) << *V << "\n";
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
        dbgs().indent(2) << *V << "\n";
      dbgs() << "State: ";
      if (S && hasCopyableElements())
        dbgs() << "[[Copyable]] ";
      switch (State) {
      case Vectorize:
        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
                 << "\n";
        } else {
          dbgs() << "Vectorize\n";
        }
        break;
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
        break;
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        break;
      case CompressVectorize:
        dbgs() << "CompressVectorize\n";
        break;
      case NeedToGather:
        dbgs() << "NeedToGather\n";
        break;
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";
        break;
      case SplitVectorize:
        dbgs() << "SplitVectorize\n";
        break;
      }
      if (S) {
        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";
      } else {
        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";
      }
      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";
      else
        dbgs() << "NULL\n";
      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())
        dbgs() << "Empty";
      else
        for (int ReuseIdx : ReuseShuffleIndices)
          dbgs() << ReuseIdx << ", ";
      dbgs() << "\n";
      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";
      dbgs() << "\n";
      dbgs() << "UserTreeIndex: ";
      if (UserTreeIndex)
        dbgs() << UserTreeIndex;
      else
        dbgs() << "<invalid>";
      dbgs() << "\n";
      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
        interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
        });
        dbgs() << "\n";
      }
    }
#endif
  };
4383
4384#ifndef NDEBUG
4385 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
4386 InstructionCost VecCost, InstructionCost ScalarCost,
4387 StringRef Banner) const {
4388 dbgs() << "SLP: " << Banner << ":\n";
4389 E->dump();
4390 dbgs() << "SLP: Costs:\n";
4391 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
4392 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
4393 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
4394 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
4395 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
4396 }
4397#endif
4398
4399 /// Create a new gather TreeEntry
4400 TreeEntry *newGatherTreeEntry(ArrayRef<Value *> VL,
4401 const InstructionsState &S,
4402 const EdgeInfo &UserTreeIdx,
4403 ArrayRef<int> ReuseShuffleIndices = {}) {
4404 auto Invalid = ScheduleBundle::invalid();
4405 return newTreeEntry(VL, Bundle&: Invalid, S, UserTreeIdx, ReuseShuffleIndices);
4406 }
4407
4408 /// Create a new VectorizableTree entry.
4409 TreeEntry *newTreeEntry(ArrayRef<Value *> VL, ScheduleBundle &Bundle,
4410 const InstructionsState &S,
4411 const EdgeInfo &UserTreeIdx,
4412 ArrayRef<int> ReuseShuffleIndices = {},
4413 ArrayRef<unsigned> ReorderIndices = {},
4414 unsigned InterleaveFactor = 0) {
4415 TreeEntry::EntryState EntryState =
4416 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
4417 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
4418 ReuseShuffleIndices, ReorderIndices);
4419 if (E && InterleaveFactor > 0)
4420 E->setInterleave(InterleaveFactor);
4421 return E;
4422 }
4423
  /// Create a new VectorizableTree entry with the explicit \p EntryState and
  /// register it in all the side tables (scalar-to-entry maps, operand map,
  /// gather maps, scheduler bundle). Returns nullptr when a node must not be
  /// created (re-gathered gathered loads).
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
                          TreeEntry::EntryState EntryState,
                          ScheduleBundle &Bundle, const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          ArrayRef<int> ReuseShuffleIndices = {},
                          ArrayRef<unsigned> ReorderIndices = {}) {
    // Only gather and split nodes may come without a schedule bundle.
    assert(((!Bundle && (EntryState == TreeEntry::NeedToGather ||
                         EntryState == TreeEntry::SplitVectorize)) ||
            (Bundle && EntryState != TreeEntry::NeedToGather &&
             EntryState != TreeEntry::SplitVectorize)) &&
           "Need to vectorize gather entry?");
    // Gathered loads still gathered? Do not create entry, use the original one.
    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)
      return nullptr;
    VectorizableTree.push_back(Elt: std::make_unique<TreeEntry>(args&: VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;
    // Remember which (user node, operand index) pair this entry implements.
    if (UserTreeIdx.UserTE)
      OperandsToTreeEntry.try_emplace(
          Key: std::make_pair(x: UserTreeIdx.UserTE, y: UserTreeIdx.EdgeIdx), Args&: Last);
    // FIXME: Remove once support for ReuseShuffleIndices has been implemented
    // for non-power-of-two vectors.
    assert(
        (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) ||
         ReuseShuffleIndices.empty()) &&
        "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(in_start: ReuseShuffleIndices.begin(),
                                     in_end: ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {
      Last->Scalars.assign(in_start: VL.begin(), in_end: VL.end());
      if (S)
        Last->setOperations(S);
    } else {
      // Reorder scalars and build final mask.
      Last->Scalars.assign(NumElts: VL.size(), Elt: nullptr);
      transform(Range&: ReorderIndices, d_first: Last->Scalars.begin(),
                F: [VL](unsigned Idx) -> Value * {
                  // Out-of-range reorder indices become undef padding lanes.
                  if (Idx >= VL.size())
                    return UndefValue::get(T: VL.front()->getType());
                  return VL[Idx];
                });
      // Recompute the instructions state on the reordered scalars.
      InstructionsState S = getSameOpcode(VL: Last->Scalars, TLI: *TLI);
      if (S)
        Last->setOperations(S);
      Last->ReorderIndices.append(in_start: ReorderIndices.begin(), in_end: ReorderIndices.end());
    }
    if (EntryState == TreeEntry::SplitVectorize) {
      assert(S && "Split nodes must have operations.");
      Last->setOperations(S);
      // Associate every (unique) scalar instruction with this split node.
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        auto *I = dyn_cast<Instruction>(Val: V);
        if (!I)
          continue;
        auto It = ScalarsInSplitNodes.find(Val: V);
        if (It == ScalarsInSplitNodes.end()) {
          ScalarsInSplitNodes.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
          (void)Processed.insert(Ptr: V);
        } else if (Processed.insert(Ptr: V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Elt: Last);
        }
      }
    } else if (!Last->isGather()) {
      // Vectorized node: decide whether it needs scheduling at all.
      if (isa<PHINode>(Val: S.getMainOp()) ||
          isVectorLikeInstWithConstOps(V: S.getMainOp()) ||
          (!S.areInstructionsWithCopyableElements() &&
           doesNotNeedToSchedule(VL)) ||
          all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); }))
        Last->setDoesNotNeedToSchedule();
      // Associate every (unique, non-poison, non-copyable) scalar with this
      // vectorized node.
      SmallPtrSet<Value *, 4> Processed;
      for (Value *V : VL) {
        if (isa<PoisonValue>(Val: V))
          continue;
        if (S.isCopyableElement(V)) {
          Last->addCopyableElement(V);
          continue;
        }
        auto It = ScalarToTreeEntries.find(Val: V);
        if (It == ScalarToTreeEntries.end()) {
          ScalarToTreeEntries.try_emplace(Key: V).first->getSecond().push_back(Elt: Last);
          (void)Processed.insert(Ptr: V);
        } else if (Processed.insert(Ptr: V).second) {
          assert(!is_contained(It->getSecond(), Last) &&
                 "Value already associated with the node.");
          It->getSecond().push_back(Elt: Last);
        }
      }
      // Update the scheduler bundle to point to this TreeEntry.
      assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) &&
             "Bundle and VL out of sync");
      if (!Bundle.getBundle().empty()) {
#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
        // Verify the bundle has exactly one member per unique schedulable
        // scalar.
        auto *BundleMember = Bundle.getBundle().begin();
        SmallPtrSet<Value *, 4> Processed;
        for (Value *V : VL) {
          if (S.isNonSchedulable(V) || !Processed.insert(V).second)
            continue;
          ++BundleMember;
        }
        assert(BundleMember == Bundle.getBundle().end() &&
               "Bundle and VL out of sync");
#endif
        Bundle.setTreeEntry(Last);
      }
    } else {
      // Build a map for gathered scalars to the nodes where they are used.
      bool AllConstsOrCasts = true;
      for (Value *V : VL) {
        if (S && S.areInstructionsWithCopyableElements() &&
            S.isCopyableElement(V))
          Last->addCopyableElement(V);
        if (!isConstant(V)) {
          auto *I = dyn_cast<CastInst>(Val: V);
          AllConstsOrCasts &= I && I->getType()->isIntegerTy();
          if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
              !UserTreeIdx.UserTE->isGather())
            ValueToGatherNodes.try_emplace(Key: V).first->getSecond().insert(X: Last);
        }
      }
      if (AllConstsOrCasts)
        CastMaxMinBWSizes =
            std::make_pair(x: std::numeric_limits<unsigned>::max(), y: 1);
      MustGather.insert_range(R&: VL);
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndex = UserTreeIdx;
    return Last;
  }
4559
4560 /// -- Vectorization State --
4561 /// Holds all of the tree entries.
4562 TreeEntry::VecTreeTy VectorizableTree;
4563
4564#ifndef NDEBUG
4565 /// Debug printer.
4566 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
4567 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
4568 VectorizableTree[Id]->dump();
4569 if (TransformedToGatherNodes.contains(VectorizableTree[Id].get()))
4570 dbgs() << "[[TRANSFORMED TO GATHER]]";
4571 else if (DeletedNodes.contains(VectorizableTree[Id].get()))
4572 dbgs() << "[[DELETED NODE]]";
4573 dbgs() << "\n";
4574 }
4575 }
4576#endif
4577
4578 /// Get list of vector entries, associated with the value \p V.
4579 ArrayRef<TreeEntry *> getTreeEntries(const Value *V) const {
4580 assert(V && "V cannot be nullptr.");
4581 auto It = ScalarToTreeEntries.find(Val: V);
4582 if (It == ScalarToTreeEntries.end())
4583 return {};
4584 return It->getSecond();
4585 }
4586
4587 /// Get list of split vector entries, associated with the value \p V.
4588 ArrayRef<TreeEntry *> getSplitTreeEntries(Value *V) const {
4589 assert(V && "V cannot be nullptr.");
4590 auto It = ScalarsInSplitNodes.find(Val: V);
4591 if (It == ScalarsInSplitNodes.end())
4592 return {};
4593 return It->getSecond();
4594 }
4595
4596 /// Returns first vector node for value \p V, matching values \p VL.
4597 TreeEntry *getSameValuesTreeEntry(Value *V, ArrayRef<Value *> VL,
4598 bool SameVF = false) const {
4599 assert(V && "V cannot be nullptr.");
4600 for (TreeEntry *TE : ScalarToTreeEntries.lookup(Val: V))
4601 if ((!SameVF || TE->getVectorFactor() == VL.size()) && TE->isSame(VL))
4602 return TE;
4603 return nullptr;
4604 }
4605
4606 /// Contains all the outputs of legality analysis for a list of values to
4607 /// vectorize.
4608 class ScalarsVectorizationLegality {
4609 InstructionsState S;
4610 bool IsLegal;
4611 bool TryToFindDuplicates;
4612 bool TrySplitVectorize;
4613
4614 public:
4615 ScalarsVectorizationLegality(InstructionsState S, bool IsLegal,
4616 bool TryToFindDuplicates = true,
4617 bool TrySplitVectorize = false)
4618 : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates),
4619 TrySplitVectorize(TrySplitVectorize) {
4620 assert((!IsLegal || (S.valid() && TryToFindDuplicates)) &&
4621 "Inconsistent state");
4622 }
4623 const InstructionsState &getInstructionsState() const { return S; };
4624 bool isLegal() const { return IsLegal; }
4625 bool tryToFindDuplicates() const { return TryToFindDuplicates; }
4626 bool trySplitVectorize() const { return TrySplitVectorize; }
4627 };
4628
  /// Checks if the specified list of the instructions/values can be vectorized
  /// in general.
  ScalarsVectorizationLegality
  getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
                                  const EdgeInfo &UserTreeIdx,
                                  bool TryCopyableElementsVectorization) const;

  /// Checks if the specified list of the instructions/values can be vectorized
  /// and fills required data before actual scheduling of the instructions.
  TreeEntry::EntryState getScalarsVectorizationState(
      const InstructionsState &S, ArrayRef<Value *> VL,
      bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
      SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo);

  /// Maps a specific scalar to its tree entry(ies).
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarToTreeEntries;

  /// List of deleted non-profitable nodes.
  SmallPtrSet<const TreeEntry *, 8> DeletedNodes;

  /// List of nodes, transformed to gathered, with their conservative
  /// gather/buildvector cost estimation.
  SmallDenseMap<const TreeEntry *, InstructionCost> TransformedToGatherNodes;

  /// Maps the (user entry, operand index) pair to the corresponding operand
  /// tree entry.
  SmallDenseMap<std::pair<const TreeEntry *, unsigned>, TreeEntry *>
      OperandsToTreeEntry;

  /// Scalars, used in split vectorize nodes.
  SmallDenseMap<Value *, SmallVector<TreeEntry *>> ScalarsInSplitNodes;

  /// Maps a value to the proposed vectorizable size.
  SmallDenseMap<Value *, unsigned> InstrElementSize;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// A set of first non-schedulable values.
  ValueSet NonScheduledFirst;

  /// A map between the vectorized entries and the last instructions in the
  /// bundles. The bundles are built in use order, not in the def order of the
  /// instructions. So, we cannot rely directly on the last instruction in the
  /// bundle being the last instruction in the program order during
  /// vectorization process since the basic blocks are affected, need to
  /// pre-gather them before.
  SmallDenseMap<const TreeEntry *, WeakTrackingVH> EntryToLastInstruction;

  /// Keeps the mapping between the last instructions and their insertion
  /// points, which is an instruction-after-the-last-instruction.
  SmallDenseMap<const Instruction *, Instruction *> LastInstructionToPos;

  /// List of gather nodes, depending on other gather/vector nodes, which should
  /// be emitted after the vector instruction emission process to correctly
  /// handle order of the vector instructions and shuffles.
  SetVector<const TreeEntry *> PostponedGathers;

  /// Maps a gathered value to the gather nodes it participates in.
  using ValueToGatherNodesMap =
      DenseMap<Value *, SmallSetVector<const TreeEntry *, 4>>;
  ValueToGatherNodesMap ValueToGatherNodes;

  /// A list of the load entries (node indices), which can be vectorized using
  /// strided or masked gather approach, but attempted to be represented as
  /// contiguous loads.
  SetVector<unsigned> LoadEntriesToVectorize;

  /// true if graph nodes transforming mode is on.
  bool IsGraphTransformMode = false;

  /// The index of the first gathered load entry in the VectorizeTree.
  std::optional<unsigned> GatheredLoadsEntriesFirst;

  /// Maps compress entries to their mask data for the final codegen.
  SmallDenseMap<const TreeEntry *,
                std::tuple<SmallVector<int>, VectorType *, unsigned, bool>>
      CompressEntryToData;
4705
4706 /// This POD struct describes one external user in the vectorized tree.
4707 struct ExternalUser {
4708 ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
4709 : Scalar(S), User(U), E(E), Lane(L) {}
4710
4711 /// Which scalar in our function.
4712 Value *Scalar = nullptr;
4713
4714 /// Which user that uses the scalar.
4715 llvm::User *User = nullptr;
4716
4717 /// Vector node, the value is part of.
4718 const TreeEntry &E;
4719
4720 /// Which lane does the scalar belong to.
4721 unsigned Lane;
4722 };
4723 using UserList = SmallVector<ExternalUser, 16>;
4724
4725 /// Checks if two instructions may access the same memory.
4726 ///
4727 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
4728 /// is invariant in the calling loop.
4729 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
4730 Instruction *Inst2) {
4731 assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction.");
4732 // First check if the result is already in the cache.
4733 AliasCacheKey Key = std::make_pair(x&: Inst1, y&: Inst2);
4734 auto Res = AliasCache.try_emplace(Key);
4735 if (!Res.second)
4736 return Res.first->second;
4737 bool Aliased = isModOrRefSet(MRI: BatchAA.getModRefInfo(I: Inst2, OptLoc: Loc1));
4738 // Store the result in the cache.
4739 Res.first->getSecond() = Aliased;
4740 return Aliased;
4741 }
4742
  /// Key type for the alias-query cache: the pair of instructions queried.
  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  SmallDenseMap<AliasCacheKey, bool> AliasCache;

  // Cache for pointerMayBeCaptured calls inside AA. This is preserved
  // globally through SLP because we don't perform any action which
  // invalidates capture results.
  BatchAAResults BatchAA;

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed. The deferral is required to
  /// ensure that there are no incorrect collisions in the AliasCache, which
  /// can happen if a new instruction is allocated at the same address as a
  /// previously deleted instruction.
  DenseSet<Instruction *> DeletedInstructions;

  /// Set of the instruction, being analyzed already for reductions.
  SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;

  /// Set of hashes for the list of reduction values already being analyzed.
  DenseSet<size_t> AnalyzedReductionVals;

  /// Values, already been analyzed for minimal bitwidth and found to be
  /// non-profitable.
  DenseSet<Value *> AnalyzedMinBWVals;

  /// A list of values that need to extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, it means that this Internal Scalar will be used later,
  /// after vectorization.
  UserList ExternalUses;

  /// A list of GEPs which can be replaced by scalar GEPs instead of
  /// extractelement instructions.
  SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;

  /// A list of scalars to be extracted without a specific user because of too
  /// many uses.
  SmallPtrSet<Value *, 4> ExternalUsesWithNonUsers;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered, shuffle instructions and
  /// extractelements.
  SetVector<Instruction *> GatherShuffleExtractSeq;

  /// A list of blocks that we are going to CSE.
  DenseSet<BasicBlock *> CSEBlocks;

  /// List of hashes of vector of loads, which are known to be non vectorizable.
  DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
4797
  /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData
  /// or ScheduleBundle. ScheduleData is used to gather dependencies for a
  /// single instruction, while ScheduleBundle represents a batch of
  /// instructions, going to be grouped together. ScheduleCopyableData models
  /// an extra user for "copyable" instructions.
  class ScheduleEntity {
    friend class ScheduleBundle;
    friend class ScheduleData;
    friend class ScheduleCopyableData;

  protected:
    /// Discriminator for LLVM-style RTTI (isa/dyn_cast over ScheduleEntity).
    enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData };
    Kind getKind() const { return K; }
    ScheduleEntity(Kind K) : K(K) {}

  private:
    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority = 0;
    /// True if this instruction (or bundle) is scheduled (or considered as
    /// scheduled in the dry-run).
    bool IsScheduled = false;
    /// The kind of the ScheduleEntity.
    const Kind K = Kind::ScheduleData;

  public:
    ScheduleEntity() = delete;
    /// Gets/sets the scheduling priority.
    void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; }
    int getSchedulingPriority() const { return SchedulingPriority; }
    /// Returns true if the entity is ready for scheduling. Dispatches to the
    /// concrete subclass implementation.
    bool isReady() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->isReady();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->isReady();
      return cast<ScheduleBundle>(Val: this)->isReady();
    }
    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->hasValidDependencies();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->hasValidDependencies();
      return cast<ScheduleBundle>(Val: this)->hasValidDependencies();
    }
    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getUnscheduledDeps();
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: this))
        return CD->getUnscheduledDeps();
      return cast<ScheduleBundle>(Val: this)->unscheduledDepsInBundle();
    }
    /// Increments the number of unscheduled dependencies.
    /// Note: bundles are not supported here, only the per-instruction kinds.
    int incrementUnscheduledDeps(int Incr) {
      if (auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->incrementUnscheduledDeps(Incr);
      return cast<ScheduleCopyableData>(Val: this)->incrementUnscheduledDeps(Incr);
    }
    /// Gets the number of dependencies.
    int getDependencies() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getDependencies();
      return cast<ScheduleCopyableData>(Val: this)->getDependencies();
    }
    /// Gets the instruction.
    Instruction *getInst() const {
      if (const auto *SD = dyn_cast<ScheduleData>(Val: this))
        return SD->getInst();
      return cast<ScheduleCopyableData>(Val: this)->getInst();
    }

    /// Gets/sets if the bundle is scheduled.
    bool isScheduled() const { return IsScheduled; }
    void setScheduled(bool Scheduled) { IsScheduled = Scheduled; }

    /// Every subclass is a ScheduleEntity; subclasses refine classof.
    static bool classof(const ScheduleEntity *) { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    /// Prints the entity via the concrete subclass printer.
    void dump(raw_ostream &OS) const {
      if (const auto *SD = dyn_cast<ScheduleData>(this))
        return SD->dump(OS);
      if (const auto *CD = dyn_cast<ScheduleCopyableData>(this))
        return CD->dump(OS);
      return cast<ScheduleBundle>(this)->dump(OS);
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  };
4892
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  /// Streams a ScheduleEntity by delegating to its dump method.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleEntity &SE) {
    SE.dump(OS);
    return OS;
  }
#endif
4900
  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  class ScheduleData final : public ScheduleEntity {
  public:
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData() : ScheduleEntity(Kind::ScheduleData) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleData;
    }

    /// (Re)initializes this node for the instruction \p I within the
    /// scheduling region \p BlockSchedulingRegionID; drops any previously
    /// calculated dependencies.
    void init(int BlockSchedulingRegionID, Instruction *I) {
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();
      Inst = I;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        // Before calculation, both counters hold InvalidDeps.
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 &&
             "Expected valid number of unscheduled deps");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Clears all dependency information.
    void clearDependencies() {
      clearDirectDependencies();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    /// Clears all direct dependencies only, except for control and memory
    /// dependencies.
    /// Required for copyable elements to correctly handle control/memory deps
    /// and avoid extra recalculation of such deps.
    void clearDirectDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      IsScheduled = false;
    }

    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    /// Gets the number of dependencies.
    int getDependencies() const { return Dependencies; }
    /// Initializes the number of dependencies.
    void initDependencies() { Dependencies = 0; }
    /// Increments the number of dependencies.
    void incDependencies() { Dependencies++; }

    /// Gets scheduling region ID.
    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Gets the instruction.
    Instruction *getInst() const { return Inst; }

    /// Gets the list of memory dependencies.
    ArrayRef<ScheduleData *> getMemoryDependencies() const {
      return MemoryDependencies;
    }
    /// Adds a memory dependency.
    void addMemoryDependency(ScheduleData *Dep) {
      MemoryDependencies.push_back(Elt: Dep);
    }
    /// Gets the list of control dependencies.
    ArrayRef<ScheduleData *> getControlDependencies() const {
      return ControlDependencies;
    }
    /// Adds a control dependency.
    void addControlDependency(ScheduleData *Dep) {
      ControlDependencies.push_back(Elt: Dep);
    }
    /// Gets/sets the next load/store instruction in the block.
    ScheduleData *getNextLoadStore() const { return NextLoadStore; }
    void setNextLoadStore(ScheduleData *Next) { NextLoadStore = Next; }

    /// Prints the associated instruction.
    void dump(raw_ostream &OS) const { OS << *Inst; }

    LLVM_DUMP_METHOD void dump() const {
      dump(OS&: dbgs());
      dbgs() << '\n';
    }

  private:
    /// The instruction this scheduling data is attached to.
    Instruction *Inst = nullptr;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore = nullptr;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *> MemoryDependencies;

    /// List of instructions which this instruction could be control dependent
    /// on. Allowing such nodes to be scheduled below this one could introduce
    /// a runtime fault which didn't exist in the original program.
    /// ex: this is a load or udiv following a readonly call which inf loops
    SmallVector<ScheduleData *> ControlDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;

    /// The number of dependencies. Constitutes of the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies = InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = InvalidDeps;
  };
5055
#ifndef NDEBUG
  /// Streams a ScheduleData by delegating to its dump method.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
5063
  /// A batch of scheduling entities (instructions and/or copyable data) that
  /// is scheduled as a single unit and corresponds to one tree entry.
  class ScheduleBundle final : public ScheduleEntity {
    /// The schedule data for the instructions in the bundle.
    SmallVector<ScheduleEntity *> Bundle;
    /// True if this bundle is valid.
    bool IsValid = true;
    /// The TreeEntry that this instruction corresponds to.
    TreeEntry *TE = nullptr;
    /// Private: invalid bundles are created only via invalid().
    ScheduleBundle(bool IsValid)
        : ScheduleEntity(Kind::ScheduleBundle), IsValid(IsValid) {}

  public:
    ScheduleBundle() : ScheduleEntity(Kind::ScheduleBundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleBundle;
    }

    /// Verify basic self consistency properties
    void verify() const {
      for (const ScheduleEntity *SD : Bundle) {
        if (SD->hasValidDependencies()) {
          assert(SD->getUnscheduledDeps() <= SD->getDependencies() &&
                 "invariant");
        } else {
          assert(SD->getUnscheduledDeps() == SD->getDependencies() &&
                 "invariant");
        }

        if (isScheduled()) {
          assert(SD->hasValidDependencies() && SD->getUnscheduledDeps() == 0 &&
                 "unexpected scheduled state");
        }
      }
    }

    /// Returns the number of unscheduled dependencies in the bundle.
    /// Returns InvalidDeps as soon as any member has uncalculated deps.
    int unscheduledDepsInBundle() const {
      assert(*this && "bundle must not be empty");
      int Sum = 0;
      for (const ScheduleEntity *BundleMember : Bundle) {
        if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps)
          return ScheduleData::InvalidDeps;
        Sum += BundleMember->getUnscheduledDeps();
      }
      return Sum;
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return all_of(Range: Bundle, P: [](const ScheduleEntity *SD) {
        return SD->hasValidDependencies();
      });
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(*this && "bundle must not be empty");
      return unscheduledDepsInBundle() == 0 && !isScheduled();
    }

    /// Returns the bundle of scheduling data, associated with the current
    /// instruction.
    ArrayRef<ScheduleEntity *> getBundle() { return Bundle; }
    ArrayRef<const ScheduleEntity *> getBundle() const { return Bundle; }
    /// Adds an instruction to the bundle.
    void add(ScheduleEntity *SD) { Bundle.push_back(Elt: SD); }

    /// Gets/sets the associated tree entry.
    void setTreeEntry(TreeEntry *TE) { this->TE = TE; }
    TreeEntry *getTreeEntry() const { return TE; }

    /// Creates a marker bundle for "bundle could not be formed".
    static ScheduleBundle invalid() { return {false}; }

    /// True if this bundle is valid (not the invalid() marker).
    operator bool() const { return IsValid; }

#ifndef NDEBUG
    void dump(raw_ostream &OS) const {
      if (!*this) {
        OS << "[]";
        return;
      }
      OS << '[';
      interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) {
        if (isa<ScheduleCopyableData>(SD))
          OS << "<Copyable>";
        OS << *SD->getInst();
      });
      OS << ']';
    }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // NDEBUG
  };
5162
#ifndef NDEBUG
  /// Streams a ScheduleBundle by delegating to its dump method.
  friend inline raw_ostream &operator<<(raw_ostream &OS,
                                        const BoUpSLP::ScheduleBundle &Bundle) {
    Bundle.dump(OS);
    return OS;
  }
#endif
5170
  /// Contains all scheduling relevant data for the copyable instruction.
  /// It models the virtual instructions, supposed to replace the original
  /// instructions. E.g., if instruction %0 = load is a part of the bundle [%0,
  /// %1], where %1 = add, then the ScheduleCopyableData models virtual
  /// instruction %virt = add %0, 0.
  class ScheduleCopyableData final : public ScheduleEntity {
    /// The source schedule data for the instruction.
    Instruction *Inst = nullptr;
    /// The edge information for the instruction.
    const EdgeInfo EI;
    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID = 0;
    /// Bundle, this data is part of.
    ScheduleBundle &Bundle;

  public:
    ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I,
                         const EdgeInfo &EI, ScheduleBundle &Bundle)
        : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI),
          SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {}
    static bool classof(const ScheduleEntity *Entity) {
      return Entity->getKind() == Kind::ScheduleCopyableData;
    }

    /// Verify basic self consistency properties
    void verify() {
      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        // Before calculation, both counters hold InvalidDeps.
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

      if (IsScheduled) {
        assert(hasValidDependencies() && UnscheduledDeps == 0 &&
               "unexpected scheduled state");
      }
    }

    /// Returns true if the dependency information has been calculated.
    /// Note that dependency validity can vary between instructions within
    /// a single bundle.
    bool hasValidDependencies() const {
      return Dependencies != ScheduleData::InvalidDeps;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; }

    /// Modifies the number of unscheduled dependencies for this instruction,
    /// and returns the number of remaining dependencies for the containing
    /// bundle.
    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      assert(UnscheduledDeps >= 0 && "invariant");
      return UnscheduledDeps;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    /// Gets the number of unscheduled dependencies.
    int getUnscheduledDeps() const { return UnscheduledDeps; }
    /// Gets the number of dependencies.
    int getDependencies() const { return Dependencies; }
    /// Initializes the number of dependencies.
    void initDependencies() { Dependencies = 0; }
    /// Increments the number of dependencies.
    void incDependencies() { Dependencies++; }

    /// Gets scheduling region ID.
    int getSchedulingRegionID() const { return SchedulingRegionID; }

    /// Gets the instruction.
    Instruction *getInst() const { return Inst; }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = ScheduleData::InvalidDeps;
      UnscheduledDeps = ScheduleData::InvalidDeps;
      IsScheduled = false;
    }

    /// Gets the edge information.
    const EdgeInfo &getEdgeInfo() const { return EI; }

    /// Gets the bundle.
    ScheduleBundle &getBundle() { return Bundle; }
    const ScheduleBundle &getBundle() const { return Bundle; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
    void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); }

    LLVM_DUMP_METHOD void dump() const {
      dump(dbgs());
      dbgs() << '\n';
    }
#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

  private:
    /// The number of dependencies; InvalidDeps while not yet calculated.
    /// These nodes always have only a single dependency.
    int Dependencies = ScheduleData::InvalidDeps;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps = ScheduleData::InvalidDeps;
  };
5285
#ifndef NDEBUG
  /// Streams a ScheduleCopyableData by delegating to its dump method.
  friend inline raw_ostream &
  operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) {
    SD.dump(OS);
    return OS;
  }
#endif
5293
5294 friend struct GraphTraits<BoUpSLP *>;
5295 friend struct DOTGraphTraits<BoUpSLP *>;
5296
5297 /// Contains all scheduling data for a basic block.
  /// It does not schedule instructions that are not memory read/write
  /// instructions and whose operands are either constants, arguments, phis,
  /// or instructions from other blocks, or whose users are phis or belong to
  /// other blocks. The resulting vector instructions can be placed at the
  /// beginning of the basic block without scheduling (if the operands do not
  /// need to be scheduled) or at the end of the block (if the users are
  /// outside of the block). This saves some compile time and memory used by
  /// the compiler.
5306 /// ScheduleData is assigned for each instruction in between the boundaries of
5307 /// the tree entry, even for those, which are not part of the graph. It is
5308 /// required to correctly follow the dependencies between the instructions and
5309 /// their correct scheduling. The ScheduleData is not allocated for the
5310 /// instructions, which do not require scheduling, like phis, nodes with
5311 /// extractelements/insertelements only or nodes with instructions, with
5312 /// uses/operands outside of the block.
5313 struct BlockScheduling {
5314 BlockScheduling(BasicBlock *BB)
5315 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
5316
5317 void clear() {
5318 ScheduledBundles.clear();
5319 ScheduledBundlesList.clear();
5320 ScheduleCopyableDataMap.clear();
5321 ScheduleCopyableDataMapByInst.clear();
5322 ScheduleCopyableDataMapByInstUser.clear();
5323 ScheduleCopyableDataMapByUsers.clear();
5324 ReadyInsts.clear();
5325 ScheduleStart = nullptr;
5326 ScheduleEnd = nullptr;
5327 FirstLoadStoreInRegion = nullptr;
5328 LastLoadStoreInRegion = nullptr;
5329 RegionHasStackSave = false;
5330
5331 // Reduce the maximum schedule region size by the size of the
5332 // previous scheduling run.
5333 ScheduleRegionSizeLimit -= ScheduleRegionSize;
5334 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
5335 ScheduleRegionSizeLimit = MinScheduleRegionSize;
5336 ScheduleRegionSize = 0;
5337
5338 // Make a new scheduling region, i.e. all existing ScheduleData is not
5339 // in the new region yet.
5340 ++SchedulingRegionID;
5341 }
5342
5343 ScheduleData *getScheduleData(Instruction *I) {
5344 if (!I)
5345 return nullptr;
5346 if (BB != I->getParent())
5347 // Avoid lookup if can't possibly be in map.
5348 return nullptr;
5349 ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
5350 if (SD && isInSchedulingRegion(SD: *SD))
5351 return SD;
5352 return nullptr;
5353 }
5354
5355 ScheduleData *getScheduleData(Value *V) {
5356 return getScheduleData(I: dyn_cast<Instruction>(Val: V));
5357 }
5358
5359 /// Returns the ScheduleCopyableData for the given edge (user tree entry and
5360 /// operand number) and value.
5361 ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI,
5362 const Value *V) const {
5363 if (ScheduleCopyableDataMap.empty())
5364 return nullptr;
5365 auto It = ScheduleCopyableDataMap.find(Val: std::make_pair(x: EI, y&: V));
5366 if (It == ScheduleCopyableDataMap.end())
5367 return nullptr;
5368 ScheduleCopyableData *SD = It->getSecond().get();
5369 if (!isInSchedulingRegion(SD: *SD))
5370 return nullptr;
5371 return SD;
5372 }
5373
5374 /// Returns the ScheduleCopyableData for the given user \p User, operand
5375 /// number and operand \p V.
5376 SmallVector<ScheduleCopyableData *>
5377 getScheduleCopyableData(const Value *User, unsigned OperandIdx,
5378 const Value *V) {
5379 if (ScheduleCopyableDataMapByInstUser.empty())
5380 return {};
5381 const auto It = ScheduleCopyableDataMapByInstUser.find(
5382 Val: std::make_pair(x: std::make_pair(x&: User, y&: OperandIdx), y&: V));
5383 if (It == ScheduleCopyableDataMapByInstUser.end())
5384 return {};
5385 SmallVector<ScheduleCopyableData *> Res;
5386 for (ScheduleCopyableData *SD : It->getSecond()) {
5387 if (isInSchedulingRegion(SD: *SD))
5388 Res.push_back(Elt: SD);
5389 }
5390 return Res;
5391 }
5392
    /// Returns true if all operands of the given instruction \p User are
    /// replaced by copyable data.
    /// \param User The user instruction.
    /// \param Op The operand, which might be replaced by the copyable data.
    /// \param SLP The SLP tree.
    /// \param NumOps The number of operands used. If the instruction uses the
    /// same operand several times, check for the first use, then the second,
    /// etc.
    bool areAllOperandsReplacedByCopyableData(Instruction *User,
                                              Instruction *Op, BoUpSLP &SLP,
                                              unsigned NumOps) const {
      assert(NumOps > 0 && "No operands");
      // Without any copyable data modeled there is nothing to replace.
      if (ScheduleCopyableDataMap.empty())
        return false;
      // Commutative/cmp users are counted here first and checked after the
      // main loop, because their operands may have been reordered.
      SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount;
      ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(V: User);
      if (Entries.empty())
        return false;
      // Tracks which use of \p Op (first, second, ...) is being examined.
      unsigned CurNumOps = 0;
      for (const Use &U : User->operands()) {
        if (U.get() != Op)
          continue;
        ++CurNumOps;
        // Check all tree entries, if they have operands replaced by copyable
        // data.
        for (TreeEntry *TE : Entries) {
          unsigned Inc = 0;
          bool IsNonSchedulableWithParentPhiNode =
              TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->hasState() &&
              TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
              TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            const TreeEntry *ParentTE = TE->UserTreeIndex.UserTE;
            for (Value *V : ParentTE->Scalars) {
              auto *PHI = dyn_cast<PHINode>(Val: V);
              if (!PHI)
                continue;
              if (ParentsUniqueUsers.insert(Ptr: PHI).second &&
                  is_contained(Range: PHI->incoming_values(), Element: User))
                ++Inc;
            }
          } else {
            // Otherwise every occurrence of User in this entry counts.
            Inc = count(Range&: TE->Scalars, Element: User);
          }

          // Check if the user is commutative.
          // The commutatives are handled later, as their operands can be
          // reordered.
          // Same applies even for non-commutative cmps, because we can invert
          // their predicate potentially and, thus, reorder the operands.
          bool IsCommutativeUser =
              ::isCommutative(I: User) &&
              ::isCommutableOperand(I: User, ValWithUses: User, Op: U.getOperandNo());
          if (!IsCommutativeUser) {
            // Also consider the matching main/alternate opcode of the entry:
            // the entry's main op may be commutative even if User is not.
            Instruction *MainOp = TE->getMatchingMainOpOrAltOp(I: User);
            IsCommutativeUser =
                ::isCommutative(I: MainOp, ValWithUses: User) &&
                ::isCommutableOperand(I: MainOp, ValWithUses: User, Op: U.getOperandNo());
          }
          // The commutative user with the same operands can be safely
          // considered as non-commutative, operands reordering does not change
          // the semantics.
          assert(
              (!IsCommutativeUser ||
               (((::isCommutative(User) &&
                  ::isCommutableOperand(User, User, 0) &&
                  ::isCommutableOperand(User, User, 1)) ||
                 (::isCommutative(TE->getMatchingMainOpOrAltOp(User), User) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 0) &&
                  ::isCommutableOperand(TE->getMatchingMainOpOrAltOp(User),
                                        User, 1))))) &&
              "Expected commutative user with 2 first commutable operands");
          bool IsCommutativeWithSameOps =
              IsCommutativeUser && User->getOperand(i: 0) == User->getOperand(i: 1);
          if ((!IsCommutativeUser || IsCommutativeWithSameOps) &&
              !isa<CmpInst>(Val: User)) {
            // Non-reorderable use: this exact edge must be modeled by
            // copyable data, otherwise the operand is not fully replaced.
            EdgeInfo EI(TE, U.getOperandNo());
            if (CurNumOps != NumOps || getScheduleCopyableData(EI, V: Op))
              continue;
            return false;
          }
          PotentiallyReorderedEntriesCount.try_emplace(Key: TE, Args: 0)
              .first->getSecond() += Inc;
        }
      }
      if (PotentiallyReorderedEntriesCount.empty())
        return true;
      // Check the commutative/cmp entries.
      for (auto &P : PotentiallyReorderedEntriesCount) {
        SmallPtrSet<Value *, 4> ParentsUniqueUsers;
        bool IsNonSchedulableWithParentPhiNode =
            P.first->doesNotNeedToSchedule() && P.first->UserTreeIndex &&
            P.first->UserTreeIndex.UserTE->hasState() &&
            P.first->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
            P.first->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
        auto *It = find(Range&: P.first->Scalars, Val: User);
        do {
          assert(It != P.first->Scalars.end() &&
                 "User is not in the tree entry");
          int Lane = std::distance(first: P.first->Scalars.begin(), last: It);
          assert(Lane >= 0 && "Lane is not found");
          // Stores keep the reordering in ReorderIndices, remap the lane.
          if (isa<StoreInst>(Val: User) && !P.first->ReorderIndices.empty())
            Lane = P.first->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(P.first->Scalars.size()) &&
                 "Couldn't find extract lane");
          // Count the number of unique phi nodes, which are the parent for
          // parent entry, and exit, if all the unique phis are processed.
          if (IsNonSchedulableWithParentPhiNode) {
            const TreeEntry *ParentTE = P.first->UserTreeIndex.UserTE;
            // NOTE(review): this local 'User' shadows the 'User' parameter,
            // so the find() below searches the scalars for the parent PHI
            // value, not the original user instruction (compare with the
            // analogous code in schedule(), which searches for 'In' here).
            // Confirm the shadowing is intentional.
            Value *User = ParentTE->Scalars[Lane];
            if (!ParentsUniqueUsers.insert(Ptr: User).second) {
              It =
                  find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
              continue;
            }
          }
          // Subtract each commutable-operand position that is modeled by
          // copyable data for this lane.
          for (unsigned OpIdx :
               seq<unsigned>(Size: ::getNumberOfPotentiallyCommutativeOps(
                   I: P.first->getMainOp()))) {
            if (P.first->getOperand(OpIdx)[Lane] == Op &&
                getScheduleCopyableData(EI: EdgeInfo(P.first, OpIdx), V: Op))
              --P.getSecond();
          }
          // If parent node is schedulable, it will be handled correctly.
          It = find(Range: make_range(x: std::next(x: It), y: P.first->Scalars.end()), Val: User);
        } while (It != P.first->Scalars.end());
      }
      return all_of(Range&: PotentiallyReorderedEntriesCount,
                    P: [&](const std::pair<const TreeEntry *, unsigned> &P) {
                      return P.second == NumOps - 1;
                    });
    }
5530
5531 SmallVector<ScheduleCopyableData *>
5532 getScheduleCopyableData(const Instruction *I) const {
5533 if (ScheduleCopyableDataMapByInst.empty())
5534 return {};
5535 const auto It = ScheduleCopyableDataMapByInst.find(Val: I);
5536 if (It == ScheduleCopyableDataMapByInst.end())
5537 return {};
5538 SmallVector<ScheduleCopyableData *> Res;
5539 for (ScheduleCopyableData *SD : It->getSecond()) {
5540 if (isInSchedulingRegion(SD: *SD))
5541 Res.push_back(Elt: SD);
5542 }
5543 return Res;
5544 }
5545
5546 SmallVector<ScheduleCopyableData *>
5547 getScheduleCopyableDataUsers(const Instruction *User) const {
5548 if (ScheduleCopyableDataMapByUsers.empty())
5549 return {};
5550 const auto It = ScheduleCopyableDataMapByUsers.find(Val: User);
5551 if (It == ScheduleCopyableDataMapByUsers.end())
5552 return {};
5553 SmallVector<ScheduleCopyableData *> Res;
5554 for (ScheduleCopyableData *SD : It->getSecond()) {
5555 if (isInSchedulingRegion(SD: *SD))
5556 Res.push_back(Elt: SD);
5557 }
5558 return Res;
5559 }
5560
    /// Registers a new ScheduleCopyableData for instruction \p I, modeled as
    /// a copyable element at edge \p EI, and records it in all the lookup
    /// maps (by (edge, instruction), by instruction, by user, and by
    /// (user, operand index, instruction)).
    /// \returns the created (or cached) copyable data.
    ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI,
                                                  Instruction *I,
                                                  int SchedulingRegionID,
                                                  ScheduleBundle &Bundle) {
      assert(!getScheduleCopyableData(EI, I) && "already in the map");
      ScheduleCopyableData *CD =
          ScheduleCopyableDataMap
              .try_emplace(Key: std::make_pair(x: EI, y&: I),
                           Args: std::make_unique<ScheduleCopyableData>(
                               args&: SchedulingRegionID, args&: I, args: EI, args&: Bundle))
              .first->getSecond()
              .get();
      ScheduleCopyableDataMapByInst[I].push_back(Elt: CD);
      if (EI.UserTE) {
        // Walk all lanes of the user entry where the operand equals \p I and
        // register CD for each distinct user instruction.
        ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
        const auto *It = find(Range&: Op, Val: I);
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(first: Op.begin(), last: It);
          assert(Lane >= 0 && "Lane not set");
          // Stores keep the reordering in ReorderIndices, remap the lane.
          if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
          // Each user instruction is registered only once.
          if (!Visited.insert(Ptr: In).second) {
            It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
            continue;
          }
          ScheduleCopyableDataMapByInstUser
              .try_emplace(Key: std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I))
              .first->getSecond()
              .push_back(Elt: CD);
          ScheduleCopyableDataMapByUsers.try_emplace(Key: I)
              .first->getSecond()
              .insert(X: CD);
          // Remove extra deps for users, becoming non-immediate users of the
          // instruction. It may happen, if the chain of same copyable elements
          // appears in the tree.
          if (In == I) {
            EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
            if (ScheduleCopyableData *UserCD =
                    getScheduleCopyableData(EI: UserEI, V: In))
              ScheduleCopyableDataMapByUsers[I].remove(X: UserCD);
          }
          It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
        } while (It != Op.end());
      } else {
        // No user entry: the copyable data is its own (only) user.
        ScheduleCopyableDataMapByUsers.try_emplace(Key: I).first->getSecond().insert(
            X: CD);
      }
      return *CD;
    }
5616
5617 ArrayRef<ScheduleBundle *> getScheduleBundles(Value *V) const {
5618 auto *I = dyn_cast<Instruction>(Val: V);
5619 if (!I)
5620 return {};
5621 auto It = ScheduledBundles.find(Val: I);
5622 if (It == ScheduledBundles.end())
5623 return {};
5624 return It->getSecond();
5625 }
5626
5627 /// Returns true if the entity is in the scheduling region.
5628 bool isInSchedulingRegion(const ScheduleEntity &SD) const {
5629 if (const auto *Data = dyn_cast<ScheduleData>(Val: &SD))
5630 return Data->getSchedulingRegionID() == SchedulingRegionID;
5631 if (const auto *CD = dyn_cast<ScheduleCopyableData>(Val: &SD))
5632 return CD->getSchedulingRegionID() == SchedulingRegionID;
5633 return all_of(Range: cast<ScheduleBundle>(Val: SD).getBundle(),
5634 P: [&](const ScheduleEntity *BundleMember) {
5635 return isInSchedulingRegion(SD: *BundleMember);
5636 });
5637 }
5638
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    /// \param R The SLP tree, used to look up tree entries for instructions.
    /// \param S The state of the instructions (unused directly here, kept for
    /// the interface).
    /// \param EI The edge in the SLP graph or the user node/operand number.
    /// \param Data The entity (single ScheduleData or whole ScheduleBundle)
    /// that has just been scheduled.
    /// \param ReadyList Receives entities that become ready as a result.
    template <typename ReadyListType>
    void schedule(const BoUpSLP &R, const InstructionsState &S,
                  const EdgeInfo &EI, ScheduleEntity *Data,
                  ReadyListType &ReadyList) {
      auto ProcessBundleMember = [&](ScheduleEntity *BundleMember,
                                     ArrayRef<ScheduleBundle *> Bundles) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto DecrUnsched = [&](auto *Data, bool IsControl = false) {
          if ((IsControl || Data->hasValidDependencies()) &&
              Data->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            SmallVector<ScheduleBundle *, 1> CopyableBundle;
            ArrayRef<ScheduleBundle *> Bundles;
            if (auto *CD = dyn_cast<ScheduleCopyableData>(Data)) {
              // Copyable data has a single associated bundle.
              CopyableBundle.push_back(Elt: &CD->getBundle());
              Bundles = CopyableBundle;
            } else {
              Bundles = getScheduleBundles(V: Data->getInst());
            }
            if (!Bundles.empty()) {
              for (ScheduleBundle *Bundle : Bundles) {
                if (Bundle->unscheduledDepsInBundle() == 0) {
                  assert(!Bundle->isScheduled() &&
                         "already scheduled bundle gets ready");
                  ReadyList.insert(Bundle);
                  LLVM_DEBUG(dbgs()
                             << "SLP: gets ready: " << *Bundle << "\n");
                }
              }
              return;
            }
            assert(!Data->isScheduled() &&
                   "already scheduled bundle gets ready");
            assert(!isa<ScheduleCopyableData>(Data) &&
                   "Expected non-copyable data");
            ReadyList.insert(Data);
            LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n");
          }
        };

        // Decrements the deps of operand \p I of \p User at operand index
        // \p OpIdx. Copyable data modeling this use takes precedence over
        // the plain ScheduleData of \p I.
        auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx,
                                      Instruction *I) {
          if (!ScheduleCopyableDataMap.empty()) {
            SmallVector<ScheduleCopyableData *> CopyableData =
                getScheduleCopyableData(User, OperandIdx: OpIdx, V: I);
            for (ScheduleCopyableData *CD : CopyableData)
              DecrUnsched(CD, /*IsControl=*/false);
            if (!CopyableData.empty())
              return;
          }
          if (ScheduleData *OpSD = getScheduleData(I))
            DecrUnsched(OpSD, /*IsControl=*/false);
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (!Bundles.empty()) {
          auto *In = BundleMember->getInst();
          // Count uses of each instruction operand.
          SmallDenseMap<const Instruction *, unsigned> OperandsUses;
          unsigned TotalOpCount = 0;
          if (isa<ScheduleCopyableData>(Val: BundleMember)) {
            // Copyable data is used only once (uses itself).
            TotalOpCount = OperandsUses[In] = 1;
          } else {
            for (const Use &U : In->operands()) {
              if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
                auto Res = OperandsUses.try_emplace(Key: I, Args: 0);
                ++Res.first->getSecond();
                ++TotalOpCount;
              }
            }
          }
          // Decrement the unscheduled counter and insert to ready list if
          // ready.
          auto DecrUnschedForInst =
              [&](Instruction *I, TreeEntry *UserTE, unsigned OpIdx,
                  SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>>
                      &Checked) {
                if (!ScheduleCopyableDataMap.empty()) {
                  const EdgeInfo EI = {UserTE, OpIdx};
                  if (ScheduleCopyableData *CD =
                          getScheduleCopyableData(EI, V: I)) {
                    // Each (entity, operand index) pair is decremented at
                    // most once.
                    if (!Checked.insert(V: std::make_pair(x&: CD, y&: OpIdx)).second)
                      return;
                    DecrUnsched(CD, /*IsControl=*/false);
                    return;
                  }
                }
                auto It = OperandsUses.find(Val: I);
                assert(It != OperandsUses.end() && "Operand not found");
                if (It->second > 0) {
                  if (ScheduleData *OpSD = getScheduleData(I)) {
                    if (!Checked.insert(V: std::make_pair(x&: OpSD, y&: OpIdx)).second)
                      return;
                    --It->getSecond();
                    assert(TotalOpCount > 0 && "No more operands to decrement");
                    --TotalOpCount;
                    DecrUnsched(OpSD, /*IsControl=*/false);
                  } else {
                    // No schedule data, still consume the counted use.
                    --It->getSecond();
                    assert(TotalOpCount > 0 && "No more operands to decrement");
                    --TotalOpCount;
                  }
                }
              };

          SmallDenseSet<std::pair<const ScheduleEntity *, unsigned>> Checked;
          for (ScheduleBundle *Bundle : Bundles) {
            // All counted uses are consumed - nothing left to decrement.
            if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0)
              break;
            SmallPtrSet<Value *, 4> ParentsUniqueUsers;
            // Need to search for the lane since the tree entry can be
            // reordered.
            auto *It = find(Range&: Bundle->getTreeEntry()->Scalars, Val: In);
            bool IsNonSchedulableWithParentPhiNode =
                Bundle->getTreeEntry()->doesNotNeedToSchedule() &&
                Bundle->getTreeEntry()->UserTreeIndex &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->hasState() &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->State !=
                    TreeEntry::SplitVectorize &&
                Bundle->getTreeEntry()->UserTreeIndex.UserTE->getOpcode() ==
                    Instruction::PHI;
            do {
              int Lane =
                  std::distance(first: Bundle->getTreeEntry()->Scalars.begin(), last: It);
              assert(Lane >= 0 && "Lane not set");
              // Stores keep the reordering in ReorderIndices, remap the lane.
              if (isa<StoreInst>(Val: In) &&
                  !Bundle->getTreeEntry()->ReorderIndices.empty())
                Lane = Bundle->getTreeEntry()->ReorderIndices[Lane];
              assert(Lane < static_cast<int>(
                                Bundle->getTreeEntry()->Scalars.size()) &&
                     "Couldn't find extract lane");

              // Since vectorization tree is being built recursively this
              // assertion ensures that the tree entry has all operands set
              // before reaching this code. Couple of exceptions known at the
              // moment are extracts where their second (immediate) operand is
              // not added. Since immediates do not affect scheduler behavior
              // this is considered okay.
              assert(
                  In &&
                  (isa<ExtractValueInst, ExtractElementInst, CallBase>(In) ||
                   In->getNumOperands() ==
                       Bundle->getTreeEntry()->getNumOperands() ||
                   (isa<ZExtInst>(In) && Bundle->getTreeEntry()->getOpcode() ==
                                             Instruction::Select) ||
                   Bundle->getTreeEntry()->isCopyableElement(In)) &&
                  "Missed TreeEntry operands?");

              // Count the number of unique phi nodes, which are the parent for
              // parent entry, and exit, if all the unique phis are processed.
              if (IsNonSchedulableWithParentPhiNode) {
                const TreeEntry *ParentTE =
                    Bundle->getTreeEntry()->UserTreeIndex.UserTE;
                Value *User = ParentTE->Scalars[Lane];
                if (!ParentsUniqueUsers.insert(Ptr: User).second) {
                  It = std::find(first: std::next(x: It),
                                 last: Bundle->getTreeEntry()->Scalars.end(), val: In);
                  continue;
                }
              }

              // Decrement deps for every instruction operand of this lane.
              for (unsigned OpIdx :
                   seq<unsigned>(Size: Bundle->getTreeEntry()->getNumOperands()))
                if (auto *I = dyn_cast<Instruction>(
                        Val: Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) {
                  LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): "
                                    << *I << "\n");
                  DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx, Checked);
                }
              // If parent node is schedulable, it will be handled correctly.
              if (Bundle->getTreeEntry()->isCopyableElement(V: In))
                break;
              It = std::find(first: std::next(x: It),
                             last: Bundle->getTreeEntry()->Scalars.end(), val: In);
            } while (It != Bundle->getTreeEntry()->Scalars.end());
          }
        } else {
          // If BundleMember is a stand-alone instruction, no operand reordering
          // has taken place, so we directly access its operands.
          for (Use &U : BundleMember->getInst()->operands()) {
            if (auto *I = dyn_cast<Instruction>(Val: U.get())) {
              LLVM_DEBUG(dbgs()
                         << "SLP: check for readiness (def): " << *I << "\n");
              DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I);
            }
          }
        }
        // Handle the memory dependencies.
        auto *SD = dyn_cast<ScheduleData>(Val: BundleMember);
        if (!SD)
          return;
        SmallPtrSet<const ScheduleData *, 4> VisitedMemory;
        for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) {
          if (!VisitedMemory.insert(Ptr: MemoryDep).second)
            continue;
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): "
                            << *MemoryDep << "\n");
          DecrUnsched(MemoryDep);
        }
        // Handle the control dependencies.
        SmallPtrSet<const ScheduleData *, 4> VisitedControl;
        for (ScheduleData *Dep : SD->getControlDependencies()) {
          if (!VisitedControl.insert(Ptr: Dep).second)
            continue;
          // There are no more unscheduled dependencies after decrementing,
          // so we can put the dependent instruction into the ready list.
          LLVM_DEBUG(dbgs()
                     << "SLP: check for readiness (ctrl): " << *Dep << "\n");
          DecrUnsched(Dep, /*IsControl=*/true);
        }
      };
      // Stand-alone ScheduleData: wrap it into pseudo bundles, one per
      // matching tree entry, so operand lookup goes through the entry.
      if (auto *SD = dyn_cast<ScheduleData>(Val: Data)) {
        SD->setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
        SmallVector<std::unique_ptr<ScheduleBundle>> PseudoBundles;
        SmallVector<ScheduleBundle *> Bundles;
        Instruction *In = SD->getInst();
        ArrayRef<TreeEntry *> Entries = R.getTreeEntries(V: In);
        if (!Entries.empty()) {
          for (TreeEntry *TE : Entries) {
            if (!isa<ExtractValueInst, ExtractElementInst, CallBase>(Val: In) &&
                In->getNumOperands() != TE->getNumOperands())
              continue;
            auto &BundlePtr =
                PseudoBundles.emplace_back(Args: std::make_unique<ScheduleBundle>());
            BundlePtr->setTreeEntry(TE);
            BundlePtr->add(SD);
            Bundles.push_back(Elt: BundlePtr.get());
          }
        }
        ProcessBundleMember(SD, Bundles);
      } else {
        // A real bundle: mark it scheduled, then process each member whose
        // every containing bundle is already scheduled.
        ScheduleBundle &Bundle = *cast<ScheduleBundle>(Val: Data);
        Bundle.setScheduled(/*Scheduled=*/true);
        LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n");
        auto AreAllBundlesScheduled =
            [&](const ScheduleEntity *SD,
                ArrayRef<ScheduleBundle *> SDBundles) {
              if (isa<ScheduleCopyableData>(Val: SD))
                return true;
              return !SDBundles.empty() &&
                     all_of(SDBundles, [&](const ScheduleBundle *SDBundle) {
                       return SDBundle->isScheduled();
                     });
            };
        for (ScheduleEntity *SD : Bundle.getBundle()) {
          ArrayRef<ScheduleBundle *> SDBundles;
          if (!isa<ScheduleCopyableData>(Val: SD))
            SDBundles = getScheduleBundles(V: SD->getInst());
          if (AreAllBundlesScheduled(SD, SDBundles)) {
            SD->setScheduled(/*Scheduled=*/true);
            ProcessBundleMember(SD, isa<ScheduleCopyableData>(Val: SD) ? &Bundle
                                                                   : SDBundles);
          }
        }
      }
    }
5907
5908 /// Verify basic self consistency properties of the data structure.
5909 void verify() {
5910 if (!ScheduleStart)
5911 return;
5912
5913 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
5914 ScheduleStart->comesBefore(ScheduleEnd) &&
5915 "Not a valid scheduling region?");
5916
5917 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5918 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
5919 if (!Bundles.empty()) {
5920 for (ScheduleBundle *Bundle : Bundles) {
5921 assert(isInSchedulingRegion(*Bundle) &&
5922 "primary schedule data not in window?");
5923 Bundle->verify();
5924 }
5925 continue;
5926 }
5927 auto *SD = getScheduleData(I);
5928 if (!SD)
5929 continue;
5930 assert(isInSchedulingRegion(*SD) &&
5931 "primary schedule data not in window?");
5932 SD->verify();
5933 }
5934
5935 assert(all_of(ReadyInsts,
5936 [](const ScheduleEntity *Bundle) {
5937 return Bundle->isReady();
5938 }) &&
5939 "item in ready list not ready?");
5940 }
5941
5942 /// Put all instructions into the ReadyList which are ready for scheduling.
5943 template <typename ReadyListType>
5944 void initialFillReadyList(ReadyListType &ReadyList) {
5945 SmallPtrSet<ScheduleBundle *, 16> Visited;
5946 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
5947 ScheduleData *SD = getScheduleData(I);
5948 if (SD && SD->hasValidDependencies() && SD->isReady()) {
5949 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: I);
5950 !Bundles.empty()) {
5951 for (ScheduleBundle *Bundle : Bundles) {
5952 if (!Visited.insert(Ptr: Bundle).second)
5953 continue;
5954 if (Bundle->hasValidDependencies() && Bundle->isReady()) {
5955 ReadyList.insert(Bundle);
5956 LLVM_DEBUG(dbgs() << "SLP: initially in ready list: "
5957 << *Bundle << "\n");
5958 }
5959 }
5960 continue;
5961 }
5962 ReadyList.insert(SD);
5963 LLVM_DEBUG(dbgs()
5964 << "SLP: initially in ready list: " << *SD << "\n");
5965 }
5966 }
5967 }
5968
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    /// \param VL The list of scalar instructions.
    /// \param S The state of the instructions.
    /// \param EI The edge in the SLP graph or the user node/operand number.
    /// \returns a reference to the created bundle.
    ScheduleBundle &buildBundle(ArrayRef<Value *> VL,
                                const InstructionsState &S, const EdgeInfo &EI);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \param VL The scalars to bundle.
    /// \param SLP The SLP tree, used for dependency calculation.
    /// \param S The state of the instructions.
    /// \param EI The edge in the SLP graph or the user node/operand number.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleBundle *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S, const EdgeInfo &EI);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    /// \param InsertInReadyList If true, entities becoming ready are inserted
    /// into the ready list.
    /// \param ControlDeps Extra control dependencies to take into account.
    void calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList,
                               BoUpSLP *SLP,
                               ArrayRef<ScheduleData *> ControlDeps = {});

    /// Sets all instruction in the scheduling region to un-scheduled.
    void resetSchedule();
6007
    /// The basic block this scheduling data is associated with.
    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    SmallVector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand
    /// number) and the operand instruction, represented as copyable element.
    /// Owns the ScheduleCopyableData objects; the other maps below hold
    /// non-owning pointers into this map.
    SmallDenseMap<std::pair<EdgeInfo, const Value *>,
                  std::unique_ptr<ScheduleCopyableData>>
        ScheduleCopyableDataMap;

    /// Represents mapping between instruction and all related
    /// ScheduleCopyableData (for all uses in the tree, represented as copyable
    /// element). The SLP tree may contain several representations of the same
    /// instruction.
    SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInst;

    /// Represents mapping between user value and operand number, the operand
    /// value and all related ScheduleCopyableData. The relation is 1:n, because
    /// the same user may reference the same operand in different tree entries
    /// and the operand may be modelled by the different copyable data element.
    SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>,
                  SmallVector<ScheduleCopyableData *>>
        ScheduleCopyableDataMapByInstUser;

    /// Represents mapping between instruction and all related
    /// ScheduleCopyableData. It represents the mapping between the actual
    /// instruction and the last copyable data element in the chain. E.g., if
    /// the graph models the following instructions:
    /// %0 = non-add instruction ...
    /// ...
    /// %4 = add %3, 1
    /// %5 = add %4, 1
    /// %6 = insertelement poison, %0, 0
    /// %7 = insertelement %6, %5, 1
    /// And the graph is modeled as:
    /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ]
    ///    -> [1, 0] -> [%1, 0]
    ///
    /// this map will map %0 only to the copyable element <1>, which is the last
    /// user (direct user of the actual instruction). <0> uses <1>, so <1> will
    /// keep the map to <0>, not the %0.
    SmallDenseMap<const Instruction *,
                  SmallSetVector<ScheduleCopyableData *, 4>>
        ScheduleCopyableDataMapByUsers;

    /// Attaches ScheduleBundle to Instruction.
    SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>>
        ScheduledBundles;
    /// The list of ScheduleBundles. Owns the bundles referenced by
    /// ScheduledBundles.
    SmallVector<std::unique_ptr<ScheduleBundle>> ScheduledBundlesList;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleEntity *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region?  Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
6106 };
6107
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  /// \param R The SLP tree.
  /// \param BS The scheduling state of the block being scheduled.
  void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
6117
6118 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
6119 /// sorted SmallVectors of unsigned.
6120 struct OrdersTypeDenseMapInfo {
6121 static OrdersType getEmptyKey() {
6122 OrdersType V;
6123 V.push_back(Elt: ~1U);
6124 return V;
6125 }
6126
6127 static OrdersType getTombstoneKey() {
6128 OrdersType V;
6129 V.push_back(Elt: ~2U);
6130 return V;
6131 }
6132
6133 static unsigned getHashValue(const OrdersType &V) {
6134 return static_cast<unsigned>(hash_combine_range(R: V));
6135 }
6136
6137 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
6138 return LHS == RHS;
6139 }
6140 };
6141
  // Analysis and block reference.
  Function *F;                 // Function being vectorized.
  ScalarEvolution *SE;         // Scalar evolution analysis.
  TargetTransformInfo *TTI;    // Target cost/capability queries.
  TargetLibraryInfo *TLI;      // Target library function info.
  LoopInfo *LI;                // Loop structure info.
  DominatorTree *DT;           // Dominator tree.
  AssumptionCache *AC;         // Cached llvm.assume intrinsics.
  DemandedBits *DB;            // Demanded-bits analysis.
  const DataLayout *DL;        // Module data layout.
  OptimizationRemarkEmitter *ORE; // Remark emitter for diagnostics.

  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented. The values map to (width, signed) pairs,
  /// where "width" indicates the minimum bit width and "signed" is True if the
  /// value must be signed-extended, rather than zero-extended, back to its
  /// original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// type sizes, used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes, which supposed to be the roots of the new
  /// bitwidth analysis attempt, like trunc, IToFP or ICmp.
  DenseSet<unsigned> ExtraBitWidthNodes;
6182};
6183
6184template <> struct llvm::DenseMapInfo<BoUpSLP::EdgeInfo> {
6185 using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>;
6186 using SecondInfo = DenseMapInfo<unsigned>;
6187 static BoUpSLP::EdgeInfo getEmptyKey() {
6188 return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(),
6189 SecondInfo::getEmptyKey());
6190 }
6191
6192 static BoUpSLP::EdgeInfo getTombstoneKey() {
6193 return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(),
6194 SecondInfo::getTombstoneKey());
6195 }
6196
6197 static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) {
6198 return detail::combineHashValue(a: FirstInfo::getHashValue(PtrVal: Val.UserTE),
6199 b: SecondInfo::getHashValue(Val: Val.EdgeIdx));
6200 }
6201
6202 static bool isEqual(const BoUpSLP::EdgeInfo &LHS,
6203 const BoUpSLP::EdgeInfo &RHS) {
6204 return LHS == RHS;
6205 }
6206};
6207
/// GraphTraits adaptor that lets generic graph algorithms (e.g. GraphWriter)
/// traverse the SLP vectorization tree via each entry's UserTreeIndex edge.
template <> struct llvm::GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    // Kept to satisfy the iterator interface; children are resolved through
    // the EdgeInfo's UserTE pointer.
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  /// The entry node is the root of the vectorizable tree.
  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  // Each node has a single user edge (UserTreeIndex), so the child range is
  // the one-element span [&UserTreeIndex, &UserTreeIndex + 1).
  static ChildIteratorType child_begin(NodeRef N) {
    return {&N->UserTreeIndex, N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {&N->UserTreeIndex + 1, N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
};
6268
6269template <>
6270struct llvm::DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
6271 using TreeEntry = BoUpSLP::TreeEntry;
6272
6273 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
6274
6275 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
6276 std::string Str;
6277 raw_string_ostream OS(Str);
6278 OS << Entry->Idx << ".\n";
6279 if (isSplat(VL: Entry->Scalars))
6280 OS << "<splat> ";
6281 for (auto *V : Entry->Scalars) {
6282 OS << *V;
6283 if (llvm::any_of(Range: R->ExternalUses, P: [&](const BoUpSLP::ExternalUser &EU) {
6284 return EU.Scalar == V;
6285 }))
6286 OS << " <extract>";
6287 OS << "\n";
6288 }
6289 return Str;
6290 }
6291
6292 static std::string getNodeAttributes(const TreeEntry *Entry,
6293 const BoUpSLP *) {
6294 if (Entry->isGather())
6295 return "color=red";
6296 if (Entry->State == TreeEntry::ScatterVectorize ||
6297 Entry->State == TreeEntry::StridedVectorize ||
6298 Entry->State == TreeEntry::CompressVectorize)
6299 return "color=blue";
6300 return "";
6301 }
6302};
6303
/// Destructor: physically erases all instructions queued in
/// DeletedInstructions and recursively removes scalar code that became dead
/// because of vectorization.
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instruction back to erase them from parent and
      // memory later.
      if (isa<PHINode>(Val: I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(BB&: F->getEntryBlock(),
                        InsertPos: F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(InsertPos: F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    // Collect operands that become trivially dead once this instruction is
    // gone; they are cleaned up after the erase loop below.
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(Val: U.get());
      if (Op && !DeletedInstructions.count(V: Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(I: Op, TLI))
        DeadInsts.emplace_back(Args&: Op);
    }
    // Drop this instruction's uses first so mutual references between
    // deleted instructions do not trip the use_empty() assertion below.
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}
6341
6342/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
6343/// contains original mask for the scalars reused in the node. Procedure
6344/// transform this mask in accordance with the given \p Mask.
6345static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
6346 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
6347 "Expected non-empty mask.");
6348 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
6349 Prev.swap(RHS&: Reuses);
6350 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
6351 if (Mask[I] != PoisonMaskElem)
6352 Reuses[Mask[I]] = Prev[I];
6353}
6354
/// Reorders the given \p Order according to the given \p Mask. \p Order - is
/// the original order of the scalars. Procedure transforms the provided order
/// in accordance with the given \p Mask. If the resulting \p Order is just an
/// identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      // An empty order means identity.
      PrevOrder.resize(N: Sz);
      std::iota(first: PrevOrder.begin(), last: PrevOrder.end(), value: 0);
    } else {
      PrevOrder.swap(RHS&: Order);
    }
    // Sz is used as the "undefined" marker for masked-out positions.
    Order.assign(NumElts: Sz, Elt: Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    // Identity (modulo undefined entries) is canonically an empty order.
    if (all_of(Range: enumerate(First&: Order), P: [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  // Top order: work on the inverse permutation, apply the mask, then
  // invert the result back into an order.
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(N: Sz);
    std::iota(first: MaskOrder.begin(), last: MaskOrder.end(), value: 0);
  } else {
    inversePermutation(Indices: Order, Mask&: MaskOrder);
  }
  reorderReuses(Reuses&: MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(Mask: MaskOrder, NumSrcElts: Sz)) {
    Order.clear();
    return;
  }
  Order.assign(NumElts: Sz, Elt: Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
6402
/// Tries to derive an order for gather node \p TE from already vectorized
/// entries / extractelement sources that (partially) match its scalars, so
/// the gather can reuse existing vectors through a cheap permutation.
/// Returns std::nullopt if no profitable single-source order exists.
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE,
                                  bool TopToBottom, bool IgnoreReorder) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  size_t NumScalars = GatheredScalars.size();
  if (!isValidElementType(Ty: ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, VF: NumScalars);
  unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: NumScalars);
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(TE: &TE, VL: GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  // NumScalars serves as the "undefined" marker in CurrentOrder.
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(VL: TE.Scalars)) {
    // If the full matched node in whole tree rotation - no need to consider the
    // matching order, rotating the whole tree.
    if (TopToBottom)
      return std::nullopt;
    // No need to keep the order for the same user node.
    if (Entries.front().front()->UserTreeIndex.UserTE ==
        TE.UserTreeIndex.UserTE)
      return std::nullopt;
    // No need to keep the order for the matched root node, if it can be freely
    // reordered.
    if (!IgnoreReorder && Entries.front().front()->Idx == 0)
      return std::nullopt;
    // If shuffling 2 elements only and the matching node has reverse reuses -
    // no need to count order, both work fine.
    if (!Entries.front().front()->ReuseShuffleIndices.empty() &&
        TE.getVectorFactor() == 2 && Mask.size() == 2 &&
        any_of(Range: enumerate(First: Entries.front().front()->ReuseShuffleIndices),
               P: [](const auto &P) {
                 return P.value() % 2 != static_cast<int>(P.index()) % 2;
               }))
      return std::nullopt;

    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(first: CurrentOrder.begin(), last: CurrentOrder.end(), value: 0);
    return CurrentOrder;
  }
  // True if every defined element of Mask refers to the same source lane.
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Range&: Mask, P: [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  // Turns the per-part shuffle Mask into order entries in CurrentOrder;
  // parts that need more than one source vector are marked in
  // ShuffledSubMasks and their slice of the order is invalidated.
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(Begin: 0, End: NumParts)) {
      if (ShuffledSubMasks.test(Idx: I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(Size: CurrentOrder.size(), PartNumElems: PartSz, Part: I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(N: I * PartSz, M: Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Range&: Slice, P: not_equal_to(Arg&: NumScalars))) {
        llvm::fill(Range&: Slice, Value&: NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as much elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Size: Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(Val: V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      // Round the smallest used index down to a part boundary.
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        llvm::fill(Range&: Slice, Value&: NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Size: Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        llvm::fill(Range&: Slice, Value&: NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(Size: NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          // VF of a part is the widest source vector among the
          // extractelements feeding that part.
          unsigned VF = 0;
          unsigned Sz = getNumElems(Size: TE.getVectorFactor(), PartNumElems: PartSz, Part: I);
          for (unsigned Idx : seq<unsigned>(Size: Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(first: TE.ReorderIndices.begin(),
                                last: find(Range: TE.ReorderIndices, Val: K));
            auto *EI = dyn_cast<ExtractElementInst>(Val: TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(a: VF, b: cast<VectorType>(Val: EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(a: Entries[I].front()->getVectorFactor(),
                      b: Entries[I].back()->getVectorFactor());
    });
  // Give up if too many positions stayed undefined or every part needed
  // multiple source vectors.
  unsigned NumUndefs = count(Range&: CurrentOrder, Element: NumScalars);
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
6588
6589static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
6590 const TargetLibraryInfo &TLI,
6591 bool CompareOpcodes = true) {
6592 if (getUnderlyingObject(V: Ptr1, MaxLookup: RecursionMaxDepth) !=
6593 getUnderlyingObject(V: Ptr2, MaxLookup: RecursionMaxDepth))
6594 return false;
6595 auto *GEP1 = dyn_cast<GetElementPtrInst>(Val: Ptr1);
6596 auto *GEP2 = dyn_cast<GetElementPtrInst>(Val: Ptr2);
6597 return (!GEP1 || GEP1->getNumOperands() == 2) &&
6598 (!GEP2 || GEP2->getNumOperands() == 2) &&
6599 (((!GEP1 || isConstant(V: GEP1->getOperand(i_nocapture: 1))) &&
6600 (!GEP2 || isConstant(V: GEP2->getOperand(i_nocapture: 1)))) ||
6601 !CompareOpcodes ||
6602 (GEP1 && GEP2 &&
6603 getSameOpcode(VL: {GEP1->getOperand(i_nocapture: 1), GEP2->getOperand(i_nocapture: 1)}, TLI)));
6604}
6605
6606/// Calculates minimal alignment as a common alignment.
6607template <typename T>
6608static Align computeCommonAlignment(ArrayRef<Value *> VL) {
6609 Align CommonAlignment = cast<T>(VL.consume_front())->getAlign();
6610 for (Value *V : VL)
6611 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
6612 return CommonAlignment;
6613}
6614
6615/// Check if \p Order represents reverse order.
6616static bool isReverseOrder(ArrayRef<unsigned> Order) {
6617 assert(!Order.empty() &&
6618 "Order is empty. Please check it before using isReverseOrder.");
6619 unsigned Sz = Order.size();
6620 return all_of(Range: enumerate(First&: Order), P: [&](const auto &Pair) {
6621 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
6622 });
6623}
6624
/// Checks if the provided list of pointers \p Pointers represents the strided
/// pointers for type ElemTy. If they are not, nullptr is returned.
/// Otherwise, SCEV* of the stride value is returned.
/// If `PointerOps` can be rearanged into the following sequence:
/// ```
/// %x + c_0 * stride,
/// %x + c_1 * stride,
/// %x + c_2 * stride
/// ...
/// ```
/// where each `c_i` is constant. The `Coeffs` will contain `c_0, c_1, c_2, ..`
/// and the SCEV of the `stride` will be returned.
/// \p SortedIndices is filled with the permutation that sorts the pointers by
/// offset, and left empty if they are already consecutive.
static const SCEV *calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                                     const DataLayout &DL, ScalarEvolution &SE,
                                     SmallVectorImpl<unsigned> &SortedIndices,
                                     SmallVectorImpl<int64_t> &Coeffs) {
  assert(Coeffs.size() == PointerOps.size() &&
         "Coeffs vector needs to be of correct size");
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
  // addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(V: Ptr);
    if (!PtrSCEV)
      return nullptr;
    SCEVs.push_back(Elt: PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Val: Diff))
      return nullptr;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Val: Diff1))
      return nullptr;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(LHS: PtrSCEVHighest, RHS: PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Val: Dist))
    return nullptr;
  int Size = DL.getTypeStoreSize(Ty: ElemTy);
  // Extracts the stride from Dist assuming Dist == stride * Multiplier;
  // returns nullptr when no exact factorization exists.
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Val: Dist)) {
      if (M->getOperand(i: 0) == Multiplier)
        return M->getOperand(i: 1);
      if (M->getOperand(i: 1) == Multiplier)
        return M->getOperand(i: 0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Ty: Dist->getType(), V: 1);
    return SE.getUDivExactExpr(LHS: Dist, RHS: Multiplier);
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Ty: Dist->getType(), V: Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return nullptr;
  }
  // Only non-constant runtime strides are interesting here; constant strides
  // are handled by other analyses.
  if (!Stride || isa<SCEVConstant>(Val: Stride))
    return nullptr;
  // Iterate through all pointers and check if all distances are
  // unique multiple of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const auto [Idx, PtrSCEV] : enumerate(First&: SCEVs)) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(LHS: PtrSCEV, RHS: PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return nullptr;
      const auto *SC = dyn_cast<SCEVConstant>(Val: Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(Val: SC))
        return nullptr;
      Coeffs[Idx] = (int64_t)SC->getAPInt().getLimitedValue();
      // Verify the factorization exactly reconstructs the pointer:
      // PtrSCEV == PtrSCEVLowest + Stride * SC.
      if (!SE.getMinusSCEV(LHS: PtrSCEV, RHS: SE.getAddExpr(LHS: PtrSCEVLowest,
                                              RHS: SE.getMulExpr(LHS: Stride, RHS: SC)))
               ->isZero())
        return nullptr;
      Dist = SC->getAPInt().getZExtValue();
    } else {
      // The lowest pointer is the base: coefficient 0.
      Coeffs[Idx] = 0;
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return nullptr;
    auto Res = Offsets.emplace(args&: Dist, args&: Cnt);
    if (!Res.second)
      return nullptr;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(x: Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return nullptr;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(N: PointerOps.size());
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  return Stride;
}
6750
6751static std::pair<InstructionCost, InstructionCost>
6752getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
6753 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
6754 Type *ScalarTy, VectorType *VecTy);
6755
/// Returns the cost of the shuffle instructions with the given \p Kind, vector
/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert
/// subvector pattern.
static InstructionCost
getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
               VectorType *Tp, ArrayRef<int> Mask = {},
               TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
               int Index = 0, VectorType *SubTp = nullptr,
               ArrayRef<const Value *> Args = {}) {
  // With an explicit mask, the destination width is defined by the mask.
  VectorType *DstTy = Tp;
  if (!Mask.empty())
    DstTy = FixedVectorType::get(ElementType: Tp->getScalarType(), NumElts: Mask.size());

  if (Kind != TTI::SK_PermuteTwoSrc)
    return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
                              Args);
  int NumSrcElts = Tp->getElementCount().getKnownMinValue();
  int NumSubElts;
  // A two-source permute that really inserts one source into the other is
  // costed as SK_InsertSubvector, which targets typically handle cheaper.
  if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
                             Mask, NumSrcElts, NumSubElts, Index)) {
    if (Index + NumSubElts > NumSrcElts &&
        Index + NumSrcElts <= static_cast<int>(Mask.size()))
      return TTI.getShuffleCost(Kind: TTI::SK_InsertSubvector, DstTy, SrcTy: Tp, Mask,
                                CostKind: TTI::TCK_RecipThroughput, Index, SubTp: Tp);
  }
  return TTI.getShuffleCost(Kind, DstTy, SrcTy: Tp, Mask, CostKind, Index, SubTp,
                            Args);
}
6784
/// This is similar to TargetTransformInfo::getScalarizationOverhead, but if
/// ScalarTy is a FixedVectorType, a vector will be inserted or extracted
/// instead of a scalar.
static InstructionCost
getScalarizationOverhead(const TargetTransformInfo &TTI, Type *ScalarTy,
                         VectorType *Ty, const APInt &DemandedElts, bool Insert,
                         bool Extract, TTI::TargetCostKind CostKind,
                         bool ForPoisonSrc = true, ArrayRef<Value *> VL = {}) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  assert(getNumElements(ScalarTy) * DemandedElts.getBitWidth() ==
             getNumElements(Ty) &&
         "Incorrect usage.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
    // of CreateInsertElement.
    unsigned ScalarTyNumElements = VecTy->getNumElements();
    InstructionCost Cost = 0;
    // Sum a subvector insert/extract per demanded "scalar" (which is itself
    // a vector of ScalarTyNumElements lanes).
    for (unsigned I : seq(Size: DemandedElts.getBitWidth())) {
      if (!DemandedElts[I])
        continue;
      if (Insert)
        Cost += getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: Ty, Mask: {}, CostKind,
                               Index: I * ScalarTyNumElements, SubTp: VecTy);
      if (Extract)
        Cost += getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: Ty, Mask: {}, CostKind,
                               Index: I * ScalarTyNumElements, SubTp: VecTy);
    }
    return Cost;
  }
  // Plain scalar elements: delegate directly to TTI.
  return TTI.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
                                      CostKind, ForPoisonSrc, VL);
}
6819
6820/// This is similar to TargetTransformInfo::getVectorInstrCost, but if ScalarTy
6821/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6822static InstructionCost getVectorInstrCost(
6823 const TargetTransformInfo &TTI, Type *ScalarTy, unsigned Opcode, Type *Val,
6824 TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar,
6825 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
6826 if (Opcode == Instruction::ExtractElement) {
6827 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
6828 assert(SLPReVec && "Only supported by REVEC.");
6829 assert(isa<VectorType>(Val) && "Val must be a vector type.");
6830 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
6831 Tp: cast<VectorType>(Val), Mask: {}, CostKind,
6832 Index: Index * VecTy->getNumElements(), SubTp: VecTy);
6833 }
6834 }
6835 return TTI.getVectorInstrCost(Opcode, Val, CostKind, Index, Scalar,
6836 ScalarUserAndIdx);
6837}
6838
6839/// This is similar to TargetTransformInfo::getExtractWithExtendCost, but if Dst
6840/// is a FixedVectorType, a vector will be extracted instead of a scalar.
6841static InstructionCost getExtractWithExtendCost(
6842 const TargetTransformInfo &TTI, unsigned Opcode, Type *Dst,
6843 VectorType *VecTy, unsigned Index,
6844 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
6845 if (auto *ScalarTy = dyn_cast<FixedVectorType>(Val: Dst)) {
6846 assert(SLPReVec && "Only supported by REVEC.");
6847 auto *SubTp =
6848 getWidenedType(ScalarTy: VecTy->getElementType(), VF: ScalarTy->getNumElements());
6849 return getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector, Tp: VecTy, Mask: {}, CostKind,
6850 Index: Index * ScalarTy->getNumElements(), SubTp) +
6851 TTI.getCastInstrCost(Opcode, Dst, Src: SubTp, CCH: TTI::CastContextHint::None,
6852 CostKind);
6853 }
6854 return TTI.getExtractWithExtendCost(Opcode, Dst, VecTy, Index, CostKind);
6855}
6856
/// Creates subvector insert. Generates shuffle using \p Generator or
/// using default shuffle.
static Value *createInsertVector(
    IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
  // Inserting poison into poison is a no-op.
  if (isa<PoisonValue>(Val: Vec) && isa<PoisonValue>(Val: V))
    return Vec;
  const unsigned SubVecVF = getNumElements(Ty: V->getType());
  // Create shuffle, insertvector requires that index is multiple of
  // the subvector length.
  const unsigned VecVF = getNumElements(Ty: Vec->getType());
  SmallVector<int> Mask(VecVF, PoisonMaskElem);
  if (isa<PoisonValue>(Val: Vec)) {
    // Destination is poison: a single widening shuffle of V placed at Index
    // (lanes <Index..Index+SubVecVF) is enough.
    auto *Begin = std::next(x: Mask.begin(), n: Index);
    std::iota(first: Begin, last: std::next(x: Begin, n: SubVecVF), value: 0);
    Vec = Builder.CreateShuffleVector(V, Mask);
    return Vec;
  }
  // Two-source mask: identity for Vec, with lanes [Index, Index + SubVecVF)
  // taken from V (indices >= VecVF refer to the second shuffle operand).
  std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
  std::iota(first: std::next(x: Mask.begin(), n: Index),
            last: std::next(x: Mask.begin(), n: Index + SubVecVF), value: VecVF);
  if (Generator)
    return Generator(Vec, V, Mask);
  // 1. Resize V to the size of Vec.
  SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
  std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: SubVecVF), value: 0);
  V = Builder.CreateShuffleVector(V, Mask: ResizeMask);
  // 2. Insert V into Vec.
  return Builder.CreateShuffleVector(V1: Vec, V2: V, Mask);
}
6887
6888/// Generates subvector extract using \p Generator or using default shuffle.
6889static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
6890 unsigned SubVecVF, unsigned Index) {
6891 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
6892 std::iota(first: Mask.begin(), last: Mask.end(), value: Index);
6893 return Builder.CreateShuffleVector(V: Vec, Mask);
6894}
6895
/// Builds compress-like mask for shuffles for the given \p PointerOps, ordered
/// with \p Order.
/// \return true if the mask represents strided access, false - otherwise.
static bool buildCompressMask(ArrayRef<Value *> PointerOps,
                              ArrayRef<unsigned> Order, Type *ScalarTy,
                              const DataLayout &DL, ScalarEvolution &SE,
                              SmallVectorImpl<int> &CompressMask) {
  const unsigned Sz = PointerOps.size();
  CompressMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  // The first element always set.
  CompressMask[0] = 0;
  // Check if the mask represents strided access.
  // Tri-state: engaged-with-0 = "stride not yet known", engaged-nonzero =
  // candidate stride, disengaged = proven non-strided.
  std::optional<unsigned> Stride = 0;
  Value *Ptr0 = Order.empty() ? PointerOps.front() : PointerOps[Order.front()];
  for (unsigned I : seq<unsigned>(Begin: 1, End: Sz)) {
    Value *Ptr = Order.empty() ? PointerOps[I] : PointerOps[Order[I]];
    // Element distance from the first pointer; expected to be non-negative
    // for ordered pointers (negative diffs would wrap on the cast below —
    // presumably Order guarantees Ptr0 is lowest; TODO confirm).
    std::optional<int64_t> OptPos =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: Ptr, DL, SE);
    if (!OptPos || OptPos > std::numeric_limits<unsigned>::max())
      return false;
    unsigned Pos = static_cast<unsigned>(*OptPos);
    CompressMask[I] = Pos;
    if (!Stride)
      continue;
    if (*Stride == 0) {
      // First non-zero distance defines the candidate stride.
      *Stride = Pos;
      continue;
    }
    if (Pos != *Stride * I)
      Stride.reset();
  }
  return Stride.has_value();
}
6929
/// Checks if the \p VL can be transformed to a (masked)load + compress or
/// (masked) interleaved load.
/// \param IsMasked set to true if a masked load is required for safety.
/// \param InterleaveFactor set to the interleave factor when an interleaved
/// load is chosen, 0 otherwise.
/// \param CompressMask filled with the shuffle mask to apply after the load.
/// \param LoadVecTy set to the actual (wider) vector type to load.
static bool isMaskedLoadCompress(
    ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
    ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
    const DataLayout &DL, ScalarEvolution &SE, AssumptionCache &AC,
    const DominatorTree &DT, const TargetLibraryInfo &TLI,
    const function_ref<bool(Value *)> AreAllUsersVectorized, bool &IsMasked,
    unsigned &InterleaveFactor, SmallVectorImpl<int> &CompressMask,
    VectorType *&LoadVecTy) {
  InterleaveFactor = 0;
  Type *ScalarTy = VL.front()->getType();
  const size_t Sz = VL.size();
  auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  SmallVector<int> Mask;
  if (!Order.empty())
    inversePermutation(Indices: Order, Mask);
  // Check external uses.
  // If extracting a scalar back out of the vector is costlier than keeping
  // the scalar load, the transformation is not profitable.
  for (const auto [I, V] : enumerate(First&: VL)) {
    if (AreAllUsersVectorized(V))
      continue;
    InstructionCost ExtractCost =
        TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VecTy, CostKind,
                               Index: Mask.empty() ? I : Mask[I]);
    InstructionCost ScalarCost =
        TTI.getInstructionCost(U: cast<Instruction>(Val: V), CostKind);
    if (ExtractCost <= ScalarCost)
      return false;
  }
  Value *Ptr0;
  Value *PtrN;
  if (Order.empty()) {
    Ptr0 = PointerOps.front();
    PtrN = PointerOps.back();
  } else {
    Ptr0 = PointerOps[Order.front()];
    PtrN = PointerOps[Order.back()];
  }
  std::optional<int64_t> Diff =
      getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL, SE);
  if (!Diff)
    return false;
  const size_t MaxRegSize =
      TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector)
          .getFixedValue();
  // Check for very large distances between elements.
  if (*Diff / Sz >= MaxRegSize / 8)
    return false;
  // The wide load covers the whole [Ptr0, PtrN] range.
  LoadVecTy = getWidenedType(ScalarTy, VF: *Diff + 1);
  auto *LI = cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()]);
  Align CommonAlignment = LI->getAlign();
  // A masked load is required when the whole wide range cannot be proven
  // dereferenceable.
  IsMasked = !isSafeToLoadUnconditionally(
      V: Ptr0, Ty: LoadVecTy, Alignment: CommonAlignment, DL,
      ScanFrom: cast<LoadInst>(Val: Order.empty() ? VL.back() : VL[Order.back()]), AC: &AC, DT: &DT,
      TLI: &TLI);
  if (IsMasked && !TTI.isLegalMaskedLoad(DataType: LoadVecTy, Alignment: CommonAlignment,
                                         AddressSpace: LI->getPointerAddressSpace()))
    return false;
  // TODO: perform the analysis of each scalar load for better
  // safe-load-unconditionally analysis.
  bool IsStrided =
      buildCompressMask(PointerOps, Order, ScalarTy, DL, SE, CompressMask);
  assert(CompressMask.size() >= 2 && "At least two elements are required");
  SmallVector<Value *> OrderedPointerOps(PointerOps);
  if (!Order.empty())
    reorderScalars(Scalars&: OrderedPointerOps, Mask);
  auto [ScalarGEPCost, VectorGEPCost] =
      getGEPCosts(TTI, Ptrs: OrderedPointerOps, BasePtr: OrderedPointerOps.front(),
                  Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy: LoadVecTy);
  // The cost of scalar loads.
  InstructionCost ScalarLoadsCost =
      std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
                      binary_op: [&](InstructionCost C, Value *V) {
                        return C + TTI.getInstructionCost(U: cast<Instruction>(Val: V),
                                                          CostKind);
                      }) +
      ScalarGEPCost;
  // Baseline: gather from scalar loads (inserts + the scalar loads).
  APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
  InstructionCost GatherCost =
      getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
                               /*Insert=*/true,
                               /*Extract=*/false, CostKind) +
      ScalarLoadsCost;
  InstructionCost LoadCost = 0;
  if (IsMasked) {
    LoadCost = TTI.getMemIntrinsicInstrCost(
        MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                   CommonAlignment,
                                   LI->getPointerAddressSpace()),
        CostKind);
  } else {
    LoadCost =
        TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
                            AddressSpace: LI->getPointerAddressSpace(), CostKind);
  }
  if (IsStrided && !IsMasked && Order.empty()) {
    // Check for potential segmented(interleaved) loads.
    // Try a padded-to-full-register type first; fall back to the exact type
    // when the padded load would not be safe.
    VectorType *AlignedLoadVecTy = getWidenedType(
        ScalarTy, VF: getFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: *Diff + 1));
    if (!isSafeToLoadUnconditionally(V: Ptr0, Ty: AlignedLoadVecTy, Alignment: CommonAlignment,
                                     DL, ScanFrom: cast<LoadInst>(Val: VL.back()), AC: &AC, DT: &DT,
                                     TLI: &TLI))
      AlignedLoadVecTy = LoadVecTy;
    if (TTI.isLegalInterleavedAccessType(VTy: AlignedLoadVecTy, Factor: CompressMask[1],
                                         Alignment: CommonAlignment,
                                         AddrSpace: LI->getPointerAddressSpace())) {
      InstructionCost InterleavedCost =
          VectorGEPCost + TTI.getInterleavedMemoryOpCost(
                              Opcode: Instruction::Load, VecTy: AlignedLoadVecTy,
                              Factor: CompressMask[1], Indices: {}, Alignment: CommonAlignment,
                              AddressSpace: LI->getPointerAddressSpace(), CostKind, UseMaskForCond: IsMasked);
      if (InterleavedCost < GatherCost) {
        InterleaveFactor = CompressMask[1];
        LoadVecTy = AlignedLoadVecTy;
        return true;
      }
    }
  }
  InstructionCost CompressCost = ::getShuffleCost(
      TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: LoadVecTy, Mask: CompressMask, CostKind);
  // Compose the reordering into the compress mask so a single shuffle
  // both compresses and reorders.
  if (!Order.empty()) {
    SmallVector<int> NewMask(Sz, PoisonMaskElem);
    for (unsigned I : seq<unsigned>(Size: Sz)) {
      NewMask[I] = CompressMask[Mask[I]];
    }
    CompressMask.swap(RHS&: NewMask);
  }
  InstructionCost TotalVecCost = VectorGEPCost + LoadCost + CompressCost;
  return TotalVecCost < GatherCost;
}
7061
7062/// Checks if the \p VL can be transformed to a (masked)load + compress or
7063/// (masked) interleaved load.
7064static bool
7065isMaskedLoadCompress(ArrayRef<Value *> VL, ArrayRef<Value *> PointerOps,
7066 ArrayRef<unsigned> Order, const TargetTransformInfo &TTI,
7067 const DataLayout &DL, ScalarEvolution &SE,
7068 AssumptionCache &AC, const DominatorTree &DT,
7069 const TargetLibraryInfo &TLI,
7070 const function_ref<bool(Value *)> AreAllUsersVectorized) {
7071 bool IsMasked;
7072 unsigned InterleaveFactor;
7073 SmallVector<int> CompressMask;
7074 VectorType *LoadVecTy;
7075 return isMaskedLoadCompress(VL, PointerOps, Order, TTI, DL, SE, AC, DT, TLI,
7076 AreAllUsersVectorized, IsMasked, InterleaveFactor,
7077 CompressMask, LoadVecTy);
7078}
7079
7080/// Checks if strided loads can be generated out of \p VL loads with pointers \p
7081/// PointerOps:
7082/// 1. Target with strided load support is detected.
7083/// 2. The number of loads is greater than MinProfitableStridedLoads, or the
7084/// potential stride <= MaxProfitableLoadStride and the potential stride is
7085/// power-of-2 (to avoid perf regressions for the very small number of loads)
7086/// and max distance > number of loads, or potential stride is -1.
7087/// 3. The loads are ordered, or number of unordered loads <=
7088/// MaxProfitableUnorderedLoads, or loads are in reversed order. (this check is
7089/// to avoid extra costs for very expensive shuffles).
7090/// 4. Any pointer operand is an instruction with the users outside of the
7091/// current graph (for masked gathers extra extractelement instructions
7092/// might be required).
7093bool BoUpSLP::isStridedLoad(ArrayRef<Value *> PointerOps, Type *ScalarTy,
7094 Align Alignment, const int64_t Diff,
7095 const size_t Sz) const {
7096 if (Diff % (Sz - 1) != 0)
7097 return false;
7098
7099 // Try to generate strided load node.
7100 auto IsAnyPointerUsedOutGraph = any_of(Range&: PointerOps, P: [&](Value *V) {
7101 return isa<Instruction>(Val: V) && any_of(Range: V->users(), P: [&](User *U) {
7102 return !isVectorized(V: U) && !MustGather.contains(Ptr: U);
7103 });
7104 });
7105
7106 const uint64_t AbsoluteDiff = std::abs(i: Diff);
7107 auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
7108 if (IsAnyPointerUsedOutGraph ||
7109 (AbsoluteDiff > Sz &&
7110 (Sz > MinProfitableStridedLoads ||
7111 (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
7112 AbsoluteDiff % Sz == 0 && has_single_bit(Value: AbsoluteDiff / Sz)))) ||
7113 Diff == -(static_cast<int64_t>(Sz) - 1)) {
7114 int64_t Stride = Diff / static_cast<int64_t>(Sz - 1);
7115 if (Diff != Stride * static_cast<int64_t>(Sz - 1))
7116 return false;
7117 if (!TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment))
7118 return false;
7119 return true;
7120 }
7121 return false;
7122}
7123
/// Checks whether \p PointerOps can be lowered to a single strided load with
/// a compile-time-constant stride, possibly after merging groups of
/// consecutive elements into a wider integer scalar type. On success, the
/// constant stride and the widened vector type are recorded in \p SPtrInfo.
/// \p Diff is the distance between the first and the last pointer (in
/// \p ScalarTy elements); \p SortedIndices is empty if the pointers are
/// already sorted.
bool BoUpSLP::analyzeConstantStrideCandidate(
    const ArrayRef<Value *> PointerOps, Type *ScalarTy, Align Alignment,
    const SmallVectorImpl<unsigned> &SortedIndices, const int64_t Diff,
    Value *Ptr0, Value *PtrN, StridedPtrInfo &SPtrInfo) const {
  const size_t Sz = PointerOps.size();
  SmallVector<int64_t> SortedOffsetsFromBase(Sz);
  // Go through `PointerOps` in sorted order and record offsets from
  // PointerOps[0]. We use PointerOps[0] rather than Ptr0 because
  // sortPtrAccesses only validates getPointersDiff for pairs relative to
  // PointerOps[0]. This is safe since only offset differences are used below.
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *Ptr =
        SortedIndices.empty() ? PointerOps[I] : PointerOps[SortedIndices[I]];
    std::optional<int64_t> Offset =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr, DL: *DL, SE&: *SE);
    assert(Offset && "sortPtrAccesses should have validated this pointer");
    SortedOffsetsFromBase[I] = *Offset;
  }

  // The code below checks that `SortedOffsetsFromBase` looks as follows:
  // ```
  // [
  //   (e_{0, 0}, e_{0, 1}, ..., e_{0, GroupSize - 1}), // first group
  //   (e_{1, 0}, e_{1, 1}, ..., e_{1, GroupSize - 1}), // second group
  //   ...
  //   (e_{NumGroups - 1, 0}, e_{NumGroups - 1, 1}, ..., e_{NumGroups - 1,
  //   GroupSize - 1}), // last group
  // ]
  // ```
  // The distance between consecutive elements within each group should all be
  // the same `StrideWithinGroup`. The distance between the first elements of
  // consecutive groups should all be the same `StrideBetweenGroups`.

  int64_t StrideWithinGroup =
      SortedOffsetsFromBase[1] - SortedOffsetsFromBase[0];
  // Determine size of the first group. Later we will check that all other
  // groups have the same size.
  auto IsEndOfGroupIndex = [=, &SortedOffsetsFromBase](unsigned Idx) {
    return SortedOffsetsFromBase[Idx] - SortedOffsetsFromBase[Idx - 1] !=
           StrideWithinGroup;
  };
  auto Indices = seq<unsigned>(Begin: 1, End: Sz);
  auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
  unsigned GroupSize = FoundIt != Indices.end() ? *FoundIt : Sz;

  unsigned VecSz = Sz;
  Type *NewScalarTy = ScalarTy;

  // Quick detour: at this point we can say what the type of strided load would
  // be if all the checks pass. Check if this type is legal for the target.
  bool NeedsWidening = Sz != GroupSize;
  if (NeedsWidening) {
    // Groups must tile the whole sequence and be internally consecutive
    // (unit stride) to be merged into a single wider integer element.
    if (Sz % GroupSize != 0)
      return false;

    if (StrideWithinGroup != 1)
      return false;
    VecSz = Sz / GroupSize;
    NewScalarTy = Type::getIntNTy(
        C&: SE->getContext(),
        N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * GroupSize);
  }

  if (!isStridedLoad(PointerOps, ScalarTy: NewScalarTy, Alignment, Diff, Sz: VecSz))
    return false;

  int64_t StrideIntVal = StrideWithinGroup;
  if (NeedsWidening) {
    // Continue with checking the "shape" of `SortedOffsetsFromBase`.
    // Check that the strides between groups are all the same.
    unsigned CurrentGroupStartIdx = GroupSize;
    int64_t StrideBetweenGroups =
        SortedOffsetsFromBase[GroupSize] - SortedOffsetsFromBase[0];
    StrideIntVal = StrideBetweenGroups;
    for (; CurrentGroupStartIdx < Sz; CurrentGroupStartIdx += GroupSize) {
      if (SortedOffsetsFromBase[CurrentGroupStartIdx] -
              SortedOffsetsFromBase[CurrentGroupStartIdx - GroupSize] !=
          StrideBetweenGroups)
        return false;
    }

    // Check that every group has exactly `GroupSize` elements (the first
    // group's size was used as the reference above).
    auto CheckGroup = [=](const unsigned StartIdx) -> bool {
      auto Indices = seq<unsigned>(Begin: StartIdx + 1, End: Sz);
      auto FoundIt = llvm::find_if(Range&: Indices, P: IsEndOfGroupIndex);
      unsigned GroupEndIdx = FoundIt != Indices.end() ? *FoundIt : Sz;
      return GroupEndIdx - StartIdx == GroupSize;
    };
    for (unsigned I = 0; I < Sz; I += GroupSize) {
      if (!CheckGroup(I))
        return false;
    }
  }

  Type *StrideTy = DL->getIndexType(PtrTy: Ptr0->getType());
  SPtrInfo.StrideVal = ConstantInt::getSigned(Ty: StrideTy, V: StrideIntVal);
  SPtrInfo.Ty = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
  return true;
}
7222
/// Checks whether \p PointerOps, whose relative offsets are not all
/// compile-time constants, can still be lowered to a single run-time strided
/// load. Pointers are partitioned into groups by their constant SCEV offset;
/// every group must share the same run-time stride and the same set of
/// coefficients (as computed by calculateRtStride). On success, fills
/// \p SortedIndices with the interleaved sorted order and records the
/// run-time stride SCEV and the widened vector type in \p SPtrInfo.
bool BoUpSLP::analyzeRtStrideCandidate(ArrayRef<Value *> PointerOps,
                                       Type *ScalarTy, Align CommonAlignment,
                                       SmallVectorImpl<unsigned> &SortedIndices,
                                       StridedPtrInfo &SPtrInfo) const {
  // If each value in `PointerOps` is of the form `%x + Offset` where `Offset`
  // is constant, we partition `PointerOps` sequence into subsequences of
  // pointers with the same offset. For each offset we record values from
  // `PointerOps` and their indices in `PointerOps`.
  SmallDenseMap<int64_t, std::pair<SmallVector<Value *>, SmallVector<unsigned>>>
      OffsetToPointerOpIdxMap;
  for (auto [Idx, Ptr] : enumerate(First&: PointerOps)) {
    const SCEV *PtrSCEV = SE->getSCEV(V: Ptr);
    if (!PtrSCEV)
      return false;

    const auto *Add = dyn_cast<SCEVAddExpr>(Val: PtrSCEV);
    int64_t Offset = 0;
    if (Add) {
      // `Offset` is non-zero.
      for (int I : seq<int>(Size: Add->getNumOperands())) {
        const auto *SC = dyn_cast<SCEVConstant>(Val: Add->getOperand(i: I));
        if (!SC)
          continue;
        Offset = SC->getAPInt().getSExtValue();
        // Guard against near-overflow offsets; treat them as zero-offset.
        if (Offset >= std::numeric_limits<int64_t>::max() - 1) {
          Offset = 0;
          continue;
        }
        break;
      }
    }
    OffsetToPointerOpIdxMap[Offset].first.push_back(Elt: Ptr);
    OffsetToPointerOpIdxMap[Offset].second.push_back(Elt: Idx);
  }
  unsigned NumOffsets = OffsetToPointerOpIdxMap.size();

  // Quick detour: at this point we can say what the type of strided load would
  // be if all the checks pass. Check if this type is legal for the target.
  const unsigned Sz = PointerOps.size();
  unsigned VecSz = Sz;
  Type *NewScalarTy = ScalarTy;
  if (NumOffsets > 1) {
    if (Sz % NumOffsets != 0)
      return false;
    VecSz = Sz / NumOffsets;
    NewScalarTy = Type::getIntNTy(
        C&: SE->getContext(),
        N: DL->getTypeSizeInBits(Ty: ScalarTy).getFixedValue() * NumOffsets);
  }
  FixedVectorType *StridedLoadTy = getWidenedType(ScalarTy: NewScalarTy, VF: VecSz);
  if (Sz <= MinProfitableStridedLoads || !TTI->isTypeLegal(Ty: StridedLoadTy) ||
      !TTI->isLegalStridedLoadStore(DataType: StridedLoadTy, Alignment: CommonAlignment))
    return false;

  // Check if the offsets are contiguous and that each group has the required
  // size.
  SmallVector<int64_t> SortedOffsetsV(NumOffsets);
  for (auto [Idx, MapPair] : enumerate(First&: OffsetToPointerOpIdxMap)) {
    if (MapPair.second.first.size() != VecSz)
      return false;
    SortedOffsetsV[Idx] = MapPair.first;
  }
  sort(C&: SortedOffsetsV);

  if (NumOffsets > 1) {
    for (int I : seq<int>(Begin: 1, End: SortedOffsetsV.size())) {
      if (SortedOffsetsV[I] - SortedOffsetsV[I - 1] != 1)
        return false;
    }
  }

  // Introduce some notation for the explanations below. Let `PointerOps_j`
  // denote the subsequence of `PointerOps` with offsets equal to
  // `SortedOffsetsV[j]`. Let `SortedIndices_j` be a such that the sequence
  // ```
  // PointerOps_j[SortedIndices_j[0]],
  // PointerOps_j[SortedIndices_j[1]],
  // PointerOps_j[SortedIndices_j[2]],
  // ...
  // ```
  // is sorted. Also, let `IndicesInAllPointerOps_j` be the vector
  // of indices of the subsequence `PointerOps_j` in all of `PointerOps`,
  // i.e `PointerOps_j[i] = PointerOps[IndicesInAllPointerOps_j[i]]`.
  // The entire sorted `PointerOps` looks like this:
  // ```
  // PointerOps_0[SortedIndices_0[0]] = PointerOps[IndicesInAllPointerOps_0[0]],
  // PointerOps_1[SortedIndices_1[0]] = PointerOps[IndicesInAllPointerOps_1[0]],
  // PointerOps_2[SortedIndices_2[0]] = PointerOps[IndicesInAllPointerOps_2[0]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[0]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[0]],
  //
  // PointerOps_0[SortedIndices_0[1]] = PointerOps[IndicesInAllPointerOps_0[1]],
  // PointerOps_1[SortedIndices_1[1]] = PointerOps[IndicesInAllPointerOps_1[1]],
  // PointerOps_2[SortedIndices_2[1]] = PointerOps[IndicesInAllPointerOps_2[1]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[1]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[1]],
  //
  // PointerOps_0[SortedIndices_0[2]] = PointerOps[IndicesInAllPointerOps_0[2]],
  // PointerOps_1[SortedIndices_1[2]] = PointerOps[IndicesInAllPointerOps_1[2]],
  // PointerOps_2[SortedIndices_2[2]] = PointerOps[IndicesInAllPointerOps_2[2]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[2]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[2]],
  // ...
  // ...
  // ...
  // PointerOps_0[SortedIndices_0[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_0[VecSz - 1]],
  // PointerOps_1[SortedIndices_1[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_1[VecSz - 1]],
  // PointerOps_2[SortedIndices_2[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_2[VecSz - 1]],
  // ...
  // PointerOps_(NumOffsets - 1)[SortedIndices_(NumOffsets - 1)[VecSz - 1]] =
  //   PointerOps[IndicesInAllPointerOps_(NumOffsets - 1)[VecSz - 1]],
  // ```
  // In order to be able to generate a strided load, we need the following
  // checks to pass:
  //
  // (1) for each `PointerOps_j` check that the distance
  // between adjacent pointers are all equal to the same value (stride).
  // (2) for each `PointerOps_j` check that coefficients calculated by
  // `calculateRtStride` are all the same.
  //
  // As we do that, also calculate SortedIndices. Since we should not modify
  // `SortedIndices` unless we know that all the checks succeed, record the
  // indices into `SortedIndicesDraft`.
  SmallVector<unsigned> SortedIndicesDraft(Sz);

  // Given sorted indices for a particular offset (as calculated by
  // calculateRtStride), update the `SortedIndicesDraft` for all of PointerOps.
  // Let `Offset` be `SortedOffsetsV[OffsetNum]`.
  // \param `OffsetNum` the index of `Offset` in `SortedOffsetsV`.
  // \param `IndicesInAllPointerOps` vector of indices of the
  // subsequence `PointerOps_OffsetNum` in `PointerOps`, i.e. using the above
  // notation `IndicesInAllPointerOps = IndicesInAllPointerOps_OffsetNum`.
  // \param `SortedIndicesForOffset = SortedIndices_OffsetNum`
  auto UpdateSortedIndices =
      [&](SmallVectorImpl<unsigned> &SortedIndicesForOffset,
          ArrayRef<unsigned> IndicesInAllPointerOps, const int64_t OffsetNum) {
        if (SortedIndicesForOffset.empty()) {
          SortedIndicesForOffset.resize(N: IndicesInAllPointerOps.size());
          std::iota(first: SortedIndicesForOffset.begin(),
                    last: SortedIndicesForOffset.end(), value: 0);
        }
        for (const auto [Num, Idx] : enumerate(First&: SortedIndicesForOffset)) {
          SortedIndicesDraft[Num * NumOffsets + OffsetNum] =
              IndicesInAllPointerOps[Idx];
        }
      };

  int64_t LowestOffset = SortedOffsetsV[0];
  ArrayRef<Value *> PointerOps0 = OffsetToPointerOpIdxMap[LowestOffset].first;

  // The first group establishes the reference stride and coefficients that
  // all remaining groups must match.
  SmallVector<int64_t> Coeffs0(VecSz);
  SmallVector<unsigned> SortedIndicesForOffset0;
  const SCEV *Stride0 = calculateRtStride(PointerOps: PointerOps0, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
                                          SortedIndices&: SortedIndicesForOffset0, Coeffs&: Coeffs0);
  if (!Stride0)
    return false;
  unsigned NumCoeffs0 = Coeffs0.size();
  if (NumCoeffs0 * NumOffsets != Sz)
    return false;
  sort(C&: Coeffs0);

  ArrayRef<unsigned> IndicesInAllPointerOps0 =
      OffsetToPointerOpIdxMap[LowestOffset].second;
  UpdateSortedIndices(SortedIndicesForOffset0, IndicesInAllPointerOps0, 0);

  // Now that we know what the common stride and coefficients has to be check
  // the remaining `PointerOps_j`.
  SmallVector<int64_t> Coeffs;
  SmallVector<unsigned> SortedIndicesForOffset;
  for (int J : seq<int>(Begin: 1, End: NumOffsets)) {
    Coeffs.clear();
    Coeffs.resize(N: VecSz);
    SortedIndicesForOffset.clear();

    int64_t Offset = SortedOffsetsV[J];
    ArrayRef<Value *> PointerOpsForOffset =
        OffsetToPointerOpIdxMap[Offset].first;
    ArrayRef<unsigned> IndicesInAllPointerOps =
        OffsetToPointerOpIdxMap[Offset].second;
    const SCEV *StrideWithinGroup =
        calculateRtStride(PointerOps: PointerOpsForOffset, ElemTy: ScalarTy, DL: *DL, SE&: *SE,
                          SortedIndices&: SortedIndicesForOffset, Coeffs);

    if (!StrideWithinGroup || StrideWithinGroup != Stride0)
      return false;
    if (Coeffs.size() != NumCoeffs0)
      return false;
    sort(C&: Coeffs);
    if (Coeffs != Coeffs0)
      return false;

    UpdateSortedIndices(SortedIndicesForOffset, IndicesInAllPointerOps, J);
  }

  // All checks passed: commit the draft order and the strided-load info.
  SortedIndices.clear();
  SortedIndices = std::move(SortedIndicesDraft);
  SPtrInfo.StrideSCEV = Stride0;
  SPtrInfo.Ty = StridedLoadTy;
  return true;
}
7429
/// Decides how the loads in \p VL can be vectorized: as a plain consecutive
/// vector load, a (masked) load + compress, a strided load, a masked gather
/// (scatter-vectorize), or not at all (gather of scalars). \p Order and
/// \p PointerOps receive the sorted order and the pointer operands; when
/// \p BestVF is non-null it receives the best profitable sub-VF found by the
/// recursive shuffled-loads check.
BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
    ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo,
    unsigned *BestVF, bool TryRecursiveCheck) const {
  // Check that a vectorized load would load the same memory as a scalar
  // load. For example, we don't want to vectorize loads that are smaller
  // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
  // treats loading/storing it as an i8 struct. If we vectorize loads/stores
  // from such a struct, we read/write packed bits disagreeing with the
  // unvectorized version.
  if (BestVF)
    *BestVF = 0;
  if (areKnownNonVectorizableLoads(VL))
    return LoadsState::Gather;
  Type *ScalarTy = VL0->getType();

  if (DL->getTypeSizeInBits(Ty: ScalarTy) != DL->getTypeAllocSizeInBits(Ty: ScalarTy))
    return LoadsState::Gather;

  // Make sure all loads in the bundle are simple - we can't vectorize
  // atomic or volatile loads.
  PointerOps.clear();
  const size_t Sz = VL.size();
  PointerOps.resize(N: Sz);
  auto *POIter = PointerOps.begin();
  for (Value *V : VL) {
    auto *L = dyn_cast<LoadInst>(Val: V);
    if (!L || !L->isSimple())
      return LoadsState::Gather;
    *POIter = L->getPointerOperand();
    ++POIter;
  }

  Order.clear();
  // Check the order of pointer operands or that all pointers are the same.
  bool IsSorted = sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order);

  auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
  Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
  if (!IsSorted) {
    // Pointers cannot be sorted by constant offset: try a run-time strided
    // load first, then fall through to masked-gather checks.
    if (analyzeRtStrideCandidate(PointerOps, ScalarTy, CommonAlignment, SortedIndices&: Order,
                                 SPtrInfo))
      return LoadsState::StridedVectorize;

    if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
        TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
      return LoadsState::Gather;

    if (!all_of(Range&: PointerOps, P: [&](Value *P) {
          return arePointersCompatible(Ptr1: P, Ptr2: PointerOps.front(), TLI: *TLI);
        }))
      return LoadsState::Gather;

  } else {
    Value *Ptr0;
    Value *PtrN;
    if (Order.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[Order.front()];
      PtrN = PointerOps[Order.back()];
    }
    // sortPtrAccesses validates getPointersDiff for all pointers relative to
    // PointerOps[0], so compute the span using PointerOps[0] as intermediate:
    // Diff = offset(PtrN) - offset(Ptr0) relative to PointerOps[0]
    std::optional<int64_t> Diff0 =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: Ptr0, DL: *DL, SE&: *SE);
    std::optional<int64_t> DiffN =
        getPointersDiff(ElemTyA: ScalarTy, PtrA: PointerOps[0], ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
    assert(Diff0 && DiffN &&
           "sortPtrAccesses should have validated these pointers");
    int64_t Diff = *DiffN - *Diff0;
    // Check that the sorted loads are consecutive.
    if (static_cast<uint64_t>(Diff) == Sz - 1)
      return LoadsState::Vectorize;
    if (isMaskedLoadCompress(VL, PointerOps, Order, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
                             TLI: *TLI, AreAllUsersVectorized: [&](Value *V) {
                               return areAllUsersVectorized(
                                   I: cast<Instruction>(Val: V), VectorizedVals: UserIgnoreList);
                             }))
      return LoadsState::CompressVectorize;
    Align Alignment =
        cast<LoadInst>(Val: Order.empty() ? VL.front() : VL[Order.front()])
            ->getAlign();
    if (analyzeConstantStrideCandidate(PointerOps, ScalarTy, Alignment, SortedIndices: Order,
                                       Diff, Ptr0, PtrN, SPtrInfo))
      return LoadsState::StridedVectorize;
  }
  if (!TTI->isLegalMaskedGather(DataType: VecTy, Alignment: CommonAlignment) ||
      TTI->forceScalarizeMaskedGather(Type: VecTy, Alignment: CommonAlignment))
    return LoadsState::Gather;
  // Compare the cost of loads + shuffles against strided/masked gather
  // loads. Returns true if the vectorized + shuffles representation is
  // better than just gather.
  auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
                                                unsigned *BestVF,
                                                bool ProfitableGatherPointers) {
    if (BestVF)
      *BestVF = 0;
    // Compare masked gather cost and loads + insert subvector costs.
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    auto [ScalarGEPCost, VectorGEPCost] =
        getGEPCosts(TTI, Ptrs: PointerOps, BasePtr: PointerOps.front(),
                    Opcode: Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
    // Estimate the cost of masked gather GEP. If not a splat, roughly
    // estimate as a buildvector, otherwise estimate as splat.
    APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
    Type *PtrScalarTy = PointerOps.front()->getType()->getScalarType();
    VectorType *PtrVecTy = getWidenedType(ScalarTy: PtrScalarTy, VF: Sz);
    if (static_cast<unsigned>(count_if(
            Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
        any_of(Range&: PointerOps, P: [&](Value *V) {
          return getUnderlyingObject(V) !=
                 getUnderlyingObject(V: PointerOps.front());
        }))
      VectorGEPCost += getScalarizationOverhead(TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy,
                                                DemandedElts, /*Insert=*/true,
                                                /*Extract=*/false, CostKind);
    else
      VectorGEPCost +=
          getScalarizationOverhead(
              TTI, ScalarTy: PtrScalarTy, Ty: PtrVecTy, DemandedElts: APInt::getOneBitSet(numBits: Sz, BitNo: 0),
              /*Insert=*/true, /*Extract=*/false, CostKind) +
          ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: PtrVecTy, Mask: {}, CostKind);
    // The cost of scalar loads.
    InstructionCost ScalarLoadsCost =
        std::accumulate(first: VL.begin(), last: VL.end(), init: InstructionCost(),
                        binary_op: [&](InstructionCost C, Value *V) {
                          return C + TTI.getInstructionCost(
                                         U: cast<Instruction>(Val: V), CostKind);
                        }) +
        ScalarGEPCost;
    // The cost of masked gather.
    InstructionCost MaskedGatherCost =
        TTI.getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                        cast<LoadInst>(Val: VL0)->getPointerOperand(),
                                        /*VariableMask=*/false, CommonAlignment),
            CostKind) +
        (ProfitableGatherPointers ? 0 : VectorGEPCost);
    InstructionCost GatherCost =
        getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
                                 /*Insert=*/true,
                                 /*Extract=*/false, CostKind) +
        ScalarLoadsCost;
    // The list of loads is small or perform partial check already - directly
    // compare masked gather cost and gather cost.
    constexpr unsigned ListLimit = 4;
    if (!TryRecursiveCheck || VL.size() < ListLimit)
      return MaskedGatherCost - GatherCost >= -SLPCostThreshold;

    // FIXME: The following code has not been updated for non-power-of-2
    // vectors (and not whole registers). The splitting logic here does not
    // cover the original vector if the vector factor is not a power of two.
    if (!hasFullVectorsOrPowerOf2(TTI, Ty: ScalarTy, Sz: VL.size()))
      return false;

    // NOTE: shadows the outer Sz; here Sz is the scalar width in bits.
    unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
    unsigned MinVF = getMinVF(Sz: 2 * Sz);
    DemandedElts.clearAllBits();
    // Iterate through possible vectorization factors and check if vectorized +
    // shuffles is better than just gather.
    for (unsigned VF =
             getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VL.size() - 1);
         VF >= MinVF;
         VF = getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: VF - 1)) {
      SmallVector<LoadsState> States;
      for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
        ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
        SmallVector<unsigned> Order;
        SmallVector<Value *> PointerOps;
        LoadsState LS = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
                                          PointerOps, SPtrInfo, BestVF,
                                          /*TryRecursiveCheck=*/false);
        // Check that the sorted loads are consecutive.
        if (LS == LoadsState::Gather) {
          if (BestVF) {
            DemandedElts.setAllBits();
            break;
          }
          DemandedElts.setBits(loBit: Cnt, hiBit: Cnt + VF);
          continue;
        }
        // If need the reorder - consider as high-cost masked gather for now.
        if ((LS == LoadsState::Vectorize ||
             LS == LoadsState::StridedVectorize ||
             LS == LoadsState::CompressVectorize) &&
            !Order.empty() && !isReverseOrder(Order))
          LS = LoadsState::ScatterVectorize;
        States.push_back(Elt: LS);
      }
      if (DemandedElts.isAllOnes())
        // All loads gathered - try smaller VF.
        continue;
      // Can be vectorized later as a series of loads/insertelements.
      InstructionCost VecLdCost = 0;
      if (!DemandedElts.isZero()) {
        VecLdCost = getScalarizationOverhead(TTI, ScalarTy, Ty: VecTy, DemandedElts,
                                             /*Insert=*/true,
                                             /*Extract=*/false, CostKind) +
                    ScalarGEPCost;
        for (unsigned Idx : seq<unsigned>(Size: VL.size()))
          if (DemandedElts[Idx])
            VecLdCost +=
                TTI.getInstructionCost(U: cast<Instruction>(Val: VL[Idx]), CostKind);
      }
      auto *SubVecTy = getWidenedType(ScalarTy, VF);
      for (auto [I, LS] : enumerate(First&: States)) {
        auto *LI0 = cast<LoadInst>(Val: VL[I * VF]);
        InstructionCost VectorGEPCost =
            (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
                ? 0
                : getGEPCosts(TTI, Ptrs: ArrayRef(PointerOps).slice(N: I * VF, M: VF),
                              BasePtr: LI0->getPointerOperand(),
                              Opcode: Instruction::GetElementPtr, CostKind, ScalarTy,
                              VecTy: SubVecTy)
                      .second;
        if (LS == LoadsState::ScatterVectorize) {
          if (static_cast<unsigned>(
                  count_if(Range&: PointerOps, P: IsaPred<GetElementPtrInst>)) <
                  PointerOps.size() - 1 ||
              any_of(Range&: PointerOps, P: [&](Value *V) {
                return getUnderlyingObject(V) !=
                       getUnderlyingObject(V: PointerOps.front());
              }))
            VectorGEPCost += getScalarizationOverhead(
                TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getAllOnes(numBits: VF),
                /*Insert=*/true, /*Extract=*/false, CostKind);
          else
            VectorGEPCost +=
                getScalarizationOverhead(
                    TTI, ScalarTy, Ty: SubVecTy, DemandedElts: APInt::getOneBitSet(numBits: VF, BitNo: 0),
                    /*Insert=*/true, /*Extract=*/false, CostKind) +
                ::getShuffleCost(TTI, Kind: TTI::SK_Broadcast, Tp: SubVecTy, Mask: {},
                                 CostKind);
        }
        switch (LS) {
        case LoadsState::Vectorize:
          VecLdCost +=
              TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SubVecTy, Alignment: LI0->getAlign(),
                                  AddressSpace: LI0->getPointerAddressSpace(), CostKind,
                                  OpdInfo: TTI::OperandValueInfo()) +
              VectorGEPCost;
          break;
        case LoadsState::StridedVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MICA: MemIntrinsicCostAttributes(
                               Intrinsic::experimental_vp_strided_load,
                               SubVecTy, LI0->getPointerOperand(),
                               /*VariableMask=*/false, CommonAlignment),
                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::CompressVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MICA: MemIntrinsicCostAttributes(
                               Intrinsic::masked_load, SubVecTy,
                               CommonAlignment, LI0->getPointerAddressSpace()),
                           CostKind) +
                       ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SubVecTy,
                                        Mask: {}, CostKind);
          break;
        case LoadsState::ScatterVectorize:
          VecLdCost += TTI.getMemIntrinsicInstrCost(
                           MICA: MemIntrinsicCostAttributes(
                               Intrinsic::masked_gather, SubVecTy,
                               LI0->getPointerOperand(),
                               /*VariableMask=*/false, CommonAlignment),
                           CostKind) +
                       VectorGEPCost;
          break;
        case LoadsState::Gather:
          // Gathers are already calculated - ignore.
          continue;
        }
        SmallVector<int> ShuffleMask(VL.size());
        for (int Idx : seq<int>(Begin: 0, End: VL.size()))
          ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
        if (I > 0)
          VecLdCost +=
              ::getShuffleCost(TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: ShuffleMask,
                               CostKind, Index: I * VF, SubTp: SubVecTy);
      }
      // If masked gather cost is higher - better to vectorize, so
      // consider it as a gather node. It will be better estimated
      // later.
      if (MaskedGatherCost >= VecLdCost &&
          VecLdCost - GatherCost < -SLPCostThreshold) {
        if (BestVF)
          *BestVF = VF;
        return true;
      }
    }
    return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
  };
  // TODO: need to improve analysis of the pointers, if not all of them are
  // GEPs or have > 2 operands, we end up with a gather node, which just
  // increases the cost.
  Loop *L = LI->getLoopFor(BB: cast<LoadInst>(Val: VL0)->getParent());
  bool ProfitableGatherPointers =
      L && Sz > 2 && static_cast<unsigned>(count_if(Range&: PointerOps, P: [L](Value *V) {
                       return L->isLoopInvariant(V);
                     })) <= Sz / 2;
  if (ProfitableGatherPointers || all_of(Range&: PointerOps, P: [](Value *P) {
        auto *GEP = dyn_cast<GetElementPtrInst>(Val: P);
        return (!GEP && doesNotNeedToBeScheduled(V: P)) ||
               (GEP && GEP->getNumOperands() == 2 &&
                isa<Constant, Instruction>(Val: GEP->getOperand(i_nocapture: 1)));
      })) {
    // Check if potential masked gather can be represented as series
    // of loads + insertsubvectors.
    // If masked gather cost is higher - better to vectorize, so
    // consider it as a gather node. It will be better estimated
    // later.
    if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
                                                     ProfitableGatherPointers))
      return LoadsState::ScatterVectorize;
  }

  return LoadsState::Gather;
}
7752
/// Tries to cluster the pointer operands \p VL (with their matching parent
/// blocks \p BBs) by (block, underlying base object) and constant offset, and
/// fills \p SortedIndices with an order that places related pointers next to
/// one another. Returns false when no profitable clustering was found.
static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
                                   ArrayRef<BasicBlock *> BBs, Type *ElemTy,
                                   const DataLayout &DL, ScalarEvolution &SE,
                                   SmallVectorImpl<unsigned> &SortedIndices) {
  assert(
      all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
      "Expected list of pointer operands.");
  // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
  // Ptr into, sort and return the sorted indices with values next to one
  // another.
  SmallMapVector<
      std::pair<BasicBlock *, Value *>,
      SmallVector<SmallVector<std::tuple<Value *, int64_t, unsigned>>>, 8>
      Bases;
  // Seed the map with the first pointer as its own cluster head.
  Bases
      .try_emplace(Key: std::make_pair(
          x: BBs.front(), y: getUnderlyingObject(V: VL.front(), MaxLookup: RecursionMaxDepth)))
      .first->second.emplace_back().emplace_back(Args: VL.front(), Args: 0U, Args: 0U);

  SortedIndices.clear();
  for (auto [Cnt, Ptr] : enumerate(First: VL.drop_front())) {
    auto Key = std::make_pair(x: BBs[Cnt + 1],
                              y: getUnderlyingObject(V: Ptr, MaxLookup: RecursionMaxDepth));
    // Try to join an existing cluster for this (block, base) key; this
    // succeeds when the pointer is at a constant distance from the cluster's
    // head.
    bool Found = any_of(Range&: Bases.try_emplace(Key).first->second,
                        P: [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
                          std::optional<int64_t> Diff =
                              getPointersDiff(ElemTy, std::get<0>(Base.front()),
                                              ElemTy, Ptr, DL, SE,
                                              /*StrictCheck=*/true);
                          if (!Diff)
                            return false;

                          Base.emplace_back(Ptr, *Diff, Cnt + 1);
                          return true;
                        });

    if (!Found) {
      // If we haven't found enough to usefully cluster, return early.
      if (Bases.size() > VL.size() / 2 - 1)
        return false;

      // Not found already - add a new Base
      Bases.find(Key)->second.emplace_back().emplace_back(Args: Ptr, Args: 0, Args: Cnt + 1);
    }
  }

  // Every pointer in its own cluster - nothing to gain from sorting.
  if (Bases.size() == VL.size())
    return false;

  if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
                            Bases.front().second.size() == VL.size()))
    return false;

  // For each of the bases sort the pointers by Offset and check if any of the
  // base become consecutively allocated.
  auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
    SmallPtrSet<Value *, 13> FirstPointers;
    SmallPtrSet<Value *, 13> SecondPointers;
    Value *P1 = Ptr1;
    Value *P2 = Ptr2;
    unsigned Depth = 0;
    // Walk both underlying-object chains in lock-step until one chain reaches
    // a value already visited by the other.
    while (!FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1)) {
      if (P1 == P2 || Depth > RecursionMaxDepth)
        return false;
      FirstPointers.insert(Ptr: P1);
      SecondPointers.insert(Ptr: P2);
      P1 = getUnderlyingObject(V: P1, /*MaxLookup=*/1);
      P2 = getUnderlyingObject(V: P2, /*MaxLookup=*/1);
      ++Depth;
    }
    assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
           "Unable to find matching root.");
    return FirstPointers.contains(Ptr: P2) && !SecondPointers.contains(Ptr: P1);
  };
  for (auto &Base : Bases) {
    for (auto &Vec : Base.second) {
      if (Vec.size() > 1) {
        // Sort the cluster by offset and require the offsets to form one
        // contiguous run starting at the smallest offset.
        stable_sort(Range&: Vec, C: llvm::less_second());
        int64_t InitialOffset = std::get<1>(t&: Vec[0]);
        bool AnyConsecutive =
            all_of(Range: enumerate(First&: Vec), P: [InitialOffset](const auto &P) {
              return std::get<1>(P.value()) ==
                     int64_t(P.index()) + InitialOffset;
            });
        // Fill SortedIndices array only if it looks worth-while to sort the
        // ptrs.
        if (!AnyConsecutive)
          return false;
      }
    }
    stable_sort(Range&: Base.second, C: [&](const auto &V1, const auto &V2) {
      return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
    });
  }

  // Emit the final order: clusters back-to-back, original indices inside.
  for (auto &T : Bases)
    for (const auto &Vec : T.second)
      for (const auto &P : Vec)
        SortedIndices.push_back(Elt: std::get<2>(t: P));

  assert(SortedIndices.size() == VL.size() &&
         "Expected SortedIndices to be the size of VL");
  return true;
}
7857
7858std::optional<BoUpSLP::OrdersType>
7859BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
7860 assert(TE.isGather() && "Expected gather node only.");
7861 Type *ScalarTy = TE.Scalars[0]->getType();
7862
7863 SmallVector<Value *> Ptrs;
7864 Ptrs.reserve(N: TE.Scalars.size());
7865 SmallVector<BasicBlock *> BBs;
7866 BBs.reserve(N: TE.Scalars.size());
7867 for (Value *V : TE.Scalars) {
7868 auto *L = dyn_cast<LoadInst>(Val: V);
7869 if (!L || !L->isSimple())
7870 return std::nullopt;
7871 Ptrs.push_back(Elt: L->getPointerOperand());
7872 BBs.push_back(Elt: L->getParent());
7873 }
7874
7875 BoUpSLP::OrdersType Order;
7876 if (!LoadEntriesToVectorize.contains(key: TE.Idx) &&
7877 clusterSortPtrAccesses(VL: Ptrs, BBs, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: Order))
7878 return std::move(Order);
7879 return std::nullopt;
7880}
7881
7882/// Check if two insertelement instructions are from the same buildvector.
/// Check if two insertelement instructions are from the same buildvector.
/// \param VU First insertelement instruction.
/// \param V Second insertelement instruction.
/// \param GetBaseOperand Returns the "vector" operand used to walk up an
/// insertelement chain (lets callers look through intermediate values).
static bool areTwoInsertFromSameBuildVector(
    InsertElementInst *VU, InsertElementInst *V,
    function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
  // Instructions must be from the same basic blocks.
  if (VU->getParent() != V->getParent())
    return false;
  // Checks if 2 insertelements are from the same buildvector.
  if (VU->getType() != V->getType())
    return false;
  // Multiple used inserts are separate nodes.
  if (!VU->hasOneUse() && !V->hasOneUse())
    return false;
  auto *IE1 = VU;
  auto *IE2 = V;
  std::optional<unsigned> Idx1 = getElementIndex(Inst: IE1);
  std::optional<unsigned> Idx2 = getElementIndex(Inst: IE2);
  if (Idx1 == std::nullopt || Idx2 == std::nullopt)
    return false;
  // Go through the vector operand of insertelement instructions trying to find
  // either VU as the original vector for IE2 or V as the original vector for
  // IE1.
  // ReusedIdx tracks which element indices were already seen along the walk;
  // hitting the same index twice aborts the search.
  SmallBitVector ReusedIdx(
      cast<VectorType>(Val: VU->getType())->getElementCount().getKnownMinValue());
  bool IsReusedIdx = false;
  do {
    // V's chain of vector operands reached VU (and VU's own chain has
    // stopped): both belong to one buildvector, provided VU is single-used.
    if (IE2 == VU && !IE1)
      return VU->hasOneUse();
    if (IE1 == V && !IE2)
      return V->hasOneUse();
    if (IE1 && IE1 != V) {
      // NOTE: this Idx1 intentionally shadows the outer Idx1; missing indices
      // fall back to the other insert's index.
      unsigned Idx1 = getElementIndex(Inst: IE1).value_or(u&: *Idx2);
      IsReusedIdx |= ReusedIdx.test(Idx: Idx1);
      ReusedIdx.set(Idx1);
      // Stop walking this chain on a multi-use insert or a repeated index.
      if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
        IE1 = nullptr;
      else
        IE1 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE1));
    }
    if (IE2 && IE2 != VU) {
      // Same shadowing pattern as above, for the second chain.
      unsigned Idx2 = getElementIndex(Inst: IE2).value_or(u&: *Idx1);
      IsReusedIdx |= ReusedIdx.test(Idx: Idx2);
      ReusedIdx.set(Idx2);
      if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
        IE2 = nullptr;
      else
        IE2 = dyn_cast_or_null<InsertElementInst>(Val: GetBaseOperand(IE2));
    }
  } while (!IsReusedIdx && (IE1 || IE2));
  return false;
}
7933
7934/// Checks if the specified instruction \p I is an alternate operation for
7935/// the given \p MainOp and \p AltOp instructions.
7936static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
7937 Instruction *AltOp,
7938 const TargetLibraryInfo &TLI);
7939
/// Computes the preferred scalar order for the tree entry \p TE, if any.
/// \p TopToBottom is true during the top-down reordering stage;
/// \p IgnoreReorder indicates the root node order itself may be adjusted.
/// Returns std::nullopt when no (profitable) order was found.
std::optional<BoUpSLP::OrdersType>
BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom,
                           bool IgnoreReorder) {
  // No need to reorder if need to shuffle reuses, still need to shuffle the
  // node.
  if (!TE.ReuseShuffleIndices.empty()) {
    // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
    assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
           "Reshuffling scalars not yet supported for nodes with padding");

    if (isSplat(VL: TE.Scalars))
      return std::nullopt;
    // Check if reuse shuffle indices can be improved by reordering.
    // For this, check that reuse mask is "clustered", i.e. each scalar values
    // is used once in each submask of size <number_of_scalars>.
    // Example: 4 scalar values.
    // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
    //                           0, 1, 2, 3, 3, 3, 1, 0 - not clustered,
    //                           because element 3 is used twice in the second
    //                           submask.
    unsigned Sz = TE.Scalars.size();
    if (TE.isGather()) {
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) {
        SmallVector<int> Mask;
        fixupOrderingIndices(Order: *CurrentOrder);
        inversePermutation(Indices: *CurrentOrder, Mask);
        ::addMask(Mask, SubMask: TE.ReuseShuffleIndices);
        OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
        unsigned Sz = TE.Scalars.size();
        // Combine the found order with the reuse mask, submask by submask.
        for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
          for (auto [I, Idx] : enumerate(First: ArrayRef(Mask).slice(N: K * Sz, M: Sz)))
            if (Idx != PoisonMaskElem)
              Res[Idx + K * Sz] = I + K * Sz;
        }
        return std::move(Res);
      }
    }
    if (Sz == 2 && TE.getVectorFactor() == 4 &&
        ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: TE.Scalars.front()->getType(),
                                                VF: 2 * TE.getVectorFactor())) == 1)
      return std::nullopt;
    if (TE.ReuseShuffleIndices.size() % Sz != 0)
      return std::nullopt;
    if (!ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
                                                     VF: Sz)) {
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
      else
        inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
      ::addMask(Mask&: ReorderMask, SubMask: TE.ReuseShuffleIndices);
      unsigned VF = ReorderMask.size();
      OrdersType ResOrder(VF, VF);
      unsigned NumParts = divideCeil(Numerator: VF, Denominator: Sz);
      SmallBitVector UsedVals(NumParts);
      // Require every submask of size Sz to reference a single distinct value
      // (modulo poison) and build the per-part order from those values.
      for (unsigned I = 0; I < VF; I += Sz) {
        int Val = PoisonMaskElem;
        unsigned UndefCnt = 0;
        unsigned Limit = std::min(a: Sz, b: VF - I);
        if (any_of(Range: ArrayRef(ReorderMask).slice(N: I, M: Limit),
                   P: [&](int Idx) {
                     if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
                       Val = Idx;
                     if (Idx == PoisonMaskElem)
                       ++UndefCnt;
                     return Idx != PoisonMaskElem && Idx != Val;
                   }) ||
            Val >= static_cast<int>(NumParts) || UsedVals.test(Idx: Val) ||
            UndefCnt > Sz / 2)
          return std::nullopt;
        UsedVals.set(Val);
        for (unsigned K = 0; K < NumParts; ++K) {
          unsigned Idx = Val + Sz * K;
          if (Idx < VF && I + K < VF)
            ResOrder[Idx] = I + K;
        }
      }
      return std::move(ResOrder);
    }
    unsigned VF = TE.getVectorFactor();
    // Try build correct order for extractelement instructions.
    SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
                                TE.ReuseShuffleIndices.end());
    if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
        all_of(Range: TE.Scalars, P: [Sz](Value *V) {
          if (isa<PoisonValue>(Val: V))
            return true;
          std::optional<unsigned> Idx = getExtractIndex(E: cast<Instruction>(Val: V));
          return Idx && *Idx < Sz;
        })) {
      assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
                                   "by BinaryOperator and CastInst.");
      SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
      if (TE.ReorderIndices.empty())
        std::iota(first: ReorderMask.begin(), last: ReorderMask.end(), value: 0);
      else
        inversePermutation(Indices: TE.ReorderIndices, Mask&: ReorderMask);
      // Remap each reused lane to the position of its extract index.
      for (unsigned I = 0; I < VF; ++I) {
        int &Idx = ReusedMask[I];
        if (Idx == PoisonMaskElem)
          continue;
        Value *V = TE.Scalars[ReorderMask[Idx]];
        std::optional<unsigned> EI = getExtractIndex(E: cast<Instruction>(Val: V));
        Idx = std::distance(first: ReorderMask.begin(), last: find(Range&: ReorderMask, Val: *EI));
      }
    }
    // Build the order of the VF size, need to reorder reuses shuffles, they are
    // always of VF size.
    OrdersType ResOrder(VF);
    std::iota(first: ResOrder.begin(), last: ResOrder.end(), value: 0);
    auto *It = ResOrder.begin();
    for (unsigned K = 0; K < VF; K += Sz) {
      OrdersType CurrentOrder(TE.ReorderIndices);
      SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(N: K, M: Sz)};
      if (SubMask.front() == PoisonMaskElem)
        std::iota(first: SubMask.begin(), last: SubMask.end(), value: 0);
      reorderOrder(Order&: CurrentOrder, Mask: SubMask);
      transform(Range&: CurrentOrder, d_first: It, F: [K](unsigned Pos) { return Pos + K; });
      std::advance(i&: It, n: Sz);
    }
    if (TE.isGather() && all_of(Range: enumerate(First&: ResOrder), P: [](const auto &Data) {
          return Data.index() == Data.value();
        }))
      return std::nullopt; // No need to reorder.
    return std::move(ResOrder);
  }
  // Bottom-up stage: keep the (possibly reversed) order of strided loads as-is
  // unless the user node is a binary operation.
  if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
      (!TE.UserTreeIndex || !TE.UserTreeIndex.UserTE->hasState() ||
       !Instruction::isBinaryOp(Opcode: TE.UserTreeIndex.UserTE->getOpcode())) &&
      (TE.ReorderIndices.empty() || isReverseOrder(Order: TE.ReorderIndices)))
    return std::nullopt;
  if (TE.State == TreeEntry::SplitVectorize ||
      ((TE.State == TreeEntry::Vectorize ||
        TE.State == TreeEntry::StridedVectorize ||
        TE.State == TreeEntry::CompressVectorize) &&
       (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: TE.getMainOp()) ||
        (TopToBottom && isa<StoreInst, InsertElementInst>(Val: TE.getMainOp()))))) {
    assert((TE.State == TreeEntry::SplitVectorize || !TE.isAltShuffle()) &&
           "Alternate instructions are only supported by "
           "BinaryOperator and CastInst.");
    return TE.ReorderIndices;
  }
  // For an alternate-opcode root, derive the order from the alt-op shuffle
  // mask.
  if (!TopToBottom && IgnoreReorder && TE.State == TreeEntry::Vectorize &&
      TE.isAltShuffle()) {
    assert(TE.ReuseShuffleIndices.empty() &&
           "ReuseShuffleIndices should be "
           "empty for alternate instructions.");
    SmallVector<int> Mask;
    TE.buildAltOpShuffleMask(
        IsAltOp: [&](Instruction *I) {
          assert(TE.getMatchingMainOpOrAltOp(I) &&
                 "Unexpected main/alternate opcode");
          return isAlternateInstruction(I, MainOp: TE.getMainOp(), AltOp: TE.getAltOp(), TLI: *TLI);
        },
        Mask);
    const int VF = TE.getVectorFactor();
    OrdersType ResOrder(VF, VF);
    for (unsigned I : seq<unsigned>(Size: VF)) {
      if (Mask[I] == PoisonMaskElem)
        continue;
      ResOrder[Mask[I] % VF] = I;
    }
    return std::move(ResOrder);
  }
  if (!TE.ReorderIndices.empty())
    return TE.ReorderIndices;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
    if (!TE.ReorderIndices.empty())
      return TE.ReorderIndices;

    // For each PHI find the head of its buildvector chain: the first
    // insertelement of a single-use chain within one block, if any.
    SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
    for (auto [I, V] : zip(t&: UserBVHead, u: TE.Scalars)) {
      if (isa<Constant>(Val: V) || !V->hasNUsesOrMore(N: 1))
        continue;
      auto *II = dyn_cast<InsertElementInst>(Val: *V->user_begin());
      if (!II)
        continue;
      Instruction *BVHead = nullptr;
      BasicBlock *BB = II->getParent();
      while (II && II->hasOneUse() && II->getParent() == BB) {
        BVHead = II;
        II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
      }
      I = BVHead;
    }

    auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
      assert(BB1 != BB2 && "Expected different basic blocks.");
      if (!DT->isReachableFromEntry(A: BB1))
        return false;
      if (!DT->isReachableFromEntry(A: BB2))
        return true;
      auto *NodeA = DT->getNode(BB: BB1);
      auto *NodeB = DT->getNode(BB: BB2);
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
    };
    // Order PHIs by their first users: poison first, then by number of uses,
    // then by the kind/position of the first user.
    auto PHICompare = [&](unsigned I1, unsigned I2) {
      Value *V1 = TE.Scalars[I1];
      Value *V2 = TE.Scalars[I2];
      if (V1 == V2 || (V1->use_empty() && V2->use_empty()))
        return false;
      if (isa<PoisonValue>(Val: V1))
        return true;
      if (isa<PoisonValue>(Val: V2))
        return false;
      if (V1->getNumUses() < V2->getNumUses())
        return true;
      if (V1->getNumUses() > V2->getNumUses())
        return false;
      auto *FirstUserOfPhi1 = cast<Instruction>(Val: *V1->user_begin());
      auto *FirstUserOfPhi2 = cast<Instruction>(Val: *V2->user_begin());
      if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
        return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
                                    FirstUserOfPhi2->getParent());
      auto *IE1 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi1);
      auto *IE2 = dyn_cast<InsertElementInst>(Val: FirstUserOfPhi2);
      auto *EE1 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi1);
      auto *EE2 = dyn_cast<ExtractElementInst>(Val: FirstUserOfPhi2);
      if (IE1 && !IE2)
        return true;
      if (!IE1 && IE2)
        return false;
      if (IE1 && IE2) {
        if (UserBVHead[I1] && !UserBVHead[I2])
          return true;
        if (!UserBVHead[I1])
          return false;
        if (UserBVHead[I1] == UserBVHead[I2])
          return getElementIndex(Inst: IE1) < getElementIndex(Inst: IE2);
        if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
          return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
                                      UserBVHead[I2]->getParent());
        return UserBVHead[I1]->comesBefore(Other: UserBVHead[I2]);
      }
      if (EE1 && !EE2)
        return true;
      if (!EE1 && EE2)
        return false;
      if (EE1 && EE2) {
        auto *Inst1 = dyn_cast<Instruction>(Val: EE1->getOperand(i_nocapture: 0));
        auto *Inst2 = dyn_cast<Instruction>(Val: EE2->getOperand(i_nocapture: 0));
        auto *P1 = dyn_cast<Argument>(Val: EE1->getOperand(i_nocapture: 0));
        auto *P2 = dyn_cast<Argument>(Val: EE2->getOperand(i_nocapture: 0));
        if (!Inst2 && !P2)
          return Inst1 || P1;
        if (EE1->getOperand(i_nocapture: 0) == EE2->getOperand(i_nocapture: 0))
          return getElementIndex(Inst: EE1) < getElementIndex(Inst: EE2);
        if (!Inst1 && Inst2)
          return false;
        if (Inst1 && Inst2) {
          if (Inst1->getParent() != Inst2->getParent())
            return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
          return Inst1->comesBefore(Other: Inst2);
        }
        if (!P1 && P2)
          return false;
        assert(P1 && P2 &&
               "Expected either instructions or arguments vector operands.");
        return P1->getArgNo() < P2->getArgNo();
      }
      return false;
    };
    OrdersType Phis(TE.Scalars.size());
    std::iota(first: Phis.begin(), last: Phis.end(), value: 0);
    stable_sort(Range&: Phis, C: PHICompare);
    if (isIdentityOrder(Order: Phis))
      return std::nullopt; // No need to reorder.
    return std::move(Phis);
  }
  if (TE.isGather() &&
      (!TE.hasState() || !TE.isAltShuffle() ||
       ScalarsInSplitNodes.contains(Val: TE.getMainOp())) &&
      allSameType(VL: TE.Scalars)) {
    // TODO: add analysis of other gather nodes with extractelement
    // instructions and other values/instructions, not only undefs.
    if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
         (all_of(Range: TE.Scalars, P: IsaPred<UndefValue, ExtractElementInst>) &&
          any_of(Range: TE.Scalars, P: IsaPred<ExtractElementInst>))) &&
        all_of(Range: TE.Scalars, P: [](Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(Val: V);
          return !EE || isa<FixedVectorType>(Val: EE->getVectorOperandType());
        })) {
      // Check that gather of extractelements can be represented as
      // just a shuffle of a single vector.
      OrdersType CurrentOrder;
      bool Reuse =
          canReuseExtract(VL: TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
      if (Reuse || !CurrentOrder.empty())
        return std::move(CurrentOrder);
    }
    // If the gather node is <undef, v, .., poison> and
    // insertelement poison, v, 0 [+ permute]
    // is cheaper than
    // insertelement poison, v, n - try to reorder.
    // If rotating the whole graph, exclude the permute cost, the whole graph
    // might be transformed.
    int Sz = TE.Scalars.size();
    if (isSplat(VL: TE.Scalars) && !allConstant(VL: TE.Scalars) &&
        count_if(Range: TE.Scalars, P: IsaPred<UndefValue>) == Sz - 1) {
      const auto *It = find_if_not(Range: TE.Scalars, P: isConstant);
      if (It == TE.Scalars.begin())
        return OrdersType();
      auto *Ty = getWidenedType(ScalarTy: TE.Scalars.front()->getType(), VF: Sz);
      if (It != TE.Scalars.end()) {
        OrdersType Order(Sz, Sz);
        unsigned Idx = std::distance(first: TE.Scalars.begin(), last: It);
        Order[Idx] = 0;
        fixupOrderingIndices(Order);
        SmallVector<int> Mask;
        inversePermutation(Indices: Order, Mask);
        InstructionCost PermuteCost =
            TopToBottom
                ? 0
                : ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: Ty, Mask);
        InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
            Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: 0,
            Op0: PoisonValue::get(T: Ty), Op1: *It);
        InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
            Opcode: Instruction::InsertElement, Val: Ty, CostKind: TTI::TCK_RecipThroughput, Index: Idx,
            Op0: PoisonValue::get(T: Ty), Op1: *It);
        if (InsertFirstCost + PermuteCost < InsertIdxCost) {
          OrdersType Order(Sz, Sz);
          Order[Idx] = 0;
          return std::move(Order);
        }
      }
    }
    if (isSplat(VL: TE.Scalars))
      return std::nullopt;
    if (TE.Scalars.size() >= 3)
      if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
        return Order;
    // Check if can include the order of vectorized loads. For masked gathers do
    // extra analysis later, so include such nodes into a special list.
    if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
      SmallVector<Value *> PointerOps;
      StridedPtrInfo SPtrInfo;
      OrdersType CurrentOrder;
      LoadsState Res = canVectorizeLoads(VL: TE.Scalars, VL0: TE.Scalars.front(),
                                         Order&: CurrentOrder, PointerOps, SPtrInfo);
      if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize ||
          Res == LoadsState::CompressVectorize)
        return std::move(CurrentOrder);
    }
    // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
    // has been auditted for correctness with non-power-of-two vectors.
    if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
      if (std::optional<OrdersType> CurrentOrder =
              findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder))
        return CurrentOrder;
  }
  return std::nullopt;
}
8298
8299/// Checks if the given mask is a "clustered" mask with the same clusters of
8300/// size \p Sz, which are not identity submasks.
8301static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
8302 unsigned Sz) {
8303 ArrayRef<int> FirstCluster = Mask.slice(N: 0, M: Sz);
8304 if (ShuffleVectorInst::isIdentityMask(Mask: FirstCluster, NumSrcElts: Sz))
8305 return false;
8306 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
8307 ArrayRef<int> Cluster = Mask.slice(N: I, M: Sz);
8308 if (Cluster != FirstCluster)
8309 return false;
8310 }
8311 return true;
8312}
8313
/// Reorders the reuse mask of \p TE with \p Mask and, for gather nodes whose
/// resulting reuse mask is a repeated non-identity cluster, folds that
/// permutation into the scalars themselves so the reuse mask becomes a
/// repetition of identity submasks.
void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
  // Reorder reuses mask.
  reorderReuses(Reuses&: TE.ReuseShuffleIndices, Mask);
  const unsigned Sz = TE.Scalars.size();
  // For vectorized and non-clustered reused no need to do anything else.
  if (!TE.isGather() ||
      !ShuffleVectorInst::isOneUseSingleSourceMask(Mask: TE.ReuseShuffleIndices,
                                                   VF: Sz) ||
      !isRepeatedNonIdentityClusteredMask(Mask: TE.ReuseShuffleIndices, Sz))
    return;
  // Compose the node's reorder with the reuse mask.
  SmallVector<int> NewMask;
  inversePermutation(Indices: TE.ReorderIndices, Mask&: NewMask);
  addMask(Mask&: NewMask, SubMask: TE.ReuseShuffleIndices);
  // Clear reorder since it is going to be applied to the new mask.
  TE.ReorderIndices.clear();
  // Try to improve gathered nodes with clustered reuses, if possible.
  ArrayRef<int> Slice = ArrayRef(NewMask).slice(N: 0, M: Sz);
  SmallVector<unsigned> NewOrder(Slice);
  inversePermutation(Indices: NewOrder, Mask&: NewMask);
  reorderScalars(Scalars&: TE.Scalars, Mask: NewMask);
  // Fill the reuses mask with the identity submasks.
  for (auto *It = TE.ReuseShuffleIndices.begin(),
            *End = TE.ReuseShuffleIndices.end();
       It != End; std::advance(i&: It, n: Sz))
    std::iota(first: It, last: std::next(x: It, n: Sz), value: 0);
}
8340
8341static void combineOrders(MutableArrayRef<unsigned> Order,
8342 ArrayRef<unsigned> SecondaryOrder) {
8343 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
8344 "Expected same size of orders");
8345 size_t Sz = Order.size();
8346 SmallBitVector UsedIndices(Sz);
8347 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz)) {
8348 if (Order[Idx] != Sz)
8349 UsedIndices.set(Order[Idx]);
8350 }
8351 if (SecondaryOrder.empty()) {
8352 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8353 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
8354 Order[Idx] = Idx;
8355 } else {
8356 for (unsigned Idx : seq<unsigned>(Begin: 0, End: Sz))
8357 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
8358 !UsedIndices.test(Idx: SecondaryOrder[Idx]))
8359 Order[Idx] = SecondaryOrder[Idx];
8360 }
8361}
8362
/// Heuristically decides whether running the (compile-time expensive) graph
/// reordering is likely to pay off for the current vectorizable tree.
bool BoUpSLP::isProfitableToReorder() const {
  if (DisableTreeReorder)
    return false;

  constexpr unsigned TinyVF = 2;
  constexpr unsigned TinyTree = 10;
  constexpr unsigned PhiOpsLimit = 12;
  constexpr unsigned GatherLoadsLimit = 2;
  // Small trees are always cheap enough to (attempt to) reorder.
  if (VectorizableTree.size() <= TinyTree)
    return true;
  if (VectorizableTree.front()->hasState() &&
      !VectorizableTree.front()->isGather() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::PHI ||
       (VectorizableTree.front()->getVectorFactor() <= TinyVF &&
        (VectorizableTree.front()->getOpcode() == Instruction::PtrToInt ||
         VectorizableTree.front()->getOpcode() == Instruction::ICmp))) &&
      VectorizableTree.front()->ReorderIndices.empty()) {
    // Check if the tree has only single store and single (unordered) load node,
    // other nodes are phis or geps/binops, combined with phis, and/or single
    // gather load node
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::PHI &&
        VectorizableTree.front()->Scalars.size() == TinyVF &&
        VectorizableTree.front()->getNumOperands() > PhiOpsLimit)
      return false;
    // Single node, which require reorder - skip.
    if (VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Store &&
        VectorizableTree.front()->ReorderIndices.empty()) {
      // Count split nodes whose reorder could be folded into a commutative
      // vectorized user.
      const unsigned ReorderedSplitsCnt =
          count_if(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->State == TreeEntry::SplitVectorize &&
                   !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE &&
                   TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
                   ::isCommutative(I: TE->UserTreeIndex.UserTE->getMainOp());
          });
      // If (almost) all nodes carry no useful order, reordering cannot help.
      if (ReorderedSplitsCnt <= 1 &&
          static_cast<unsigned>(count_if(
              Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
                return ((!TE->isGather() &&
                         (TE->ReorderIndices.empty() ||
                          (TE->UserTreeIndex.UserTE &&
                           TE->UserTreeIndex.UserTE->State ==
                               TreeEntry::Vectorize &&
                           !TE->UserTreeIndex.UserTE->ReuseShuffleIndices
                                .empty()))) ||
                        (TE->isGather() && TE->ReorderIndices.empty() &&
                         (!TE->hasState() || TE->isAltShuffle() ||
                          TE->getOpcode() == Instruction::Load ||
                          TE->getOpcode() == Instruction::ZExt ||
                          TE->getOpcode() == Instruction::SExt))) &&
                       (VectorizableTree.front()->getVectorFactor() > TinyVF ||
                        !TE->isGather() || none_of(Range&: TE->Scalars, P: [&](Value *V) {
                          return !isConstant(V) && isVectorized(V);
                        }));
              })) >= VectorizableTree.size() - ReorderedSplitsCnt)
        return false;
    }
    bool HasPhis = false;
    bool HasLoad = true;
    unsigned GatherLoads = 0;
    for (const std::unique_ptr<TreeEntry> &TE :
         ArrayRef(VectorizableTree).drop_front()) {
      if (TE->State == TreeEntry::SplitVectorize)
        continue;
      if (!TE->hasState()) {
        // State-less nodes made of constants/phis (or binops+phis) do not
        // affect the decision.
        if (all_of(Range&: TE->Scalars, P: IsaPred<Constant, PHINode>) ||
            all_of(Range&: TE->Scalars, P: IsaPred<BinaryOperator, PHINode>))
          continue;
        if (VectorizableTree.front()->Scalars.size() == TinyVF &&
            any_of(Range&: TE->Scalars, P: IsaPred<PHINode, GEPOperator>))
          continue;
        return true;
      }
      if (TE->getOpcode() == Instruction::Load && TE->ReorderIndices.empty()) {
        if (!TE->isGather()) {
          HasLoad = false;
          continue;
        }
        // A second unordered load node (or enough gathered loads) makes the
        // reordering attempt worthwhile.
        if (HasLoad)
          return true;
        ++GatherLoads;
        if (GatherLoads >= GatherLoadsLimit)
          return true;
      }
      if (TE->getOpcode() == Instruction::GetElementPtr ||
          Instruction::isBinaryOp(Opcode: TE->getOpcode()))
        continue;
      if (TE->getOpcode() != Instruction::PHI &&
          (!TE->hasCopyableElements() ||
           static_cast<unsigned>(count_if(Range&: TE->Scalars, P: IsaPred<PHINode>)) <
               TE->Scalars.size() / 2))
        return true;
      if (VectorizableTree.front()->Scalars.size() == TinyVF &&
          TE->getNumOperands() > PhiOpsLimit)
        return false;
      HasPhis = true;
    }
    return !HasPhis;
  }
  return true;
}
8466
8467void BoUpSLP::TreeEntry::reorderSplitNode(unsigned Idx, ArrayRef<int> Mask,
8468 ArrayRef<int> MaskOrder) {
8469 assert(State == TreeEntry::SplitVectorize && "Expected split user node.");
8470 SmallVector<int> NewMask(getVectorFactor());
8471 SmallVector<int> NewMaskOrder(getVectorFactor());
8472 std::iota(first: NewMask.begin(), last: NewMask.end(), value: 0);
8473 std::iota(first: NewMaskOrder.begin(), last: NewMaskOrder.end(), value: 0);
8474 if (Idx == 0) {
8475 copy(Range&: Mask, Out: NewMask.begin());
8476 copy(Range&: MaskOrder, Out: NewMaskOrder.begin());
8477 } else {
8478 assert(Idx == 1 && "Expected either 0 or 1 index.");
8479 unsigned Offset = CombinedEntriesWithIndices.back().second;
8480 for (unsigned I : seq<unsigned>(Size: Mask.size())) {
8481 NewMask[I + Offset] = Mask[I] + Offset;
8482 NewMaskOrder[I + Offset] = MaskOrder[I] + Offset;
8483 }
8484 }
8485 reorderScalars(Scalars, Mask: NewMask);
8486 reorderOrder(Order&: ReorderIndices, Mask: NewMaskOrder, /*BottomOrder=*/true);
8487 if (!ReorderIndices.empty() && BoUpSLP::isIdentityOrder(Order: ReorderIndices))
8488 ReorderIndices.clear();
8489}
8490
/// Reorders the SLP graph top-to-bottom. For every vectorization factor it
/// collects the orders preferred by the nodes (orders requested by external
/// store users, alternate-shuffle nodes, PHI nodes and gathers), elects the
/// most frequently used order for that VF, and applies it: scalars, operands,
/// reuse masks and user split-vectorize nodes are all updated consistently.
void BoUpSLP::reorderTopToBottom() {
  // Maps VF to the graph nodes.
  DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
  // ExtractElement gather nodes which can be vectorized and need to handle
  // their ordering.
  DenseMap<const TreeEntry *, OrdersType> GathersToOrders;

  // Phi nodes can have preferred ordering based on their result users.
  DenseMap<const TreeEntry *, OrdersType> PhisToOrders;

  // AltShuffles can also have a preferred ordering that leads to fewer
  // instructions, e.g., the addsub instruction in x86.
  DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;

  // Maps a TreeEntry to the reorder indices of external users.
  DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
      ExternalUserReorderMap;
  // Find all reorderable nodes with the given VF.
  // Currently these are vectorized stores,loads,extracts + some gathering of
  // extracts.
  for_each(Range&: VectorizableTree, F: [&, &TTIRef = *TTI](
                                          const std::unique_ptr<TreeEntry> &TE) {
    // Look for external users that will probably be vectorized.
    SmallVector<OrdersType, 1> ExternalUserReorderIndices =
        findExternalStoreUsersReorderIndices(TE: TE.get());
    if (!ExternalUserReorderIndices.empty()) {
      VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
      ExternalUserReorderMap.try_emplace(Key: TE.get(),
                                         Args: std::move(ExternalUserReorderIndices));
    }

    // Patterns like [fadd,fsub] can be combined into a single instruction in
    // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
    // to take into account their order when looking for the most used order.
    if (TE->hasState() && TE->isAltShuffle() &&
        TE->State != TreeEntry::SplitVectorize) {
      Type *ScalarTy = TE->Scalars[0]->getType();
      VectorType *VecTy = getWidenedType(ScalarTy, VF: TE->Scalars.size());
      unsigned Opcode0 = TE->getOpcode();
      unsigned Opcode1 = TE->getAltOpcode();
      SmallBitVector OpcodeMask(
          getAltInstrMask(VL: TE->Scalars, ScalarTy, Opcode0, Opcode1));
      // If this pattern is supported by the target then we consider the order.
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
        AltShufflesToOrders.try_emplace(Key: TE.get(), Args: OrdersType());
      }
      // TODO: Check the reverse order too.
    }

    bool IgnoreReorder =
        !UserIgnoreList && VectorizableTree.front()->hasState() &&
        (VectorizableTree.front()->getOpcode() == Instruction::InsertElement ||
         VectorizableTree.front()->getOpcode() == Instruction::Store);
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(TE: *TE, /*TopToBottom=*/true, IgnoreReorder)) {
      // Do not include ordering for nodes used in the alt opcode vectorization,
      // better to reorder them during bottom-to-top stage. If follow the order
      // here, it causes reordering of the whole graph though actually it is
      // profitable just to reorder the subgraph that starts from the alternate
      // opcode vectorization node. Such nodes already end-up with the shuffle
      // instruction and it is just enough to change this shuffle rather than
      // rotate the scalars for the whole graph.
      unsigned Cnt = 0;
      const TreeEntry *UserTE = TE.get();
      // Walk up the user chain (bounded by RecursionMaxDepth) looking for an
      // alternate-opcode vectorize ancestor.
      while (UserTE && Cnt < RecursionMaxDepth) {
        if (!UserTE->UserTreeIndex)
          break;
        if (UserTE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
            UserTE->UserTreeIndex.UserTE->isAltShuffle() &&
            UserTE->UserTreeIndex.UserTE->Idx != 0)
          // Exits only the enclosing for_each lambda: skip recording any
          // ordering for this entry.
          return;
        UserTE = UserTE->UserTreeIndex.UserTE;
        ++Cnt;
      }
      VFToOrderedEntries[TE->getVectorFactor()].insert(X: TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            TE->State == TreeEntry::CompressVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
      if (TE->State == TreeEntry::Vectorize &&
          TE->getOpcode() == Instruction::PHI)
        PhisToOrders.try_emplace(Key: TE.get(), Args&: *CurrentOrder);
    }
  });

  // Reorder the graph nodes according to their vectorization factor.
  // Walk VFs downward from the root's factor: an even VF steps down by 2, an
  // odd VF first drops by 1 to the next even value.
  for (unsigned VF = VectorizableTree.front()->getVectorFactor();
       !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
    auto It = VFToOrderedEntries.find(Val: VF);
    if (It == VFToOrderedEntries.end())
      continue;
    // Try to find the most profitable order. We just are looking for the most
    // used order and reorder scalar elements in the nodes according to this
    // mostly used order.
    ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
    // Delete VF entry upon exit.
    llvm::scope_exit Cleanup([&]() { VFToOrderedEntries.erase(I: It); });

    // All operands are reordered and used only in this node - propagate the
    // most used order to the user node.
    // Maps each candidate order to its use count.
    MapVector<OrdersType, unsigned,
              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
        OrdersUses;
    for (const TreeEntry *OpTE : OrderedEntries) {
      // No need to reorder this nodes, still need to extend and to use shuffle,
      // just need to merge reordering shuffle and the reuse shuffle.
      if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(Val: OpTE) &&
          OpTE->State != TreeEntry::SplitVectorize)
        continue;
      // Count number of orders uses. The order is looked up in the per-kind
      // maps populated above; ReorderIndices is the fallback.
      const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
                           &PhisToOrders]() -> const OrdersType & {
        if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
          auto It = GathersToOrders.find(Val: OpTE);
          if (It != GathersToOrders.end())
            return It->second;
        }
        if (OpTE->hasState() && OpTE->isAltShuffle()) {
          auto It = AltShufflesToOrders.find(Val: OpTE);
          if (It != AltShufflesToOrders.end())
            return It->second;
        }
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::PHI) {
          auto It = PhisToOrders.find(Val: OpTE);
          if (It != PhisToOrders.end())
            return It->second;
        }
        return OpTE->ReorderIndices;
      }();
      // First consider the order of the external scalar users.
      auto It = ExternalUserReorderMap.find(Val: OpTE);
      if (It != ExternalUserReorderMap.end()) {
        const auto &ExternalUserReorderIndices = It->second;
        // If the OpTE vector factor != number of scalars - use natural order,
        // it is an attempt to reorder node with reused scalars but with
        // external uses.
        if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
          OrdersUses.try_emplace(Key: OrdersType(), Args: 0).first->second +=
              ExternalUserReorderIndices.size();
        } else {
          for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
            ++OrdersUses.try_emplace(Key: ExtOrder, Args: 0).first->second;
        }
        // No other useful reorder data in this entry.
        if (Order.empty())
          continue;
      }
      // Stores actually store the mask, not the order, need to invert.
      if (OpTE->State == TreeEntry::Vectorize &&
          OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
        assert(!OpTE->isAltShuffle() &&
               "Alternate instructions are only supported by BinaryOperator "
               "and CastInst.");
        SmallVector<int> Mask;
        inversePermutation(Indices: Order, Mask);
        unsigned E = Order.size();
        OrdersType CurrentOrder(E, E);
        transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
          return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
        });
        fixupOrderingIndices(Order: CurrentOrder);
        ++OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second;
      } else {
        ++OrdersUses.try_emplace(Key: Order, Args: 0).first->second;
      }
    }
    if (OrdersUses.empty())
      continue;
    // Choose the most used order. Empty orders and explicit identity orders
    // both count toward the identity candidate; FilledIdentityCnt tracks only
    // the non-empty ones for tie-breaking below.
    unsigned IdentityCnt = 0;
    unsigned FilledIdentityCnt = 0;
    OrdersType IdentityOrder(VF, VF);
    for (auto &Pair : OrdersUses) {
      if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
        if (!Pair.first.empty())
          FilledIdentityCnt += Pair.second;
        IdentityCnt += Pair.second;
        combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
      }
    }
    MutableArrayRef<unsigned> BestOrder = IdentityOrder;
    unsigned Cnt = IdentityCnt;
    for (auto &Pair : OrdersUses) {
      // Prefer identity order. But, if filled identity found (non-empty order)
      // with same number of uses, as the new candidate order, we can choose
      // this candidate order.
      if (Cnt < Pair.second ||
          (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
           Cnt == Pair.second && !BestOrder.empty() &&
           isIdentityOrder(Order: BestOrder))) {
        combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
        BestOrder = Pair.first;
        Cnt = Pair.second;
      } else {
        combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
      }
    }
    // Set order of the user node.
    if (isIdentityOrder(Order: BestOrder))
      continue;
    fixupOrderingIndices(Order: BestOrder);
    // Mask is the inverse permutation of BestOrder; MaskOrder is BestOrder
    // itself expressed as a shuffle mask (poison for out-of-range lanes).
    SmallVector<int> Mask;
    inversePermutation(Indices: BestOrder, Mask);
    SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
    unsigned E = BestOrder.size();
    transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
      return I < E ? static_cast<int>(I) : PoisonMaskElem;
    });
    // Do an actual reordering, if profitable.
    for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
      // Just do the reordering for the nodes with the given VF.
      if (TE->Scalars.size() != VF) {
        if (TE->ReuseShuffleIndices.size() == VF) {
          assert(TE->State != TreeEntry::SplitVectorize &&
                 "Split vectorized not expected.");
          // Need to reorder the reuses masks of the operands with smaller VF to
          // be able to find the match between the graph nodes and scalar
          // operands of the given node during vectorization/cost estimation.
          assert(
              (!TE->UserTreeIndex ||
               TE->UserTreeIndex.UserTE->Scalars.size() == VF ||
               TE->UserTreeIndex.UserTE->Scalars.size() == TE->Scalars.size() ||
               TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize) &&
              "All users must be of VF size.");
          if (SLPReVec) {
            assert(SLPReVec && "Only supported by REVEC.");
            // ShuffleVectorInst does not do reorderOperands (and it should not
            // because ShuffleVectorInst supports only a limited set of
            // patterns). Only do reorderNodeWithReuses if the user is not
            // ShuffleVectorInst.
            if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->hasState() &&
                isa<ShuffleVectorInst>(Val: TE->UserTreeIndex.UserTE->getMainOp()))
              continue;
          }
          // Update ordering of the operands with the smaller VF than the given
          // one.
          reorderNodeWithReuses(TE&: *TE, Mask);
          // Update orders in user split vectorize nodes.
          if (TE->UserTreeIndex &&
              TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
            TE->UserTreeIndex.UserTE->reorderSplitNode(
                Idx: TE->UserTreeIndex.EdgeIdx, Mask, MaskOrder);
        }
        continue;
      }
      if ((TE->State == TreeEntry::SplitVectorize &&
           TE->ReuseShuffleIndices.empty()) ||
          ((TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize) &&
           (isa<ExtractElementInst, ExtractValueInst, LoadInst, StoreInst,
                InsertElementInst>(Val: TE->getMainOp()) ||
            (SLPReVec && isa<ShuffleVectorInst>(Val: TE->getMainOp()))))) {
        assert(
            (!TE->isAltShuffle() || (TE->State == TreeEntry::SplitVectorize &&
                                     TE->ReuseShuffleIndices.empty())) &&
            "Alternate instructions are only supported by BinaryOperator "
            "and CastInst.");
        // Build correct orders for extract{element,value}, loads,
        // stores and alternate (split) nodes.
        reorderOrder(Order&: TE->ReorderIndices, Mask);
        if (isa<InsertElementInst, StoreInst>(Val: TE->getMainOp()))
          TE->reorderOperands(Mask);
      } else {
        // Reorder the node and its operands.
        TE->reorderOperands(Mask);
        assert(TE->ReorderIndices.empty() &&
               "Expected empty reorder sequence.");
        reorderScalars(Scalars&: TE->Scalars, Mask);
      }
      if (!TE->ReuseShuffleIndices.empty()) {
        // Apply reversed order to keep the original ordering of the reused
        // elements to avoid extra reorder indices shuffling.
        OrdersType CurrentOrder;
        reorderOrder(Order&: CurrentOrder, Mask: MaskOrder);
        SmallVector<int> NewReuses;
        inversePermutation(Indices: CurrentOrder, Mask&: NewReuses);
        addMask(Mask&: NewReuses, SubMask: TE->ReuseShuffleIndices);
        TE->ReuseShuffleIndices.swap(RHS&: NewReuses);
      } else if (TE->UserTreeIndex &&
                 TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize)
        // Update orders in user split vectorize nodes.
        TE->UserTreeIndex.UserTE->reorderSplitNode(Idx: TE->UserTreeIndex.EdgeIdx,
                                                   Mask, MaskOrder);
    }
  }
}
8782
8783void BoUpSLP::buildReorderableOperands(
8784 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
8785 const SmallPtrSetImpl<const TreeEntry *> &ReorderableGathers,
8786 SmallVectorImpl<TreeEntry *> &GatherOps) {
8787 for (unsigned I : seq<unsigned>(Size: UserTE->getNumOperands())) {
8788 if (any_of(Range&: Edges, P: [I](const std::pair<unsigned, TreeEntry *> &OpData) {
8789 return OpData.first == I &&
8790 (OpData.second->State == TreeEntry::Vectorize ||
8791 OpData.second->State == TreeEntry::StridedVectorize ||
8792 OpData.second->State == TreeEntry::CompressVectorize ||
8793 OpData.second->State == TreeEntry::SplitVectorize);
8794 }))
8795 continue;
8796 // Do not request operands, if they do not exist.
8797 if (UserTE->hasState()) {
8798 if (UserTE->getOpcode() == Instruction::ExtractElement ||
8799 UserTE->getOpcode() == Instruction::ExtractValue)
8800 continue;
8801 if (UserTE->getOpcode() == Instruction::InsertElement && I == 0)
8802 continue;
8803 if (UserTE->getOpcode() == Instruction::Store &&
8804 UserTE->State == TreeEntry::Vectorize && I == 1)
8805 continue;
8806 if (UserTE->getOpcode() == Instruction::Load &&
8807 (UserTE->State == TreeEntry::Vectorize ||
8808 UserTE->State == TreeEntry::StridedVectorize ||
8809 UserTE->State == TreeEntry::CompressVectorize))
8810 continue;
8811 }
8812 TreeEntry *TE = getOperandEntry(E: UserTE, Idx: I);
8813 assert(TE && "Expected operand entry.");
8814 if (!TE->isGather()) {
8815 // Add the node to the list of the ordered nodes with the identity
8816 // order.
8817 Edges.emplace_back(Args&: I, Args&: TE);
8818 // Add ScatterVectorize nodes to the list of operands, where just
8819 // reordering of the scalars is required. Similar to the gathers, so
8820 // simply add to the list of gathered ops.
8821 // If there are reused scalars, process this node as a regular vectorize
8822 // node, just reorder reuses mask.
8823 if (TE->State == TreeEntry::ScatterVectorize &&
8824 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
8825 GatherOps.push_back(Elt: TE);
8826 continue;
8827 }
8828 if (ReorderableGathers.contains(Ptr: TE))
8829 GatherOps.push_back(Elt: TE);
8830 }
8831}
8832
/// Reorders the SLP graph bottom-to-top: starting from nodes that carry
/// reordering data, the preferred order is propagated to their user nodes
/// (one user at a time, popped from a priority queue keyed by the user's
/// index), reordering operands, scalars and reuse masks as required. When
/// \p IgnoreReorder is set, a remaining reorder on the root node is dropped
/// at the end as unnecessary.
void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
  // Orders entries primarily by the index of their user node, so that all
  // operands of the same user are popped from the queue consecutively.
  struct TreeEntryCompare {
    bool operator()(const TreeEntry *LHS, const TreeEntry *RHS) const {
      if (LHS->UserTreeIndex && RHS->UserTreeIndex)
        return LHS->UserTreeIndex.UserTE->Idx < RHS->UserTreeIndex.UserTE->Idx;
      return LHS->Idx < RHS->Idx;
    }
  };
  PriorityQueue<TreeEntry *, SmallVector<TreeEntry *>, TreeEntryCompare> Queue;
  DenseSet<const TreeEntry *> GathersToOrders;
  // Find all reorderable leaf nodes with the given VF.
  // Currently these are vectorized loads,extracts without alternate operands +
  // some gathering of extracts.
  SmallPtrSet<const TreeEntry *, 4> NonVectorized;
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->State != TreeEntry::Vectorize &&
        TE->State != TreeEntry::StridedVectorize &&
        TE->State != TreeEntry::CompressVectorize &&
        TE->State != TreeEntry::SplitVectorize)
      NonVectorized.insert(Ptr: TE.get());
    if (std::optional<OrdersType> CurrentOrder =
            getReorderingData(TE: *TE, /*TopToBottom=*/false, IgnoreReorder)) {
      Queue.push(x: TE.get());
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize) ||
          !TE->ReuseShuffleIndices.empty())
        GathersToOrders.insert(V: TE.get());
    }
  }

  // 1. Propagate order to the graph nodes, which use only reordered nodes.
  // I.e., if the node has operands, that are reordered, try to make at least
  // one operand order in the natural order and reorder others + reorder the
  // user node itself.
  SmallPtrSet<const TreeEntry *, 4> Visited, RevisitedOps;
  while (!Queue.empty()) {
    // 1. Filter out only reordered nodes.
    // Users.first is the common user node; Users.second collects the
    // (operand index, operand entry) pairs that passed the filter below.
    std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>> Users;
    TreeEntry *TE = Queue.top();
    const TreeEntry *UserTE = TE->UserTreeIndex.UserTE;
    Queue.pop();
    SmallVector<TreeEntry *> OrderedOps(1, TE);
    // Drain all queued entries that share the same user node.
    while (!Queue.empty()) {
      TE = Queue.top();
      if (!UserTE || UserTE != TE->UserTreeIndex.UserTE)
        break;
      Queue.pop();
      OrderedOps.push_back(Elt: TE);
    }
    for (TreeEntry *TE : OrderedOps) {
      if (!(TE->State == TreeEntry::Vectorize ||
            TE->State == TreeEntry::StridedVectorize ||
            TE->State == TreeEntry::CompressVectorize ||
            TE->State == TreeEntry::SplitVectorize ||
            (TE->isGather() && GathersToOrders.contains(V: TE))) ||
          !TE->UserTreeIndex || !TE->ReuseShuffleIndices.empty() ||
          !Visited.insert(Ptr: TE).second)
        continue;
      // Build a map between user nodes and their operands order to speedup
      // search. The graph currently does not provide this dependency directly.
      Users.first = TE->UserTreeIndex.UserTE;
      Users.second.emplace_back(Args&: TE->UserTreeIndex.EdgeIdx, Args&: TE);
    }
    if (Users.first) {
      auto &Data = Users;
      if (Data.first->State == TreeEntry::SplitVectorize) {
        assert(
            Data.second.size() <= 2 &&
            "Expected not greater than 2 operands for split vectorize node.");
        if (any_of(Range&: Data.second,
                   P: [](const auto &Op) { return !Op.second->UserTreeIndex; }))
          continue;
        // Update orders in user split vectorize nodes.
        assert(Data.first->CombinedEntriesWithIndices.size() == 2 &&
               "Expected exactly 2 entries.");
        for (const auto &P : Data.first->CombinedEntriesWithIndices) {
          TreeEntry &OpTE = *VectorizableTree[P.first];
          OrdersType Order = OpTE.ReorderIndices;
          if (Order.empty() || !OpTE.ReuseShuffleIndices.empty()) {
            if (!OpTE.isGather() && OpTE.ReuseShuffleIndices.empty())
              continue;
            const auto BestOrder =
                getReorderingData(TE: OpTE, /*TopToBottom=*/false, IgnoreReorder);
            if (!BestOrder || BestOrder->empty() || isIdentityOrder(Order: *BestOrder))
              continue;
            Order = *BestOrder;
          }
          fixupOrderingIndices(Order);
          SmallVector<int> Mask;
          inversePermutation(Indices: Order, Mask);
          const unsigned E = Order.size();
          SmallVector<int> MaskOrder(E, PoisonMaskElem);
          transform(Range&: Order, d_first: MaskOrder.begin(), F: [E](unsigned I) {
            return I < E ? static_cast<int>(I) : PoisonMaskElem;
          });
          Data.first->reorderSplitNode(Idx: P.second ? 1 : 0, Mask, MaskOrder);
          // Clear ordering of the operand.
          if (!OpTE.ReorderIndices.empty()) {
            OpTE.ReorderIndices.clear();
          } else if (!OpTE.ReuseShuffleIndices.empty()) {
            reorderReuses(Reuses&: OpTE.ReuseShuffleIndices, Mask);
          } else {
            assert(OpTE.isGather() && "Expected only gather/buildvector node.");
            reorderScalars(Scalars&: OpTE.Scalars, Mask);
          }
        }
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty()) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(x: Data.first);
        }
        continue;
      }
      // Check that operands are used only in the User node.
      SmallVector<TreeEntry *> GatherOps;
      buildReorderableOperands(UserTE: Data.first, Edges&: Data.second, ReorderableGathers: NonVectorized,
                               GatherOps);
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
      MapVector<OrdersType, unsigned,
                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
          OrdersUses;
      // Do the analysis for each tree entry only once, otherwise the order of
      // the same node may be considered several times, though might be not
      // profitable.
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      SmallPtrSet<const TreeEntry *, 4> VisitedUsers;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
        if (!VisitedOps.insert(Ptr: OpTE).second)
          continue;
        if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(V: OpTE))
          continue;
        const auto Order = [&]() -> const OrdersType {
          if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
            return getReorderingData(TE: *OpTE, /*TopToBottom=*/false,
                                     IgnoreReorder)
                .value_or(u: OrdersType(1));
          return OpTE->ReorderIndices;
        }();
        // The order is partially ordered, skip it in favor of fully non-ordered
        // orders.
        if (Order.size() == 1)
          continue;

        // Check that the reordering does not increase number of shuffles, i.e.
        // same-values-nodes has same parents or their parents has same parents.
        if (!Order.empty() && !isIdentityOrder(Order)) {
          Value *Root = OpTE->hasState()
                            ? OpTE->getMainOp()
                            : *find_if_not(Range&: OpTE->Scalars, P: isConstant);
          // Collects the user nodes of all tree entries that contain the same
          // scalars as OpTE (same VF, same size, same values modulo order).
          auto GetSameNodesUsers = [&](Value *Root) {
            SmallSetVector<TreeEntry *, 4> Res;
            for (const TreeEntry *TE : ValueToGatherNodes.lookup(Val: Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
                Res.insert(X: TE->UserTreeIndex.UserTE);
            }
            for (const TreeEntry *TE : getTreeEntries(V: Root)) {
              if (TE != OpTE && TE->UserTreeIndex &&
                  TE->getVectorFactor() == OpTE->getVectorFactor() &&
                  TE->Scalars.size() == OpTE->Scalars.size() &&
                  ((TE->ReorderIndices.empty() && OpTE->isSame(VL: TE->Scalars)) ||
                   (OpTE->ReorderIndices.empty() && TE->isSame(VL: OpTE->Scalars))))
                Res.insert(X: TE->UserTreeIndex.UserTE);
            }
            return Res.takeVector();
          };
          auto GetNumOperands = [](const TreeEntry *TE) {
            if (TE->State == TreeEntry::SplitVectorize)
              return TE->getNumOperands();
            // For calls count only the call arguments, not the callee/bundle
            // operands.
            if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
              return CI->arg_size();
            return TE->getNumOperands();
          };
          // True if every (non-scalar-intrinsic-arg) operand of TE already
          // carries reorder or reuse data, i.e. reordering TE is consistent
          // with its operands.
          auto NodeShouldBeReorderedWithOperands = [&, TTI = TTI](
                                                       const TreeEntry *TE) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(Val: TE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(TE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(E: TE, Idx);
              if (Op->isGather() && Op->hasState()) {
                const TreeEntry *VecOp =
                    getSameValuesTreeEntry(V: Op->getMainOp(), VL: Op->Scalars);
                if (VecOp)
                  Op = VecOp;
              }
              if (Op->ReorderIndices.empty() && Op->ReuseShuffleIndices.empty())
                return false;
            }
            return true;
          };
          SmallVector<TreeEntry *> Users = GetSameNodesUsers(Root);
          if (!Users.empty() && !all_of(Range&: Users, P: [&](TreeEntry *UTE) {
                if (!RevisitedOps.insert(Ptr: UTE).second)
                  return false;
                return UTE == Data.first || !UTE->ReorderIndices.empty() ||
                       !UTE->ReuseShuffleIndices.empty() ||
                       (UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE == Data.first) ||
                       (Data.first->UserTreeIndex &&
                        Data.first->UserTreeIndex.UserTE == UTE) ||
                       (IgnoreReorder && UTE->UserTreeIndex &&
                        UTE->UserTreeIndex.UserTE->Idx == 0) ||
                       NodeShouldBeReorderedWithOperands(UTE);
              }))
            continue;
          // Re-queue the operands of the sibling users so they are analyzed
          // again together with this reordering.
          for (TreeEntry *UTE : Users) {
            Intrinsic::ID ID = Intrinsic::not_intrinsic;
            if (auto *CI = dyn_cast<CallInst>(Val: UTE->getMainOp()); CI)
              ID = getVectorIntrinsicIDForCall(CI, TLI);
            for (unsigned Idx : seq<unsigned>(Size: GetNumOperands(UTE))) {
              if (ID != Intrinsic::not_intrinsic &&
                  isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI))
                continue;
              const TreeEntry *Op = getOperandEntry(E: UTE, Idx);
              Visited.erase(Ptr: Op);
              Queue.push(x: const_cast<TreeEntry *>(Op));
            }
          }
        }
        // Weight the order by the number of edges that reference this entry.
        unsigned NumOps = count_if(
            Range&: Data.second, P: [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
              return P.second == OpTE;
            });
        // Stores actually store the mask, not the order, need to invert.
        if (OpTE->State == TreeEntry::Vectorize &&
            OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
          assert(!OpTE->isAltShuffle() &&
                 "Alternate instructions are only supported by BinaryOperator "
                 "and CastInst.");
          SmallVector<int> Mask;
          inversePermutation(Indices: Order, Mask);
          unsigned E = Order.size();
          OrdersType CurrentOrder(E, E);
          transform(Range&: Mask, d_first: CurrentOrder.begin(), F: [E](int Idx) {
            return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(Order: CurrentOrder);
          OrdersUses.try_emplace(Key: CurrentOrder, Args: 0).first->second += NumOps;
        } else {
          OrdersUses.try_emplace(Key: Order, Args: 0).first->second += NumOps;
        }
        auto Res = OrdersUses.try_emplace(Key: OrdersType(), Args: 0);
        const auto AllowsReordering = [&](const TreeEntry *TE) {
          if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
              (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
              (IgnoreReorder && TE->Idx == 0))
            return true;
          if (TE->isGather()) {
            if (GathersToOrders.contains(V: TE))
              return !getReorderingData(TE: *TE, /*TopToBottom=*/false,
                                        IgnoreReorder)
                          .value_or(u: OrdersType(1))
                          .empty();
            return true;
          }
          return false;
        };
        if (OpTE->UserTreeIndex) {
          TreeEntry *UserTE = OpTE->UserTreeIndex.UserTE;
          if (!VisitedUsers.insert(Ptr: UserTE).second)
            continue;
          // May reorder user node if it requires reordering, has reused
          // scalars, is an alternate op vectorize node or its op nodes require
          // reordering.
          if (AllowsReordering(UserTE))
            continue;
          // Check if users allow reordering.
          // Currently look up just 1 level of operands to avoid increase of
          // the compile time.
          // Profitable to reorder if definitely more operands allow
          // reordering rather than those with natural order.
          ArrayRef<std::pair<unsigned, TreeEntry *>> Ops = Users.second;
          if (static_cast<unsigned>(count_if(
                  Range&: Ops, P: [UserTE, &AllowsReordering](
                            const std::pair<unsigned, TreeEntry *> &Op) {
                    return AllowsReordering(Op.second) &&
                           Op.second->UserTreeIndex.UserTE == UserTE;
                  })) <= Ops.size() / 2)
            // Vote for keeping the natural (empty) order.
            ++Res.first->second;
        }
      }
      if (OrdersUses.empty()) {
        Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
        continue;
      }
      // Choose the most used order.
      unsigned IdentityCnt = 0;
      unsigned VF = Data.second.front().second->getVectorFactor();
      OrdersType IdentityOrder(VF, VF);
      for (auto &Pair : OrdersUses) {
        if (Pair.first.empty() || isIdentityOrder(Order: Pair.first)) {
          IdentityCnt += Pair.second;
          combineOrders(Order: IdentityOrder, SecondaryOrder: Pair.first);
        }
      }
      MutableArrayRef<unsigned> BestOrder = IdentityOrder;
      unsigned Cnt = IdentityCnt;
      for (auto &Pair : OrdersUses) {
        // Prefer identity order. But, if filled identity found (non-empty
        // order) with same number of uses, as the new candidate order, we can
        // choose this candidate order.
        if (Cnt < Pair.second) {
          combineOrders(Order: Pair.first, SecondaryOrder: BestOrder);
          BestOrder = Pair.first;
          Cnt = Pair.second;
        } else {
          combineOrders(Order: BestOrder, SecondaryOrder: Pair.first);
        }
      }
      // Set order of the user node.
      if (isIdentityOrder(Order: BestOrder)) {
        Visited.insert_range(R: llvm::make_second_range(c&: Data.second));
        continue;
      }
      fixupOrderingIndices(Order: BestOrder);
      // Erase operands from OrderedEntries list and adjust their orders.
      VisitedOps.clear();
      SmallVector<int> Mask;
      inversePermutation(Indices: BestOrder, Mask);
      SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
      unsigned E = BestOrder.size();
      transform(Range&: BestOrder, d_first: MaskOrder.begin(), F: [E](unsigned I) {
        return I < E ? static_cast<int>(I) : PoisonMaskElem;
      });
      for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
        TreeEntry *TE = Op.second;
        if (!VisitedOps.insert(Ptr: TE).second)
          continue;
        if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
          reorderNodeWithReuses(TE&: *TE, Mask);
          continue;
        }
        // Gathers are processed separately.
        if (TE->State != TreeEntry::Vectorize &&
            TE->State != TreeEntry::StridedVectorize &&
            TE->State != TreeEntry::CompressVectorize &&
            TE->State != TreeEntry::SplitVectorize &&
            (TE->State != TreeEntry::ScatterVectorize ||
             TE->ReorderIndices.empty()))
          continue;
        assert((BestOrder.size() == TE->ReorderIndices.size() ||
                TE->ReorderIndices.empty()) &&
               "Non-matching sizes of user/operand entries.");
        reorderOrder(Order&: TE->ReorderIndices, Mask);
        if (IgnoreReorder && TE == VectorizableTree.front().get())
          IgnoreReorder = false;
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
        if (!Gather->ReuseShuffleIndices.empty()) {
          // Just reorder reuses indices.
          reorderReuses(Reuses&: Gather->ReuseShuffleIndices, Mask);
          continue;
        }
        reorderScalars(Scalars&: Gather->Scalars, Mask);
        Visited.insert(Ptr: Gather);
      }
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      auto IsNotProfitableAltCodeNode = [](const TreeEntry &TE) {
        return TE.isAltShuffle() &&
               (!TE.ReuseShuffleIndices.empty() || TE.getVectorFactor() == 2 ||
                TE.ReorderIndices.empty());
      };
      if (Data.first->State != TreeEntry::Vectorize ||
          !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
              Val: Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first))
        Data.first->reorderOperands(Mask);
      if (!isa<InsertElementInst, StoreInst>(Val: Data.first->getMainOp()) ||
          IsNotProfitableAltCodeNode(*Data.first) ||
          Data.first->State == TreeEntry::StridedVectorize ||
          Data.first->State == TreeEntry::CompressVectorize) {
        reorderScalars(Scalars&: Data.first->Scalars, Mask);
        reorderOrder(Order&: Data.first->ReorderIndices, Mask: MaskOrder,
                     /*BottomOrder=*/true);
        if (Data.first->ReuseShuffleIndices.empty() &&
            !Data.first->ReorderIndices.empty() &&
            !IsNotProfitableAltCodeNode(*Data.first)) {
          // Insert user node to the list to try to sink reordering deeper in
          // the graph.
          Queue.push(x: Data.first);
        }
      } else {
        reorderOrder(Order&: Data.first->ReorderIndices, Mask);
      }
    }
  }
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
      VectorizableTree.front()->ReuseShuffleIndices.empty())
    VectorizableTree.front()->ReorderIndices.clear();
}
9240
9241Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
9242 if (Entry.hasState() &&
9243 (Entry.getOpcode() == Instruction::Store ||
9244 Entry.getOpcode() == Instruction::Load) &&
9245 Entry.State == TreeEntry::StridedVectorize &&
9246 !Entry.ReorderIndices.empty() && isReverseOrder(Order: Entry.ReorderIndices))
9247 return dyn_cast<Instruction>(Val: Entry.Scalars[Entry.ReorderIndices.front()]);
9248 return dyn_cast<Instruction>(Val: Entry.Scalars.front());
9249}
9250
void BoUpSLP::buildExternalUses(
    const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  // A scalar can belong to at most ScalarToTreeEntries.size() tree entries,
  // so a scalar with this many uses necessarily has at least one user
  // outside of the vectorized tree.
  const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
  // Maps a scalar to the index of its record in ExternalUses, so the same
  // scalar is not processed twice once fully handled.
  DenseMap<Value *, unsigned> ScalarToExtUses;
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
    TreeEntry *Entry = TEPtr.get();

    // No need to handle users of gathered values.
    if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
        DeletedNodes.contains(Ptr: Entry) ||
        TransformedToGatherNodes.contains(Val: Entry))
      continue;

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];
      // Non-instruction scalars and copyable elements never need extraction.
      if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
        continue;

      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Val: Scalar);
      if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
        continue;

      // Too many uses to enumerate individually - conservatively record an
      // external use with an unknown (null) user, covering all of them.
      if (Scalar->hasNUsesOrMore(N: NumVectScalars)) {
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract from lane " << FoundLane
                          << " from " << *Scalar << "for many users.\n");
        It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
        ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
        ExternalUsesWithNonUsers.insert(Ptr: Scalar);
        continue;
      }

      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(V: Scalar);
      if (ExtI != ExternallyUsedValues.end()) {
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
                          << FoundLane << " from " << *Scalar << ".\n");
        ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size());
        ExternalUses.emplace_back(Args&: Scalar, Args: nullptr, Args&: *Entry, Args&: FoundLane);
        continue;
      }
      for (User *U : Scalar->users()) {
        LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(Val: U);
        if (!UserInst || isDeleted(I: UserInst))
          continue;

        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(V: UserInst))
          continue;

        // Skip in-tree scalars that become vectors
        if (ArrayRef<TreeEntry *> UseEntries = getTreeEntries(V: U);
            any_of(Range&: UseEntries, P: [this](const TreeEntry *UseEntry) {
              return !DeletedNodes.contains(Ptr: UseEntry) &&
                     !TransformedToGatherNodes.contains(Val: UseEntry);
            })) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          // Pointer operands of vectorized loads/stores and call arguments
          // may still be needed in scalar form even though the user itself
          // is vectorized; everything else is removed with its user.
          if (!((Scalar->getType()->getScalarType()->isPointerTy() &&
                 isa<LoadInst, StoreInst>(Val: UserInst)) ||
                isa<CallInst>(Val: UserInst)) ||
              all_of(Range&: UseEntries, P: [&](TreeEntry *UseEntry) {
                if (DeletedNodes.contains(Ptr: UseEntry) ||
                    TransformedToGatherNodes.contains(Val: UseEntry))
                  return true;
                return UseEntry->State == TreeEntry::ScatterVectorize ||
                       !doesInTreeUserNeedToExtract(
                           Scalar, UserInst: getRootEntryInstruction(Entry: *UseEntry), TLI,
                           TTI);
              })) {
            LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                              << ".\n");
            assert(none_of(UseEntries,
                           [](TreeEntry *UseEntry) {
                             return UseEntry->isGather();
                           }) &&
                   "Bad state");
            continue;
          }
          // The in-tree user still needs the scalar: record the use with a
          // null user (meaning "any user") below.
          U = nullptr;
          if (It != ScalarToExtUses.end()) {
            ExternalUses[It->second].User = nullptr;
            break;
          }
        }

        // Give up tracking individual users once there are too many; a null
        // user covers all remaining uses.
        if (U && Scalar->hasNUsesOrMore(N: UsesLimit))
          U = nullptr;
        unsigned FoundLane = Entry->findLaneForValue(V: Scalar);
        LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
                          << " from lane " << FoundLane << " from " << *Scalar
                          << ".\n");
        It = ScalarToExtUses.try_emplace(Key: Scalar, Args: ExternalUses.size()).first;
        ExternalUses.emplace_back(Args&: Scalar, Args&: U, Args&: *Entry, Args&: FoundLane);
        ExternalUsesWithNonUsers.insert(Ptr: Scalar);
        // A null user already covers every use of this scalar - stop
        // scanning its use list.
        if (!U)
          break;
      }
    }
  }
}
9359
// Collects the user stores of the scalars in \p TE, grouped by
// {parent block, stored value type, underlying pointer object}. Each group is
// a candidate for forming a vector store (see canFormVector()).
SmallVector<SmallVector<StoreInst *>>
BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
  // Key: {parent block, stored value type, underlying pointer object}.
  SmallDenseMap<std::tuple<BasicBlock *, Type *, Value *>,
                SmallVector<StoreInst *>, 8>
      PtrToStoresMap;
  for (unsigned Lane : seq<unsigned>(Begin: 0, End: TE->Scalars.size())) {
    Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(Val: V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    if (V->hasNUsesOrMore(N: UsesLimit))
      break;

    // Collect stores per pointer object.
    for (User *U : V->users()) {
      auto *SI = dyn_cast<StoreInst>(Val: U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
          !isValidElementType(Ty: SI->getValueOperand()->getType()))
        continue;
      // Skip entry if already vectorized.
      if (isVectorized(V: U))
        continue;

      Value *Ptr =
          getUnderlyingObject(V: SI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
      auto &StoresVec = PtrToStoresMap[{SI->getParent(),
                                        SI->getValueOperand()->getType(), Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane
      if (StoresVec.size() > Lane)
        continue;
      if (!StoresVec.empty()) {
        // Only admit stores whose distance to the group's first store is
        // computable; this lets canFormVector() dereference the pointer
        // diff unconditionally later on.
        std::optional<int64_t> Diff = getPointersDiff(
            ElemTyA: SI->getValueOperand()->getType(), PtrA: SI->getPointerOperand(),
            ElemTyB: SI->getValueOperand()->getType(),
            PtrB: StoresVec.front()->getPointerOperand(), DL: *DL, SE&: *SE,
            /*StrictCheck=*/true);
        // We failed to compare the pointers so just abandon this store.
        if (!Diff)
          continue;
      }
      StoresVec.push_back(Elt: SI);
    }
  }
  // Move the collected groups out of the map into a plain vector.
  SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
  unsigned I = 0;
  for (auto &P : PtrToStoresMap) {
    Res[I].swap(RHS&: P.second);
    ++I;
  }
  return Res;
}
9415
9416bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
9417 OrdersType &ReorderIndices) const {
9418 // We check whether the stores in StoreVec can form a vector by sorting them
9419 // and checking whether they are consecutive.
9420
9421 // To avoid calling getPointersDiff() while sorting we create a vector of
9422 // pairs {store, offset from first} and sort this instead.
9423 SmallVector<std::pair<int64_t, unsigned>> StoreOffsetVec;
9424 StoreInst *S0 = StoresVec[0];
9425 StoreOffsetVec.emplace_back(Args: 0, Args: 0);
9426 Type *S0Ty = S0->getValueOperand()->getType();
9427 Value *S0Ptr = S0->getPointerOperand();
9428 for (unsigned Idx : seq<unsigned>(Begin: 1, End: StoresVec.size())) {
9429 StoreInst *SI = StoresVec[Idx];
9430 std::optional<int64_t> Diff =
9431 getPointersDiff(ElemTyA: S0Ty, PtrA: S0Ptr, ElemTyB: SI->getValueOperand()->getType(),
9432 PtrB: SI->getPointerOperand(), DL: *DL, SE&: *SE,
9433 /*StrictCheck=*/true);
9434 StoreOffsetVec.emplace_back(Args&: *Diff, Args&: Idx);
9435 }
9436
9437 // Check if the stores are consecutive by checking if their difference is 1.
9438 if (StoreOffsetVec.size() != StoresVec.size())
9439 return false;
9440 sort(C&: StoreOffsetVec, Comp: llvm::less_first());
9441 unsigned Idx = 0;
9442 int64_t PrevDist = 0;
9443 for (const auto &P : StoreOffsetVec) {
9444 if (Idx > 0 && P.first != PrevDist + 1)
9445 return false;
9446 PrevDist = P.first;
9447 ++Idx;
9448 }
9449
9450 // Calculate the shuffle indices according to their offset against the sorted
9451 // StoreOffsetVec.
9452 ReorderIndices.assign(NumElts: StoresVec.size(), Elt: 0);
9453 bool IsIdentity = true;
9454 for (auto [I, P] : enumerate(First&: StoreOffsetVec)) {
9455 ReorderIndices[P.second] = I;
9456 IsIdentity &= P.second == I;
9457 }
9458 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
9459 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
9460 // same convention here.
9461 if (IsIdentity)
9462 ReorderIndices.clear();
9463
9464 return true;
9465}
9466
9467#ifndef NDEBUG
9468LLVM_DUMP_METHOD static void dumpOrder(const BoUpSLP::OrdersType &Order) {
9469 for (unsigned Idx : Order)
9470 dbgs() << Idx << ", ";
9471 dbgs() << "\n";
9472}
9473#endif
9474
9475SmallVector<BoUpSLP::OrdersType, 1>
9476BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
9477 unsigned NumLanes = TE->Scalars.size();
9478
9479 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
9480
9481 // Holds the reorder indices for each candidate store vector that is a user of
9482 // the current TreeEntry.
9483 SmallVector<OrdersType, 1> ExternalReorderIndices;
9484
9485 // Now inspect the stores collected per pointer and look for vectorization
9486 // candidates. For each candidate calculate the reorder index vector and push
9487 // it into `ExternalReorderIndices`
9488 for (ArrayRef<StoreInst *> StoresVec : Stores) {
9489 // If we have fewer than NumLanes stores, then we can't form a vector.
9490 if (StoresVec.size() != NumLanes)
9491 continue;
9492
9493 // If the stores are not consecutive then abandon this StoresVec.
9494 OrdersType ReorderIndices;
9495 if (!canFormVector(StoresVec, ReorderIndices))
9496 continue;
9497
9498 // We now know that the scalars in StoresVec can form a vector instruction,
9499 // so set the reorder indices.
9500 ExternalReorderIndices.push_back(Elt: ReorderIndices);
9501 }
9502 return ExternalReorderIndices;
9503}
9504
void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        const SmallDenseSet<Value *> &UserIgnoreLst) {
  // Drop any state left over from a previously built tree first.
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  // Remember the values whose users must be ignored while building the tree
  // (e.g. reduction roots handled by the caller).
  UserIgnoreList = &UserIgnoreLst;
  // Roots of mixed types cannot form a single vector - bail out early.
  if (!allSameType(VL: Roots))
    return;
  buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
}
9515
// Overload without a user-ignore list: UserIgnoreList keeps whatever
// deleteTree() leaves it as.
void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
  // Drop any state left over from a previously built tree first.
  deleteTree();
  assert(TreeEntryToStridedPtrInfoMap.empty() &&
         "TreeEntryToStridedPtrInfoMap is not cleared");
  // Roots of mixed types cannot form a single vector - bail out early.
  if (!allSameType(VL: Roots))
    return;
  buildTreeRec(Roots, Depth: 0, EI: EdgeInfo());
}
9524
9525/// Tries to find subvector of loads and builds new vector of only loads if can
9526/// be profitable.
9527static void gatherPossiblyVectorizableLoads(
9528 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
9529 ScalarEvolution &SE, const TargetTransformInfo &TTI,
9530 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>> &GatheredLoads,
9531 bool AddNew = true) {
9532 if (VL.empty())
9533 return;
9534 Type *ScalarTy = getValueType(V: VL.front());
9535 if (!isValidElementType(Ty: ScalarTy))
9536 return;
9537 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>> ClusteredLoads;
9538 SmallVector<DenseMap<int64_t, LoadInst *>> ClusteredDistToLoad;
9539 for (Value *V : VL) {
9540 auto *LI = dyn_cast<LoadInst>(Val: V);
9541 if (!LI)
9542 continue;
9543 if (R.isDeleted(I: LI) || R.isVectorized(V: LI) || !LI->isSimple())
9544 continue;
9545 bool IsFound = false;
9546 for (auto [Map, Data] : zip(t&: ClusteredDistToLoad, u&: ClusteredLoads)) {
9547 assert(LI->getParent() == Data.front().first->getParent() &&
9548 LI->getType() == Data.front().first->getType() &&
9549 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
9550 getUnderlyingObject(Data.front().first->getPointerOperand(),
9551 RecursionMaxDepth) &&
9552 "Expected loads with the same type, same parent and same "
9553 "underlying pointer.");
9554 std::optional<int64_t> Dist = getPointersDiff(
9555 ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(), ElemTyB: Data.front().first->getType(),
9556 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9557 /*StrictCheck=*/true);
9558 if (!Dist)
9559 continue;
9560 auto It = Map.find(Val: *Dist);
9561 if (It != Map.end() && It->second != LI)
9562 continue;
9563 if (It == Map.end()) {
9564 Data.emplace_back(Args&: LI, Args&: *Dist);
9565 Map.try_emplace(Key: *Dist, Args&: LI);
9566 }
9567 IsFound = true;
9568 break;
9569 }
9570 if (!IsFound) {
9571 ClusteredLoads.emplace_back().emplace_back(Args&: LI, Args: 0);
9572 ClusteredDistToLoad.emplace_back().try_emplace(Key: 0, Args&: LI);
9573 }
9574 }
9575 auto FindMatchingLoads =
9576 [&](ArrayRef<std::pair<LoadInst *, int64_t>> Loads,
9577 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int64_t>>>
9578 &GatheredLoads,
9579 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
9580 int64_t &Offset, unsigned &Start) {
9581 if (Loads.empty())
9582 return GatheredLoads.end();
9583 LoadInst *LI = Loads.front().first;
9584 for (auto [Idx, Data] : enumerate(First&: GatheredLoads)) {
9585 if (Idx < Start)
9586 continue;
9587 ToAdd.clear();
9588 if (LI->getParent() != Data.front().first->getParent() ||
9589 LI->getType() != Data.front().first->getType())
9590 continue;
9591 std::optional<int64_t> Dist =
9592 getPointersDiff(ElemTyA: LI->getType(), PtrA: LI->getPointerOperand(),
9593 ElemTyB: Data.front().first->getType(),
9594 PtrB: Data.front().first->getPointerOperand(), DL, SE,
9595 /*StrictCheck=*/true);
9596 if (!Dist)
9597 continue;
9598 SmallSet<int64_t, 4> DataDists;
9599 SmallPtrSet<LoadInst *, 4> DataLoads;
9600 for (std::pair<LoadInst *, int64_t> P : Data) {
9601 DataDists.insert(V: P.second);
9602 DataLoads.insert(Ptr: P.first);
9603 }
9604 // Found matching gathered loads - check if all loads are unique or
9605 // can be effectively vectorized.
9606 unsigned NumUniques = 0;
9607 for (auto [Cnt, Pair] : enumerate(First&: Loads)) {
9608 bool Used = DataLoads.contains(Ptr: Pair.first);
9609 if (!Used && !DataDists.contains(V: *Dist + Pair.second)) {
9610 ++NumUniques;
9611 ToAdd.insert(X: Cnt);
9612 } else if (Used) {
9613 Repeated.insert(X: Cnt);
9614 }
9615 }
9616 if (NumUniques > 0 &&
9617 (Loads.size() == NumUniques ||
9618 (Loads.size() - NumUniques >= 2 &&
9619 Loads.size() - NumUniques >= Loads.size() / 2 &&
9620 (has_single_bit(Value: Data.size() + NumUniques) ||
9621 bit_ceil(Value: Data.size()) <
9622 bit_ceil(Value: Data.size() + NumUniques))))) {
9623 Offset = *Dist;
9624 Start = Idx + 1;
9625 return std::next(x: GatheredLoads.begin(), n: Idx);
9626 }
9627 }
9628 ToAdd.clear();
9629 return GatheredLoads.end();
9630 };
9631 for (ArrayRef<std::pair<LoadInst *, int64_t>> Data : ClusteredLoads) {
9632 unsigned Start = 0;
9633 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
9634 int64_t Offset = 0;
9635 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
9636 Offset, Start);
9637 while (It != GatheredLoads.end()) {
9638 assert(!LocalToAdd.empty() && "Expected some elements to add.");
9639 for (unsigned Idx : LocalToAdd)
9640 It->emplace_back(Args: Data[Idx].first, Args: Data[Idx].second + Offset);
9641 ToAdd.insert_range(R&: LocalToAdd);
9642 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
9643 Start);
9644 }
9645 if (any_of(Range: seq<unsigned>(Size: Data.size()), P: [&](unsigned Idx) {
9646 return !ToAdd.contains(key: Idx) && !Repeated.contains(key: Idx);
9647 })) {
9648 auto AddNewLoads =
9649 [&](SmallVectorImpl<std::pair<LoadInst *, int64_t>> &Loads) {
9650 for (unsigned Idx : seq<unsigned>(Size: Data.size())) {
9651 if (ToAdd.contains(key: Idx) || Repeated.contains(key: Idx))
9652 continue;
9653 Loads.push_back(Elt: Data[Idx]);
9654 }
9655 };
9656 if (!AddNew) {
9657 LoadInst *LI = Data.front().first;
9658 It = find_if(
9659 Range&: GatheredLoads, P: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9660 return PD.front().first->getParent() == LI->getParent() &&
9661 PD.front().first->getType() == LI->getType();
9662 });
9663 while (It != GatheredLoads.end()) {
9664 AddNewLoads(*It);
9665 It = std::find_if(
9666 first: std::next(x: It), last: GatheredLoads.end(),
9667 pred: [&](ArrayRef<std::pair<LoadInst *, int64_t>> PD) {
9668 return PD.front().first->getParent() == LI->getParent() &&
9669 PD.front().first->getType() == LI->getType();
9670 });
9671 }
9672 }
9673 GatheredLoads.emplace_back().append(in_start: Data.begin(), in_end: Data.end());
9674 AddNewLoads(GatheredLoads.emplace_back());
9675 }
9676 }
9677}
9678
9679void BoUpSLP::tryToVectorizeGatheredLoads(
9680 const SmallMapVector<
9681 std::tuple<BasicBlock *, Value *, Type *>,
9682 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
9683 &GatheredLoads) {
9684 GatheredLoadsEntriesFirst = VectorizableTree.size();
9685
9686 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
9687 LoadEntriesToVectorize.size());
9688 for (auto [Idx, Set] : zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize))
9689 Set.insert_range(R&: VectorizableTree[Idx]->Scalars);
9690
9691 // Sort loads by distance.
9692 auto LoadSorter = [](const std::pair<LoadInst *, int64_t> &L1,
9693 const std::pair<LoadInst *, int64_t> &L2) {
9694 return L1.second > L2.second;
9695 };
9696
9697 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
9698 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
9699 Loads.size());
9700 Align Alignment = computeCommonAlignment<LoadInst>(VL: Values);
9701 auto *Ty = getWidenedType(ScalarTy: Loads.front()->getType(), VF: Loads.size());
9702 return TTI->isLegalMaskedGather(DataType: Ty, Alignment) &&
9703 !TTI->forceScalarizeMaskedGather(Type: Ty, Alignment);
9704 };
9705
9706 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
9707 BoUpSLP::ValueSet &VectorizedLoads,
9708 SmallVectorImpl<LoadInst *> &NonVectorized,
9709 bool Final, unsigned MaxVF) {
9710 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results;
9711 unsigned StartIdx = 0;
9712 SmallVector<int> CandidateVFs;
9713 if (VectorizeNonPowerOf2 && has_single_bit(Value: MaxVF + 1))
9714 CandidateVFs.push_back(Elt: MaxVF);
9715 for (int NumElts = getFloorFullVectorNumberOfElements(
9716 TTI: *TTI, Ty: Loads.front()->getType(), Sz: MaxVF);
9717 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
9718 TTI: *TTI, Ty: Loads.front()->getType(), Sz: NumElts - 1)) {
9719 CandidateVFs.push_back(Elt: NumElts);
9720 if (VectorizeNonPowerOf2 && NumElts > 2)
9721 CandidateVFs.push_back(Elt: NumElts - 1);
9722 }
9723
9724 if (Final && CandidateVFs.empty())
9725 return Results;
9726
9727 unsigned BestVF = Final ? CandidateVFs.back() : 0;
9728 for (unsigned NumElts : CandidateVFs) {
9729 if (Final && NumElts > BestVF)
9730 continue;
9731 SmallVector<unsigned> MaskedGatherVectorized;
9732 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
9733 ++Cnt) {
9734 ArrayRef<LoadInst *> Slice =
9735 ArrayRef(Loads).slice(N: Cnt, M: std::min(a: NumElts, b: E - Cnt));
9736 if (VectorizedLoads.count(Ptr: Slice.front()) ||
9737 VectorizedLoads.count(Ptr: Slice.back()) ||
9738 areKnownNonVectorizableLoads(VL: Slice))
9739 continue;
9740 // Check if it is profitable to try vectorizing gathered loads. It is
9741 // profitable if we have more than 3 consecutive loads or if we have
9742 // less but all users are vectorized or deleted.
9743 bool AllowToVectorize = false;
9744 // Check if it is profitable to vectorize 2-elements loads.
9745 if (NumElts == 2) {
9746 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
9747 ElementTy: Slice.front()->getType(), NumElements: ElementCount::getFixed(MinVal: NumElts));
9748 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
9749 for (LoadInst *LI : Slice) {
9750 // If single use/user - allow to vectorize.
9751 if (LI->hasOneUse())
9752 continue;
9753 // 1. Check if number of uses equals number of users.
9754 // 2. All users are deleted.
9755 // 3. The load broadcasts are not allowed or the load is not
9756 // broadcasted.
9757 if (static_cast<unsigned int>(std::distance(
9758 first: LI->user_begin(), last: LI->user_end())) != LI->getNumUses())
9759 return false;
9760 if (!IsLegalBroadcastLoad)
9761 continue;
9762 if (LI->hasNUsesOrMore(N: UsesLimit))
9763 return false;
9764 for (User *U : LI->users()) {
9765 if (auto *UI = dyn_cast<Instruction>(Val: U); UI && isDeleted(I: UI))
9766 continue;
9767 for (const TreeEntry *UTE : getTreeEntries(V: U)) {
9768 for (int I : seq<int>(Size: UTE->getNumOperands())) {
9769 if (all_of(Range: UTE->getOperand(OpIdx: I), P: [LI](Value *V) {
9770 return V == LI || isa<PoisonValue>(Val: V);
9771 }))
9772 // Found legal broadcast - do not vectorize.
9773 return false;
9774 }
9775 }
9776 }
9777 }
9778 return true;
9779 };
9780 AllowToVectorize = CheckIfAllowed(Slice);
9781 } else {
9782 AllowToVectorize =
9783 (NumElts >= 3 ||
9784 any_of(Range&: ValueToGatherNodes.at(Val: Slice.front()),
9785 P: [=](const TreeEntry *TE) {
9786 return TE->Scalars.size() == 2 &&
9787 ((TE->Scalars.front() == Slice.front() &&
9788 TE->Scalars.back() == Slice.back()) ||
9789 (TE->Scalars.front() == Slice.back() &&
9790 TE->Scalars.back() == Slice.front()));
9791 })) &&
9792 hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(),
9793 Sz: Slice.size());
9794 }
9795 if (AllowToVectorize) {
9796 SmallVector<Value *> PointerOps;
9797 OrdersType CurrentOrder;
9798 // Try to build vector load.
9799 ArrayRef<Value *> Values(
9800 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9801 StridedPtrInfo SPtrInfo;
9802 LoadsState LS = canVectorizeLoads(VL: Values, VL0: Slice.front(), Order&: CurrentOrder,
9803 PointerOps, SPtrInfo, BestVF: &BestVF);
9804 if (LS != LoadsState::Gather ||
9805 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
9806 if (LS == LoadsState::ScatterVectorize) {
9807 if (MaskedGatherVectorized.empty() ||
9808 Cnt >= MaskedGatherVectorized.back() + NumElts)
9809 MaskedGatherVectorized.push_back(Elt: Cnt);
9810 continue;
9811 }
9812 if (LS != LoadsState::Gather) {
9813 Results.emplace_back(Args&: Values, Args&: LS);
9814 VectorizedLoads.insert_range(R&: Slice);
9815 // If we vectorized initial block, no need to try to vectorize it
9816 // again.
9817 if (Cnt == StartIdx)
9818 StartIdx += NumElts;
9819 }
9820 // Check if the whole array was vectorized already - exit.
9821 if (StartIdx >= Loads.size())
9822 break;
9823 // Erase last masked gather candidate, if another candidate within
9824 // the range is found to be better.
9825 if (!MaskedGatherVectorized.empty() &&
9826 Cnt < MaskedGatherVectorized.back() + NumElts)
9827 MaskedGatherVectorized.pop_back();
9828 Cnt += NumElts - 1;
9829 continue;
9830 }
9831 }
9832 if (!AllowToVectorize || BestVF == 0)
9833 registerNonVectorizableLoads(VL: Slice);
9834 }
9835 // Mark masked gathers candidates as vectorized, if any.
9836 for (unsigned Cnt : MaskedGatherVectorized) {
9837 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
9838 N: Cnt, M: std::min<unsigned>(a: NumElts, b: Loads.size() - Cnt));
9839 ArrayRef<Value *> Values(
9840 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
9841 Results.emplace_back(Args&: Values, Args: LoadsState::ScatterVectorize);
9842 VectorizedLoads.insert_range(R&: Slice);
9843 // If we vectorized initial block, no need to try to vectorize it again.
9844 if (Cnt == StartIdx)
9845 StartIdx += NumElts;
9846 }
9847 }
9848 for (LoadInst *LI : Loads) {
9849 if (!VectorizedLoads.contains(Ptr: LI))
9850 NonVectorized.push_back(Elt: LI);
9851 }
9852 return Results;
9853 };
9854 auto ProcessGatheredLoads =
9855 [&, &TTI = *TTI](
9856 ArrayRef<SmallVector<std::pair<LoadInst *, int64_t>>> GatheredLoads,
9857 bool Final = false) {
9858 SmallVector<LoadInst *> NonVectorized;
9859 for (ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists :
9860 GatheredLoads) {
9861 if (LoadsDists.size() <= 1) {
9862 NonVectorized.push_back(Elt: LoadsDists.back().first);
9863 continue;
9864 }
9865 SmallVector<std::pair<LoadInst *, int64_t>> LocalLoadsDists(
9866 LoadsDists);
9867 SmallVector<LoadInst *> OriginalLoads(make_first_range(c&: LoadsDists));
9868 stable_sort(Range&: LocalLoadsDists, C: LoadSorter);
9869 SmallVector<LoadInst *> Loads;
9870 unsigned MaxConsecutiveDistance = 0;
9871 unsigned CurrentConsecutiveDist = 1;
9872 int64_t LastDist = LocalLoadsDists.front().second;
9873 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
9874 for (const std::pair<LoadInst *, int64_t> &L : LocalLoadsDists) {
9875 if (isVectorized(V: L.first))
9876 continue;
9877 assert(LastDist >= L.second &&
9878 "Expected first distance always not less than second");
9879 if (static_cast<uint64_t>(LastDist - L.second) ==
9880 CurrentConsecutiveDist) {
9881 ++CurrentConsecutiveDist;
9882 MaxConsecutiveDistance =
9883 std::max(a: MaxConsecutiveDistance, b: CurrentConsecutiveDist);
9884 Loads.push_back(Elt: L.first);
9885 continue;
9886 }
9887 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
9888 !Loads.empty())
9889 Loads.pop_back();
9890 CurrentConsecutiveDist = 1;
9891 LastDist = L.second;
9892 Loads.push_back(Elt: L.first);
9893 }
9894 if (Loads.size() <= 1)
9895 continue;
9896 if (AllowMaskedGather)
9897 MaxConsecutiveDistance = Loads.size();
9898 else if (MaxConsecutiveDistance < 2)
9899 continue;
9900 BoUpSLP::ValueSet VectorizedLoads;
9901 SmallVector<LoadInst *> SortedNonVectorized;
9902 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>> Results =
9903 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
9904 Final, MaxConsecutiveDistance);
9905 if (!Results.empty() && !SortedNonVectorized.empty() &&
9906 OriginalLoads.size() == Loads.size() &&
9907 MaxConsecutiveDistance == Loads.size() &&
9908 all_of(Range&: Results,
9909 P: [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
9910 return P.second == LoadsState::ScatterVectorize;
9911 })) {
9912 VectorizedLoads.clear();
9913 SmallVector<LoadInst *> UnsortedNonVectorized;
9914 SmallVector<std::pair<ArrayRef<Value *>, LoadsState>>
9915 UnsortedResults =
9916 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
9917 UnsortedNonVectorized, Final,
9918 OriginalLoads.size());
9919 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
9920 SortedNonVectorized.swap(RHS&: UnsortedNonVectorized);
9921 Results.swap(RHS&: UnsortedResults);
9922 }
9923 }
9924 for (auto [Slice, _] : Results) {
9925 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
9926 << Slice.size() << ")\n");
9927 if (any_of(Range&: Slice, P: [&](Value *V) { return isVectorized(V); })) {
9928 for (Value *L : Slice)
9929 if (!isVectorized(V: L))
9930 SortedNonVectorized.push_back(Elt: cast<LoadInst>(Val: L));
9931 continue;
9932 }
9933
9934 // Select maximum VF as a maximum of user gathered nodes and
9935 // distance between scalar loads in these nodes.
9936 unsigned MaxVF = Slice.size();
9937 unsigned UserMaxVF = 0;
9938 unsigned InterleaveFactor = 0;
9939 if (MaxVF == 2) {
9940 UserMaxVF = MaxVF;
9941 } else {
9942 // Found distance between segments of the interleaved loads.
9943 std::optional<unsigned> InterleavedLoadsDistance = 0;
9944 unsigned Order = 0;
9945 std::optional<unsigned> CommonVF = 0;
9946 DenseMap<const TreeEntry *, unsigned> EntryToPosition;
9947 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
9948 for (auto [Idx, V] : enumerate(First&: Slice)) {
9949 for (const TreeEntry *E : ValueToGatherNodes.at(Val: V)) {
9950 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: E->Scalars.size());
9951 unsigned Pos =
9952 EntryToPosition.try_emplace(Key: E, Args&: Idx).first->second;
9953 UserMaxVF = std::max<unsigned>(a: UserMaxVF, b: Idx - Pos + 1);
9954 if (CommonVF) {
9955 if (*CommonVF == 0) {
9956 CommonVF = E->Scalars.size();
9957 continue;
9958 }
9959 if (*CommonVF != E->Scalars.size())
9960 CommonVF.reset();
9961 }
9962 // Check if the load is the part of the interleaved load.
9963 if (Pos != Idx && InterleavedLoadsDistance) {
9964 if (!DeinterleavedNodes.contains(Ptr: E) &&
9965 any_of(Range: E->Scalars, P: [&, Slice = Slice](Value *V) {
9966 if (isa<Constant>(Val: V))
9967 return false;
9968 if (isVectorized(V))
9969 return true;
9970 const auto &Nodes = ValueToGatherNodes.at(Val: V);
9971 return (Nodes.size() != 1 || !Nodes.contains(key: E)) &&
9972 !is_contained(Range: Slice, Element: V);
9973 })) {
9974 InterleavedLoadsDistance.reset();
9975 continue;
9976 }
9977 DeinterleavedNodes.insert(Ptr: E);
9978 if (*InterleavedLoadsDistance == 0) {
9979 InterleavedLoadsDistance = Idx - Pos;
9980 continue;
9981 }
9982 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
9983 (Idx - Pos) / *InterleavedLoadsDistance < Order)
9984 InterleavedLoadsDistance.reset();
9985 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(u: 1);
9986 }
9987 }
9988 }
9989 DeinterleavedNodes.clear();
9990 // Check if the large load represents interleaved load operation.
9991 if (InterleavedLoadsDistance.value_or(u: 0) > 1 &&
9992 CommonVF.value_or(u: 0) != 0) {
9993 InterleaveFactor = bit_ceil(Value: *InterleavedLoadsDistance);
9994 unsigned VF = *CommonVF;
9995 OrdersType Order;
9996 SmallVector<Value *> PointerOps;
9997 StridedPtrInfo SPtrInfo;
9998 // Segmented load detected - vectorize at maximum vector factor.
9999 if (InterleaveFactor <= Slice.size() &&
10000 TTI.isLegalInterleavedAccessType(
10001 VTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF),
10002 Factor: InterleaveFactor,
10003 Alignment: cast<LoadInst>(Val: Slice.front())->getAlign(),
10004 AddrSpace: cast<LoadInst>(Val: Slice.front())
10005 ->getPointerAddressSpace()) &&
10006 canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order, PointerOps,
10007 SPtrInfo) == LoadsState::Vectorize) {
10008 UserMaxVF = InterleaveFactor * VF;
10009 } else {
10010 InterleaveFactor = 0;
10011 }
10012 }
10013 // Cannot represent the loads as consecutive vectorizable nodes -
10014 // just exit.
10015 unsigned ConsecutiveNodesSize = 0;
10016 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
10017 any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
10018 P: [&, Slice = Slice](const auto &P) {
10019 const auto *It = find_if(Slice, [&](Value *V) {
10020 return std::get<1>(P).contains(V);
10021 });
10022 if (It == Slice.end())
10023 return false;
10024 const TreeEntry &TE =
10025 *VectorizableTree[std::get<0>(P)];
10026 ArrayRef<Value *> VL = TE.Scalars;
10027 OrdersType Order;
10028 SmallVector<Value *> PointerOps;
10029 StridedPtrInfo SPtrInfo;
10030 LoadsState State = canVectorizeLoads(
10031 VL, VL0: VL.front(), Order, PointerOps, SPtrInfo);
10032 if (State == LoadsState::ScatterVectorize ||
10033 State == LoadsState::CompressVectorize)
10034 return false;
10035 ConsecutiveNodesSize += VL.size();
10036 size_t Start = std::distance(Slice.begin(), It);
10037 size_t Sz = Slice.size() - Start;
10038 return Sz < VL.size() ||
10039 Slice.slice(N: Start, M: VL.size()) != VL;
10040 }))
10041 continue;
10042 // Try to build long masked gather loads.
10043 UserMaxVF = bit_ceil(Value: UserMaxVF);
10044 if (InterleaveFactor == 0 &&
10045 any_of(Range: seq<unsigned>(Size: Slice.size() / UserMaxVF),
10046 P: [&, Slice = Slice](unsigned Idx) {
10047 OrdersType Order;
10048 SmallVector<Value *> PointerOps;
10049 StridedPtrInfo SPtrInfo;
10050 return canVectorizeLoads(
10051 VL: Slice.slice(N: Idx * UserMaxVF, M: UserMaxVF),
10052 VL0: Slice[Idx * UserMaxVF], Order, PointerOps,
10053 SPtrInfo) == LoadsState::ScatterVectorize;
10054 }))
10055 UserMaxVF = MaxVF;
10056 if (Slice.size() != ConsecutiveNodesSize)
10057 MaxVF = std::min<unsigned>(a: MaxVF, b: UserMaxVF);
10058 }
10059 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
10060 bool IsVectorized = true;
10061 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
10062 ArrayRef<Value *> SubSlice =
10063 Slice.slice(N: I, M: std::min(a: VF, b: E - I));
10064 if (isVectorized(V: SubSlice.front()))
10065 continue;
10066 // Check if the subslice is to be-vectorized entry, which is not
10067 // equal to entry.
10068 if (any_of(Range: zip(t&: LoadEntriesToVectorize, u&: LoadSetsToVectorize),
10069 P: [&](const auto &P) {
10070 return !SubSlice.equals(
10071 RHS: VectorizableTree[std::get<0>(P)]
10072 ->Scalars) &&
10073 set_is_subset(SubSlice, std::get<1>(P));
10074 }))
10075 continue;
10076 unsigned Sz = VectorizableTree.size();
10077 buildTreeRec(Roots: SubSlice, Depth: 0, EI: EdgeInfo(), InterleaveFactor);
10078 if (Sz == VectorizableTree.size()) {
10079 IsVectorized = false;
10080 // Try non-interleaved vectorization with smaller vector
10081 // factor.
10082 if (InterleaveFactor > 0) {
10083 VF = 2 * (MaxVF / InterleaveFactor);
10084 InterleaveFactor = 0;
10085 }
10086 continue;
10087 }
10088 }
10089 if (IsVectorized)
10090 break;
10091 }
10092 }
10093 NonVectorized.append(RHS: SortedNonVectorized);
10094 }
10095 return NonVectorized;
10096 };
10097 for (const auto &GLs : GatheredLoads) {
10098 const auto &Ref = GLs.second;
10099 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
10100 if (!Ref.empty() && !NonVectorized.empty() &&
10101 std::accumulate(
10102 first: Ref.begin(), last: Ref.end(), init: 0u,
10103 binary_op: [](unsigned S, ArrayRef<std::pair<LoadInst *, int64_t>> LoadsDists)
10104 -> unsigned { return S + LoadsDists.size(); }) !=
10105 NonVectorized.size() &&
10106 IsMaskedGatherSupported(NonVectorized)) {
10107 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>
10108 FinalGatheredLoads;
10109 for (LoadInst *LI : NonVectorized) {
10110 // Reinsert non-vectorized loads to other list of loads with the same
10111 // base pointers.
10112 gatherPossiblyVectorizableLoads(R: *this, VL: LI, DL: *DL, SE&: *SE, TTI: *TTI,
10113 GatheredLoads&: FinalGatheredLoads,
10114 /*AddNew=*/false);
10115 }
10116 // Final attempt to vectorize non-vectorized loads.
10117 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
10118 }
10119 }
10120 // Try to vectorize postponed load entries, previously marked as gathered.
10121 for (unsigned Idx : LoadEntriesToVectorize) {
10122 const TreeEntry &E = *VectorizableTree[Idx];
10123 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
10124 // Avoid reordering, if possible.
10125 if (!E.ReorderIndices.empty()) {
10126 // Build a mask out of the reorder indices and reorder scalars per this
10127 // mask.
10128 SmallVector<int> ReorderMask;
10129 inversePermutation(Indices: E.ReorderIndices, Mask&: ReorderMask);
10130 reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
10131 }
10132 buildTreeRec(Roots: GatheredScalars, Depth: 0, EI: EdgeInfo());
10133 }
10134 // If no new entries created, consider it as no gathered loads entries must be
10135 // handled.
10136 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
10137 VectorizableTree.size())
10138 GatheredLoadsEntriesFirst.reset();
10139}
10140
10141/// Generates key/subkey pair for the given value to provide effective sorting
10142/// of the values and better detection of the vectorizable values sequences. The
10143/// keys/subkeys can be used for better sorting of the values themselves (keys)
10144/// and in values subgroups (subkeys).
static std::pair<size_t, size_t> generateKeySubkey(
    Value *V, const TargetLibraryInfo *TLI,
    function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
    bool AllowAlternate) {
  // Base key derived from the value kind; refined per instruction class below.
  hash_code Key = hash_value(value: V->getValueID() + 2);
  hash_code SubKey = hash_value(value: 0);
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(Val: V)) {
    Key = hash_combine(args: LI->getType(), args: hash_value(value: Instruction::Load), args: Key);
    if (LI->isSimple())
      SubKey = hash_value(code: LoadsSubkeyGenerator(Key, LI));
    else
      // Non-simple (volatile/atomic) loads get a per-instruction key so they
      // are never grouped with other loads.
      Key = SubKey = hash_value(ptr: LI);
  } else if (isVectorLikeInstWithConstOps(V)) {
    // Sort extracts by the vector operands.
    if (isa<ExtractElementInst, UndefValue>(Val: V))
      Key = hash_value(value: Value::UndefValueVal + 1);
    if (auto *EI = dyn_cast<ExtractElementInst>(Val: V)) {
      if (!isUndefVector(V: EI->getVectorOperand()).all() &&
          !isa<UndefValue>(Val: EI->getIndexOperand()))
        SubKey = hash_value(ptr: EI->getVectorOperand());
    }
  } else if (auto *I = dyn_cast<Instruction>(Val: V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(Val: I)) &&
        isValidForAlternation(Opcode: I->getOpcode())) {
      if (AllowAlternate)
        // Collapse the key so all binary ops (resp. all casts) can be grouped
        // into alternate-opcode sequences.
        Key = hash_value(value: isa<BinaryOperator>(Val: I) ? 1 : 0);
      else
        Key = hash_combine(args: hash_value(value: I->getOpcode()), args: Key);
      SubKey = hash_combine(
          args: hash_value(value: I->getOpcode()), args: hash_value(ptr: I->getType()),
          args: hash_value(ptr: isa<BinaryOperator>(Val: I)
                          ? I->getType()
                          : cast<CastInst>(Val: I)->getOperand(i_nocapture: 0)->getType()));
      // For casts, look through the only operand to improve compile time.
      if (isa<CastInst>(Val: I)) {
        std::pair<size_t, size_t> OpVals =
            generateKeySubkey(V: I->getOperand(i: 0), TLI, LoadsSubkeyGenerator,
                              /*AllowAlternate=*/true);
        Key = hash_combine(args: OpVals.first, args: Key);
        SubKey = hash_combine(args: OpVals.first, args: SubKey);
      }
    } else if (auto *CI = dyn_cast<CmpInst>(Val: I)) {
      // Canonicalize the predicate (commutative cmps fold with their inverse)
      // and include the swapped form so swapped-operand cmps hash together.
      CmpInst::Predicate Pred = CI->getPredicate();
      if (CI->isCommutative())
        Pred = std::min(a: Pred, b: CmpInst::getInversePredicate(pred: Pred));
      CmpInst::Predicate SwapPred = CmpInst::getSwappedPredicate(pred: Pred);
      SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: Pred),
                            args: hash_value(value: SwapPred),
                            args: hash_value(ptr: CI->getOperand(i_nocapture: 0)->getType()));
    } else if (auto *Call = dyn_cast<CallInst>(Val: I)) {
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: Call, TLI);
      if (isTriviallyVectorizable(ID)) {
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(value: ID));
      } else if (!VFDatabase(*Call).getMappings(CI: *Call).empty()) {
        // Calls with a vector-library mapping are grouped by callee.
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()),
                              args: hash_value(ptr: Call->getCalledFunction()));
      } else {
        // No vector form known: keep the call unique.
        Key = hash_combine(args: hash_value(ptr: Call), args: Key);
        SubKey = hash_combine(args: hash_value(value: I->getOpcode()), args: hash_value(ptr: Call));
      }
      for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
        SubKey = hash_combine(args: hash_value(value: Op.Begin), args: hash_value(value: Op.End),
                              args: hash_value(ptr: Op.Tag), args: SubKey);
    } else if (auto *Gep = dyn_cast<GetElementPtrInst>(Val: I)) {
      // Simple GEPs with a constant index are grouped by base pointer.
      if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Val: Gep->getOperand(i_nocapture: 1)))
        SubKey = hash_value(ptr: Gep->getPointerOperand());
      else
        SubKey = hash_value(ptr: Gep);
    } else if (BinaryOperator::isIntDivRem(Opcode: I->getOpcode()) &&
               !isa<ConstantInt>(Val: I->getOperand(i: 1))) {
      // Do not try to vectorize instructions with potentially high cost.
      SubKey = hash_value(ptr: I);
    } else {
      SubKey = hash_value(value: I->getOpcode());
    }
    // Instructions from different blocks never share a key.
    Key = hash_combine(args: hash_value(ptr: I->getParent()), args: Key);
  }
  return std::make_pair(x&: Key, y&: SubKey);
}
10227
/// Checks if the specified instruction \p I is a main operation for the given
/// \p MainOp and \p AltOp instructions.
10230static bool isMainInstruction(Instruction *I, Instruction *MainOp,
10231 Instruction *AltOp, const TargetLibraryInfo &TLI);
10232
10233/// Builds the arguments types vector for the given call instruction with the
10234/// given \p ID for the specified vector factor.
10235static SmallVector<Type *>
10236buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
10237 const unsigned VF, unsigned MinBW,
10238 const TargetTransformInfo *TTI) {
10239 SmallVector<Type *> ArgTys;
10240 for (auto [Idx, Arg] : enumerate(First: CI->args())) {
10241 if (ID != Intrinsic::not_intrinsic) {
10242 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI)) {
10243 ArgTys.push_back(Elt: Arg->getType());
10244 continue;
10245 }
10246 if (MinBW > 0) {
10247 ArgTys.push_back(
10248 Elt: getWidenedType(ScalarTy: IntegerType::get(C&: CI->getContext(), NumBits: MinBW), VF));
10249 continue;
10250 }
10251 }
10252 ArgTys.push_back(Elt: getWidenedType(ScalarTy: Arg->getType(), VF));
10253 }
10254 return ArgTys;
10255}
10256
10257/// Calculates the costs of vectorized intrinsic (if possible) and vectorized
10258/// function (if possible) calls. Returns invalid cost for the corresponding
10259/// calls, if they cannot be vectorized/will be scalarized.
10260static std::pair<InstructionCost, InstructionCost>
10261getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
10262 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10263 ArrayRef<Type *> ArgTys) {
10264 auto Shape = VFShape::get(FTy: CI->getFunctionType(),
10265 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
10266 HasGlobalPred: false /*HasGlobalPred*/);
10267 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
10268 auto LibCost = InstructionCost::getInvalid();
10269 if (!CI->isNoBuiltin() && VecFunc) {
10270 // Calculate the cost of the vector library call.
10271 // If the corresponding vector call is cheaper, return its cost.
10272 LibCost =
10273 TTI->getCallInstrCost(F: nullptr, RetTy: VecTy, Tys: ArgTys, CostKind: TTI::TCK_RecipThroughput);
10274 }
10275 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
10276
10277 // Calculate the cost of the vector intrinsic call.
10278 FastMathFlags FMF;
10279 if (auto *FPCI = dyn_cast<FPMathOperator>(Val: CI))
10280 FMF = FPCI->getFastMathFlags();
10281 const InstructionCost ScalarLimit = 10000;
10282 IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF, nullptr,
10283 LibCost.isValid() ? LibCost : ScalarLimit);
10284 auto IntrinsicCost =
10285 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind: TTI::TCK_RecipThroughput);
10286 if ((LibCost.isValid() && IntrinsicCost > LibCost) ||
10287 (!LibCost.isValid() && IntrinsicCost > ScalarLimit))
10288 IntrinsicCost = InstructionCost::getInvalid();
10289
10290 return {IntrinsicCost, LibCost};
10291}
10292
/// Classifies the bundle \p VL (whose common/alternate opcode info is \p S)
/// into the TreeEntry state used when building the SLP graph: a concrete
/// vectorization strategy, or NeedToGather when the bundle cannot be
/// vectorized as-is. May also populate \p CurrentOrder, \p PointerOps and
/// \p SPtrInfo for memory operations, and record delayed load entries.
BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
    const InstructionsState &S, ArrayRef<Value *> VL,
    bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
    SmallVectorImpl<Value *> &PointerOps, StridedPtrInfo &SPtrInfo) {
  assert(S.getMainOp() &&
         "Expected instructions with same/alternate opcodes only.");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  Instruction *VL0 = S.getMainOp();
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
    if (VL0->getNumOperands() > MaxPHINumOperands)
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
      auto *PHI = dyn_cast<PHINode>(Val: V);
      if (!PHI)
        continue;
      for (Value *Incoming : PHI->incoming_values()) {
        Instruction *Term = dyn_cast<Instruction>(Val: Incoming);
        if (Term && Term->isTerminator()) {
          LLVM_DEBUG(dbgs()
                     << "SLP: Need to swizzle PHINodes (terminator use).\n");
          return TreeEntry::NeedToGather;
        }
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::ExtractElement:
    // Gather if any source vector is already vectorized (or the element is
    // not an extractelement at all).
    if (any_of(Range&: VL, P: [&](Value *V) {
          auto *EI = dyn_cast<ExtractElementInst>(Val: V);
          if (!EI)
            return true;
          return isVectorized(V: EI->getOperand(i_nocapture: 0));
        }))
      return TreeEntry::NeedToGather;
    [[fallthrough]];
  case Instruction::ExtractValue: {
    bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
    // non-full registers).
    if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: VL0->getType(), Sz: VL.size()))
      return TreeEntry::NeedToGather;
    if (Reuse || !CurrentOrder.empty())
      return TreeEntry::Vectorize;
    LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::InsertElement: {
    // Check that we have a buildvector and not a shuffle of 2 or more
    // different vectors.
    ValueSet SourceVectors;
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement/poison vector.\n");
        return TreeEntry::NeedToGather;
      }
      SourceVectors.insert(Ptr: cast<Instruction>(Val: V)->getOperand(i: 0));
      assert(getElementIndex(V) != std::nullopt &&
             "Non-constant or undef index?");
    }

    if (count_if(Range&: VL, P: [&SourceVectors](Value *V) {
          return !SourceVectors.contains(Ptr: V);
        }) >= 2) {
      // Found 2nd source vector - cancel.
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "different source vectors.\n");
      return TreeEntry::NeedToGather;
    }

    if (any_of(Range&: VL, P: [&SourceVectors](Value *V) {
          // The last InsertElement can have multiple uses.
          return SourceVectors.contains(Ptr: V) && !V->hasOneUse();
        })) {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
                           "multiple uses.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    // True when every load in the bundle already belongs to one of the
    // "gathered loads" tree entries created earlier.
    auto IsGatheredNode = [&]() {
      if (!GatheredLoadsEntriesFirst)
        return false;
      return all_of(Range&: VL, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        return any_of(Range: getTreeEntries(V), P: [&](const TreeEntry *TE) {
          return TE->Idx >= *GatheredLoadsEntriesFirst;
        });
      });
    };
    switch (canVectorizeLoads(VL, VL0, Order&: CurrentOrder, PointerOps, SPtrInfo)) {
    case LoadsState::Vectorize:
      return TreeEntry::Vectorize;
    case LoadsState::CompressVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::CompressVectorize;
    case LoadsState::ScatterVectorize:
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::ScatterVectorize;
    case LoadsState::StridedVectorize:
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(X: VectorizableTree.size());
        return TreeEntry::NeedToGather;
      }
      return IsGatheredNode() ? TreeEntry::NeedToGather
                              : TreeEntry::StridedVectorize;
    case LoadsState::Gather:
#ifndef NDEBUG
      Type *ScalarTy = VL0->getType();
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy))
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
      else if (any_of(VL, [](Value *V) {
                 auto *LI = dyn_cast<LoadInst>(V);
                 return !LI || !LI->isSimple();
               }))
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
      else
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
#endif // NDEBUG
      registerNonVectorizableLoads(VL);
      return TreeEntry::NeedToGather;
    }
    llvm_unreachable("Unexpected state of loads");
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // All casts in the bundle must share the same (valid) source type.
    Type *SrcTy = VL0->getOperand(i: 0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      Type *Ty = cast<Instruction>(Val: V)->getOperand(i: 0)->getType();
      if (Ty != SrcTy || !isValidElementType(Ty)) {
        LLVM_DEBUG(
            dbgs() << "SLP: Gathering casts with different src types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    // The swapped predicate is also acceptable (operands can be reordered).
    CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
    CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(pred: P0);
    Type *ComparedTy = VL0->getOperand(i: 0)->getType();
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      auto *Cmp = cast<CmpInst>(Val: V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(i_nocapture: 0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::Select:
    if (SLPReVec) {
      // Under REVEC, all selects must agree on the condition type.
      SmallPtrSet<Type *, 4> CondTypes;
      for (Value *V : VL) {
        Value *Cond;
        if (!match(V, P: m_Select(C: m_Value(V&: Cond), L: m_Value(), R: m_Value())) &&
            !match(V, P: m_ZExt(Op: m_Value(V&: Cond))))
          continue;
        CondTypes.insert(Ptr: Cond->getType());
      }
      if (CondTypes.size() > 1) {
        LLVM_DEBUG(
            dbgs()
            << "SLP: Gathering select with different condition types.\n");
        return TreeEntry::NeedToGather;
      }
    }
    [[fallthrough]];
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
    // On targets where FP vectorization may change semantics, require fast
    // flags on all FP binary ops in the bundle.
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
          auto *I = dyn_cast<Instruction>(Val: V);
          return I && I->isBinaryOp() && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    return TreeEntry::Vectorize;
  case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(Val: V);
      if (!I)
        continue;
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We can't combine several GEPs into one vector if they operate on
    // different types.
    Type *Ty0 = cast<GEPOperator>(Val: VL0)->getSourceElementType();
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GEPOperator>(Val: V);
      if (!GEP)
        continue;
      Type *CurTy = GEP->getSourceElementType();
      if (Ty0 != CurTy) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
        return TreeEntry::NeedToGather;
      }
    }

    // We don't combine GEPs with non-constant indexes.
    Type *Ty1 = VL0->getOperand(i: 1)->getType();
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(Val: V);
      if (!I)
        continue;
      auto *Op = I->getOperand(i_nocapture: 1);
      if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
          (Op->getType() != Ty1 &&
           ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Val: Op)) ||
            Op->getType()->getScalarSizeInBits() >
                DL->getIndexSizeInBits(
                    AS: V->getType()->getPointerAddressSpace())))) {
        LLVM_DEBUG(
            dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
        return TreeEntry::NeedToGather;
      }
    }

    return TreeEntry::Vectorize;
  }
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(Val: VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(Ty: ScalarTy) !=
        DL->getTypeAllocSizeInBits(Ty: ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(Val: V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(Elt: SI->getPointerOperand());
    }

    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(VL: PointerOps, ElemTy: ScalarTy, DL: *DL, SE&: *SE, SortedIndices&: CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int64_t> Dist =
          getPointersDiff(ElemTyA: ScalarTy, PtrA: Ptr0, ElemTyB: ScalarTy, PtrB: PtrN, DL: *DL, SE&: *SE);
      // Check that the sorted pointer operands are consecutive.
      // NOTE(review): Dist is dereferenced without a has_value() check;
      // sortPtrAccesses succeeding presumably guarantees a computable
      // pointer difference here -- confirm.
      if (static_cast<uint64_t>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(Range&: VL, P: [](Value *V) {
          auto *I = dyn_cast<Instruction>(Val: V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(Val: VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        FTy: CI->getFunctionType(),
        EC: ElementCount::getFixed(MinVal: static_cast<unsigned int>(VL.size())),
        HasGlobalPred: false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    // Remember the scalar arguments of the main call; all calls must match.
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI))
        ScalarArgs[J] = CI->getArgOperand(i: J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(Val: V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI: CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(Other: *CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Some intrinsics have scalar arguments and should be same in order for
      // them to be vectorized.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: J, TTI)) {
          Value *A1J = CI2->getArgOperand(i: J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(first1: CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      last1: CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      first2: CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    // Finally, require a valid vector cost for either the intrinsic or the
    // library-call form.
    SmallVector<Type *> ArgTys =
        buildIntrinsicArgTypes(CI, ID, VF: VL.size(), MinBW: 0, TTI);
    auto *VecTy = getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    if (!VecCallCosts.first.isValid() && !VecCallCosts.second.isValid())
      return TreeEntry::NeedToGather;

    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
10710
namespace {
/// Allows to correctly handle operands of the phi nodes based on the \p Main
/// PHINode order of incoming basic blocks/values.
class PHIHandler {
  // Used to detect (and poison-fill) unreachable incoming blocks.
  DominatorTree &DT;
  // Reference phi whose incoming-block order defines the operand layout.
  PHINode *Main = nullptr;
  // The bundle of phis (or poison values) being handled.
  SmallVector<Value *> Phis;
  // Operands[I][Idx] = incoming value of Phis[Idx] for Main's I-th block.
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  /// Fills Operands with the incoming values of every phi in the bundle,
  /// aligned to Main's incoming-block order. Uses a direct per-block lookup
  /// for small phis and a grouped-by-block scheme for larger ones.
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(Begin: 0, End: Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(i: I);
        if (!DT.isReachableFromEntry(A: InBB)) {
          // Unreachable incoming block: all values are poison.
          Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
          continue;
        }
        // Prepare the operand vector.
        for (auto [Idx, V] : enumerate(First&: Phis)) {
          auto *P = dyn_cast<PHINode>(Val: V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          // Fast path when this phi lists the blocks in the same order as
          // Main; otherwise look the value up by block.
          if (P->getIncomingBlock(i: I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(i: I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(BB: InBB);
        }
      }
      return;
    }
    // Slow path: group Main's incoming indices by (reachable) block, so each
    // phi's values can be placed by block without quadratic lookups.
    SmallMapVector<BasicBlock *, SmallVector<unsigned>, 4>
        Blocks;
    for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(i: I);
      if (!DT.isReachableFromEntry(A: InBB)) {
        Operands[I].assign(NumElts: Phis.size(), Elt: PoisonValue::get(T: Main->getType()));
        continue;
      }
      Blocks.try_emplace(Key: InBB).first->second.push_back(Elt: I);
    }
    for (auto [Idx, V] : enumerate(First&: Phis)) {
      if (isa<PoisonValue>(Val: V)) {
        for (unsigned I : seq<unsigned>(Size: Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(Val: V);
      for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(i: I);
        if (InBB == Main->getIncomingBlock(i: I)) {
          // Do not overwrite the poison fill of an unreachable block.
          if (isa_and_nonnull<PoisonValue>(Val: Operands[I][Idx]))
            continue;
          Operands[I][Idx] = P->getIncomingValue(i: I);
          continue;
        }
        auto *It = Blocks.find(Key: InBB);
        if (It == Blocks.end())
          continue;
        // Store under the first index for this block; duplicated below.
        Operands[It->second.front()][Idx] = P->getIncomingValue(i: I);
      }
    }
    // Replicate the representative operand row to every other index that
    // shares the same incoming block.
    for (const auto &P : Blocks) {
      ArrayRef<unsigned> IncomingValues = P.second;
      if (IncomingValues.size() <= 1)
        continue;
      unsigned BasicI = IncomingValues.consume_front();
      for (unsigned I : IncomingValues) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  /// Returns the bundle's operand row for Main's \p I-th incoming block.
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
} // namespace
10802
/// Returns main/alternate instructions for the given \p VL. Unlike
/// getSameOpcode, this supports non-compatible instructions for better
/// SplitVectorize node support.
/// \returns the first main/alt instructions, if the list contains only poisons
/// and instructions with only 2 opcodes. Returns a pair of nullptrs otherwise.
10808static std::pair<Instruction *, Instruction *>
10809getMainAltOpsNoStateVL(ArrayRef<Value *> VL) {
10810 Instruction *MainOp = nullptr;
10811 Instruction *AltOp = nullptr;
10812 for (Value *V : VL) {
10813 if (isa<PoisonValue>(Val: V))
10814 continue;
10815 auto *I = dyn_cast<Instruction>(Val: V);
10816 if (!I)
10817 return {};
10818 if (!MainOp) {
10819 MainOp = I;
10820 continue;
10821 }
10822 if (MainOp->getOpcode() == I->getOpcode()) {
10823 if (I->getParent() != MainOp->getParent())
10824 return {};
10825 continue;
10826 }
10827 if (!AltOp) {
10828 AltOp = I;
10829 continue;
10830 }
10831 if (AltOp->getOpcode() == I->getOpcode()) {
10832 if (I->getParent() != AltOp->getParent())
10833 return {};
10834 continue;
10835 }
10836 return {};
10837 }
10838 if (!AltOp)
10839 return {};
10840 assert(MainOp && AltOp && MainOp->getOpcode() != AltOp->getOpcode() &&
10841 "Expected different main and alt instructions.");
10842 return std::make_pair(x&: MainOp, y&: AltOp);
10843}
10844
10845/// Checks that every instruction appears once in the list and if not, packs
10846/// them, building \p ReuseShuffleIndices mask and mutating \p VL. The list of
10847/// unique scalars is extended by poison values to the whole register size.
10848///
10849/// \returns false if \p VL could not be uniquified, in which case \p VL is
10850/// unchanged and \p ReuseShuffleIndices is empty.
static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL,
                                SmallVectorImpl<int> &ReuseShuffleIndices,
                                const TargetTransformInfo &TTI,
                                const TargetLibraryInfo &TLI,
                                const InstructionsState &S,
                                const BoUpSLP::EdgeInfo &UserTreeIdx,
                                bool TryPad = false) {
  // Check that every instruction appears once in this bundle.
  SmallVector<Value *> UniqueValues;
  SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
  for (Value *V : VL) {
    if (isConstant(V)) {
      // Constants are always considered distinct, even if the same constant
      // appears multiple times in VL.
      ReuseShuffleIndices.emplace_back(
          Args: isa<PoisonValue>(Val: V) ? PoisonMaskElem : UniqueValues.size());
      UniqueValues.emplace_back(Args&: V);
      continue;
    }
    // Non-constants are deduplicated: repeated values map to the index of
    // their first occurrence via the reuse-shuffle mask.
    auto Res = UniquePositions.try_emplace(Key: V, Args: UniqueValues.size());
    ReuseShuffleIndices.emplace_back(Args&: Res.first->second);
    if (Res.second)
      UniqueValues.emplace_back(Args&: V);
  }

  // Easy case: VL has unique values and a "natural" size
  size_t NumUniqueScalarValues = UniqueValues.size();
  bool IsFullVectors = hasFullVectorsOrPowerOf2(
      TTI, Ty: getValueType(V: UniqueValues.front()), Sz: NumUniqueScalarValues);
  if (NumUniqueScalarValues == VL.size() &&
      (VectorizeNonPowerOf2 || IsFullVectors)) {
    ReuseShuffleIndices.clear();
    return true;
  }

  // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
  if ((UserTreeIdx.UserTE &&
       UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) ||
      !hasFullVectorsOrPowerOf2(TTI, Ty: getValueType(V: VL.front()), Sz: VL.size())) {
    LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                         "for nodes with padding.\n");
    ReuseShuffleIndices.clear();
    return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
  // Reject degenerate cases: a single unique scalar, a non-full vector of
  // uniques, or a splat of a single non-constant value.
  if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
      (UniquePositions.size() == 1 && all_of(Range&: UniqueValues, P: [](Value *V) {
         return isa<UndefValue>(Val: V) || !isConstant(V);
       }))) {
    if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 &&
        S.getMainOp()->isSafeToRemove() &&
        (S.areInstructionsWithCopyableElements() ||
         all_of(Range&: UniqueValues, P: IsaPred<Instruction, PoisonValue>))) {
      // Find the number of elements, which forms full vectors.
      unsigned PWSz = getFullVectorNumberOfElements(
          TTI, Ty: UniqueValues.front()->getType(), Sz: UniqueValues.size());
      PWSz = std::min<unsigned>(a: PWSz, b: VL.size());
      if (PWSz == VL.size()) {
        // We ended up with the same size after removing duplicates and
        // upgrading the resulting vector size to a "nice size". Just keep
        // the initial VL then.
        ReuseShuffleIndices.clear();
      } else {
        // Pad unique values with poison to grow the vector to a "nice" size
        SmallVector<Value *> PaddedUniqueValues(UniqueValues.begin(),
                                                UniqueValues.end());
        PaddedUniqueValues.append(
            NumInputs: PWSz - UniqueValues.size(),
            Elt: PoisonValue::get(T: UniqueValues.front()->getType()));
        // Check that extended with poisons/copyable operations are still valid
        // for vectorization (div/rem are not allowed).
        if ((!S.areInstructionsWithCopyableElements() &&
             !getSameOpcode(VL: PaddedUniqueValues, TLI).valid()) ||
            (S.areInstructionsWithCopyableElements() && S.isMulDivLikeOp() &&
             (S.getMainOp()->isIntDivRem() || S.getMainOp()->isFPDivRem() ||
              isa<CallInst>(Val: S.getMainOp())))) {
          LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
          ReuseShuffleIndices.clear();
          return false;
        }
        VL = std::move(PaddedUniqueValues);
      }
      return true;
    }
    LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
    ReuseShuffleIndices.clear();
    return false;
  }
  // General case: vectorize the deduplicated scalars and reuse-shuffle the
  // result back into the original lane positions.
  VL = std::move(UniqueValues);
  return true;
}
10943
/// Checks whether \p VL can be vectorized as a "split" node, i.e. two
/// independent vector operations (main opcode scalars in \p Op1, alternate
/// opcode scalars in \p Op2) combined via subvector insertion/shuffle, rather
/// than a single alternate-opcode node. On success fills \p Op1/\p Op2 and
/// \p ReorderIndices with the permutation restoring the original lane order
/// (left empty if it is the identity).
bool BoUpSLP::canBuildSplitNode(ArrayRef<Value *> VL,
                                const InstructionsState &LocalState,
                                SmallVectorImpl<Value *> &Op1,
                                SmallVectorImpl<Value *> &Op2,
                                OrdersType &ReorderIndices) const {
  // Splitting very small bundles is never profitable.
  constexpr unsigned SmallNodeSize = 4;
  if (VL.size() <= SmallNodeSize || TTI->preferAlternateOpcodeVectorization() ||
      !SplitAlternateInstructions)
    return false;

  // Check if this is a duplicate of another split entry.
  LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *LocalState.getMainOp()
                    << ".\n");
  for (TreeEntry *E : getSplitTreeEntries(V: LocalState.getMainOp())) {
    if (E->isSame(VL)) {
      LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at "
                        << *LocalState.getMainOp() << ".\n");
      return false;
    }
    // A full overlap with an existing entry means the scalars are already
    // covered; building another split node would be redundant.
    SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
    if (all_of(Range&: VL, P: [&](Value *V) {
          return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V);
        })) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
      return false;
    }
  }

  // Partition the lanes: main-opcode values (and non-instructions) go to Op1,
  // alternate-opcode values to Op2. Op1Indices remembers which lanes went to
  // Op1 so the restoring permutation can be built below.
  ReorderIndices.assign(NumElts: VL.size(), Elt: VL.size());
  SmallBitVector Op1Indices(VL.size());
  for (auto [Idx, V] : enumerate(First&: VL)) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I) {
      Op1.push_back(Elt: V);
      Op1Indices.set(Idx);
      continue;
    }
    if ((LocalState.getAltOpcode() != LocalState.getOpcode() &&
         isMainInstruction(I, MainOp: LocalState.getMainOp(), AltOp: LocalState.getAltOp(),
                           TLI: *TLI)) ||
        (LocalState.getAltOpcode() == LocalState.getOpcode() &&
         !isAlternateInstruction(I, MainOp: LocalState.getMainOp(),
                                 AltOp: LocalState.getAltOp(), TLI: *TLI))) {
      Op1.push_back(Elt: V);
      Op1Indices.set(Idx);
      continue;
    }
    Op2.push_back(Elt: V);
  }
  Type *ScalarTy = getValueType(V: VL.front());
  VectorType *VecTy = getWidenedType(ScalarTy, VF: VL.size());
  unsigned Opcode0 = LocalState.getOpcode();
  unsigned Opcode1 = LocalState.getAltOpcode();
  SmallBitVector OpcodeMask(getAltInstrMask(VL, ScalarTy, Opcode0, Opcode1));
  // Enable split node, only if all nodes do not form legal alternate
  // instruction (like X86 addsub).
  SmallPtrSet<Value *, 4> UOp1(llvm::from_range, Op1);
  SmallPtrSet<Value *, 4> UOp2(llvm::from_range, Op2);
  if (UOp1.size() <= 1 || UOp2.size() <= 1 ||
      TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) ||
      !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op1.front()->getType(), Sz: Op1.size()) ||
      !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Op2.front()->getType(), Sz: Op2.size()))
    return false;
  // Enable split node, only if all nodes are power-of-2/full registers.
  // Build ReorderIndices so that all Op1 lanes come first, then Op2 lanes.
  unsigned Op1Cnt = 0, Op2Cnt = Op1.size();
  for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
    if (Op1Indices.test(Idx)) {
      ReorderIndices[Op1Cnt] = Idx;
      ++Op1Cnt;
    } else {
      ReorderIndices[Op2Cnt] = Idx;
      ++Op2Cnt;
    }
  }
  if (isIdentityOrder(Order: ReorderIndices))
    ReorderIndices.clear();
  SmallVector<int> Mask;
  if (!ReorderIndices.empty())
    inversePermutation(Indices: ReorderIndices, Mask);
  unsigned NumParts = TTI->getNumberOfParts(Tp: VecTy);
  VectorType *Op1VecTy = getWidenedType(ScalarTy, VF: Op1.size());
  VectorType *Op2VecTy = getWidenedType(ScalarTy, VF: Op2.size());
  // Check non-profitable single register ops, which better to be represented
  // as alternate ops.
  if (NumParts >= VL.size())
    return false;
  constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
  InstructionCost InsertCost = ::getShuffleCost(
      TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind: Kind, Index: Op1.size(), SubTp: Op2VecTy);
  FixedVectorType *SubVecTy =
      getWidenedType(ScalarTy, VF: std::max(a: Op1.size(), b: Op2.size()));
  InstructionCost NewShuffleCost =
      ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: SubVecTy, Mask, CostKind: Kind);
  if (!LocalState.isCmpOp() && NumParts <= 1 &&
      (Mask.empty() || InsertCost >= NewShuffleCost))
    return false;
  // For binary/cast/unary alternates, compare the cost of the split form
  // (two narrow ops + insert/shuffle) against the original alternate form
  // (two wide ops + a two-source blend).
  if ((LocalState.getMainOp()->isBinaryOp() &&
       LocalState.getAltOp()->isBinaryOp() &&
       (LocalState.isShiftOp() || LocalState.isBitwiseLogicOp() ||
        LocalState.isAddSubLikeOp() || LocalState.isMulDivLikeOp())) ||
      (LocalState.getMainOp()->isCast() && LocalState.getAltOp()->isCast()) ||
      (LocalState.getMainOp()->isUnaryOp() &&
       LocalState.getAltOp()->isUnaryOp())) {
    InstructionCost OriginalVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: VecTy, CostKind: Kind) +
        TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: VecTy, CostKind: Kind);
    SmallVector<int> OriginalMask(VL.size(), PoisonMaskElem);
    for (unsigned Idx : seq<unsigned>(Size: VL.size())) {
      if (isa<PoisonValue>(Val: VL[Idx]))
        continue;
      OriginalMask[Idx] = Idx + (Op1Indices.test(Idx) ? 0 : VL.size());
    }
    InstructionCost OriginalCost =
        OriginalVecOpsCost + ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
                                              Tp: VecTy, Mask: OriginalMask, CostKind: Kind);
    InstructionCost NewVecOpsCost =
        TTI->getArithmeticInstrCost(Opcode: Opcode0, Ty: Op1VecTy, CostKind: Kind) +
        TTI->getArithmeticInstrCost(Opcode: Opcode1, Ty: Op2VecTy, CostKind: Kind);
    InstructionCost NewCost =
        NewVecOpsCost + InsertCost +
        (!VectorizableTree.empty() && VectorizableTree.front()->hasState() &&
         VectorizableTree.front()->getOpcode() == Instruction::Store
             ? NewShuffleCost
             : 0);
    // If not profitable to split - exit.
    if (NewCost >= OriginalCost)
      return false;
  }
  return true;
}
11074
11075namespace {
/// Class accepts an incoming list of values, checks if it is able to model
/// "copyable" values as compatible operations, and generates the list of
/// values for scheduling and the list of operands for the new nodes.
11079class InstructionsCompatibilityAnalysis {
11080 DominatorTree &DT;
11081 const DataLayout &DL;
11082 const TargetTransformInfo &TTI;
11083 const TargetLibraryInfo &TLI;
11084 unsigned MainOpcode = 0;
11085 Instruction *MainOp = nullptr;
11086
11087 /// Checks if the opcode is supported as the main opcode for copyable
11088 /// elements.
11089 static bool isSupportedOpcode(const unsigned Opcode) {
11090 return Opcode == Instruction::Add || Opcode == Instruction::Sub ||
11091 Opcode == Instruction::LShr || Opcode == Instruction::Shl ||
11092 Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
11093 Opcode == Instruction::And || Opcode == Instruction::Or ||
11094 Opcode == Instruction::Xor || Opcode == Instruction::FAdd ||
11095 Opcode == Instruction::FSub || Opcode == Instruction::FMul ||
11096 Opcode == Instruction::FDiv;
11097 }
11098
11099 /// Identifies the best candidate value, which represents main opcode
11100 /// operation.
11101 /// Currently the best candidate is the Add instruction with the parent
11102 /// block with the highest DFS incoming number (block, that dominates other).
11103 void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
11104 BasicBlock *Parent = nullptr;
11105 // Checks if the instruction has supported opcode.
11106 auto IsSupportedInstruction = [&](Instruction *I, bool AnyUndef) {
11107 if (AnyUndef && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(Val: I)))
11108 return false;
11109 return I && isSupportedOpcode(Opcode: I->getOpcode()) &&
11110 (!doesNotNeedToBeScheduled(V: I) || !R.isVectorized(V: I));
11111 };
11112 // Exclude operands instructions immediately to improve compile time, it
11113 // will be unable to schedule anyway.
11114 SmallDenseSet<Value *, 8> Operands;
11115 SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
11116 bool AnyUndef = false;
11117 for (Value *V : VL) {
11118 auto *I = dyn_cast<Instruction>(Val: V);
11119 if (!I) {
11120 AnyUndef |= isa<UndefValue>(Val: V);
11121 continue;
11122 }
11123 if (!DT.isReachableFromEntry(A: I->getParent()))
11124 continue;
11125 if (Candidates.empty()) {
11126 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11127 Parent = I->getParent();
11128 Operands.insert(I: I->op_begin(), E: I->op_end());
11129 continue;
11130 }
11131 if (Parent == I->getParent()) {
11132 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11133 Operands.insert(I: I->op_begin(), E: I->op_end());
11134 continue;
11135 }
11136 auto *NodeA = DT.getNode(BB: Parent);
11137 auto *NodeB = DT.getNode(BB: I->getParent());
11138 assert(NodeA && "Should only process reachable instructions");
11139 assert(NodeB && "Should only process reachable instructions");
11140 assert((NodeA == NodeB) ==
11141 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11142 "Different nodes should have different DFS numbers");
11143 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
11144 Candidates.clear();
11145 Candidates.try_emplace(Key: I->getOpcode()).first->second.push_back(Elt: I);
11146 Parent = I->getParent();
11147 Operands.clear();
11148 Operands.insert(I: I->op_begin(), E: I->op_end());
11149 }
11150 }
11151 unsigned BestOpcodeNum = 0;
11152 MainOp = nullptr;
11153 bool UsedOutside = false;
11154 for (const auto &P : Candidates) {
11155 bool PUsedOutside = all_of(Range: P.second, P: isUsedOutsideBlock);
11156 if (UsedOutside && !PUsedOutside)
11157 continue;
11158 if (!UsedOutside && PUsedOutside)
11159 BestOpcodeNum = 0;
11160 if (P.second.size() < BestOpcodeNum)
11161 continue;
11162 // If have inner dependencies - skip.
11163 if (!PUsedOutside && any_of(Range: P.second, P: [&](Instruction *I) {
11164 return Operands.contains(V: I);
11165 }))
11166 continue;
11167 UsedOutside = PUsedOutside;
11168 for (Instruction *I : P.second) {
11169 if (IsSupportedInstruction(I, AnyUndef)) {
11170 MainOp = I;
11171 BestOpcodeNum = P.second.size();
11172 break;
11173 }
11174 }
11175 }
11176 if (MainOp) {
11177 // Do not match, if any copyable is a terminator from the same block as
11178 // the main operation.
11179 if (any_of(Range&: VL, P: [&](Value *V) {
11180 auto *I = dyn_cast<Instruction>(Val: V);
11181 return I && I->getParent() == MainOp->getParent() &&
11182 I->isTerminator();
11183 })) {
11184 MainOp = nullptr;
11185 return;
11186 }
11187 MainOpcode = MainOp->getOpcode();
11188 }
11189 }
11190
11191 /// Returns the idempotent value for the \p MainOp with the detected \p
11192 /// MainOpcode. For Add, returns 0. For Or, it should choose between false and
11193 /// the operand itself, since V or V == V.
11194 Value *selectBestIdempotentValue() const {
11195 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11196 return ConstantExpr::getBinOpIdentity(Opcode: MainOpcode, Ty: MainOp->getType(),
11197 AllowRHSConstant: !MainOp->isCommutative());
11198 }
11199
11200 /// Returns the value and operands for the \p V, considering if it is original
11201 /// instruction and its actual operands should be returned, or it is a
11202 /// copyable element and its should be represented as idempotent instruction.
11203 SmallVector<Value *> getOperands(const InstructionsState &S, Value *V) const {
11204 if (isa<PoisonValue>(Val: V))
11205 return {V, V};
11206 if (!S.isCopyableElement(V))
11207 return convertTo(I: cast<Instruction>(Val: V), S).second;
11208 assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
11209 return {V, selectBestIdempotentValue()};
11210 }
11211
11212 /// Builds operands for the original instructions.
11213 void
11214 buildOriginalOperands(const InstructionsState &S, ArrayRef<Value *> VL,
11215 SmallVectorImpl<BoUpSLP::ValueList> &Operands) const {
11216
11217 unsigned ShuffleOrOp =
11218 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
11219 Instruction *VL0 = S.getMainOp();
11220
11221 switch (ShuffleOrOp) {
11222 case Instruction::PHI: {
11223 auto *PH = cast<PHINode>(Val: VL0);
11224
11225 // Keeps the reordered operands to avoid code duplication.
11226 PHIHandler Handler(DT, PH, VL);
11227 Handler.buildOperands();
11228 Operands.assign(NumElts: PH->getNumOperands(), Elt: {});
11229 for (unsigned I : seq<unsigned>(Size: PH->getNumOperands()))
11230 Operands[I].assign(in_start: Handler.getOperands(I).begin(),
11231 in_end: Handler.getOperands(I).end());
11232 return;
11233 }
11234 case Instruction::ExtractValue:
11235 case Instruction::ExtractElement:
11236 // This is a special case, as it does not gather, but at the same time
11237 // we are not extending buildTree_rec() towards the operands.
11238 Operands.assign(NumElts: 1, Elt: {VL.size(), VL0->getOperand(i: 0)});
11239 return;
11240 case Instruction::InsertElement:
11241 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
11242 for (auto [Idx, V] : enumerate(First&: VL)) {
11243 auto *IE = cast<InsertElementInst>(Val: V);
11244 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11245 Ops[Idx] = IE->getOperand(i_nocapture: OpIdx);
11246 }
11247 return;
11248 case Instruction::Load:
11249 Operands.assign(
11250 NumElts: 1, Elt: {VL.size(),
11251 PoisonValue::get(T: cast<LoadInst>(Val: VL0)->getPointerOperandType())});
11252 for (auto [V, Op] : zip(t&: VL, u&: Operands.back())) {
11253 auto *LI = dyn_cast<LoadInst>(Val: V);
11254 if (!LI)
11255 continue;
11256 Op = LI->getPointerOperand();
11257 }
11258 return;
11259 case Instruction::ZExt:
11260 case Instruction::SExt:
11261 case Instruction::FPToUI:
11262 case Instruction::FPToSI:
11263 case Instruction::FPExt:
11264 case Instruction::PtrToInt:
11265 case Instruction::IntToPtr:
11266 case Instruction::SIToFP:
11267 case Instruction::UIToFP:
11268 case Instruction::Trunc:
11269 case Instruction::FPTrunc:
11270 case Instruction::BitCast:
11271 case Instruction::ICmp:
11272 case Instruction::FCmp:
11273 case Instruction::FNeg:
11274 case Instruction::Add:
11275 case Instruction::FAdd:
11276 case Instruction::Sub:
11277 case Instruction::FSub:
11278 case Instruction::Mul:
11279 case Instruction::FMul:
11280 case Instruction::UDiv:
11281 case Instruction::SDiv:
11282 case Instruction::FDiv:
11283 case Instruction::URem:
11284 case Instruction::SRem:
11285 case Instruction::FRem:
11286 case Instruction::Shl:
11287 case Instruction::LShr:
11288 case Instruction::AShr:
11289 case Instruction::And:
11290 case Instruction::Or:
11291 case Instruction::Xor:
11292 case Instruction::Freeze:
11293 case Instruction::Store:
11294 case Instruction::ShuffleVector:
11295 Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
11296 for (auto [Idx, V] : enumerate(First&: VL)) {
11297 auto *I = dyn_cast<Instruction>(Val: V);
11298 if (!I) {
11299 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11300 Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
11301 continue;
11302 }
11303 auto [Op, ConvertedOps] = convertTo(I, S);
11304 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11305 Ops[Idx] = ConvertedOps[OpIdx];
11306 }
11307 return;
11308 case Instruction::Select:
11309 Operands.assign(NumElts: VL0->getNumOperands(), Elt: {VL.size(), nullptr});
11310 for (auto [Idx, V] : enumerate(First&: VL)) {
11311 auto *I = dyn_cast<Instruction>(Val: V);
11312 if (!I) {
11313 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11314 Ops[Idx] = PoisonValue::get(T: VL0->getOperand(i: OpIdx)->getType());
11315 continue;
11316 }
11317 if (isa<ZExtInst>(Val: I)) {
11318 // Special case for select + zext i1 to avoid explosion of different
11319 // types. We want to keep the condition as i1 to be able to match
11320 // different selects together and reuse the vectorized condition
11321 // rather than trying to gather it.
11322 Operands[0][Idx] = I->getOperand(i: 0);
11323 Operands[1][Idx] = ConstantInt::get(Ty: I->getType(), V: 1);
11324 Operands[2][Idx] = ConstantInt::getNullValue(Ty: I->getType());
11325 continue;
11326 }
11327 auto [Op, ConvertedOps] = convertTo(I, S);
11328 for (auto [OpIdx, Ops] : enumerate(First&: Operands))
11329 Ops[Idx] = ConvertedOps[OpIdx];
11330 }
11331 return;
11332 case Instruction::GetElementPtr: {
11333 Operands.assign(NumElts: 2, Elt: {VL.size(), nullptr});
11334 // Need to cast all indices to the same type before vectorization to
11335 // avoid crash.
11336 // Required to be able to find correct matches between different gather
11337 // nodes and reuse the vectorized values rather than trying to gather them
11338 // again.
11339 const unsigned IndexIdx = 1;
11340 Type *VL0Ty = VL0->getOperand(i: IndexIdx)->getType();
11341 Type *Ty =
11342 all_of(Range&: VL,
11343 P: [&](Value *V) {
11344 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
11345 return !GEP || VL0Ty == GEP->getOperand(i_nocapture: IndexIdx)->getType();
11346 })
11347 ? VL0Ty
11348 : DL.getIndexType(PtrTy: cast<GetElementPtrInst>(Val: VL0)
11349 ->getPointerOperandType()
11350 ->getScalarType());
11351 for (auto [Idx, V] : enumerate(First&: VL)) {
11352 auto *GEP = dyn_cast<GetElementPtrInst>(Val: V);
11353 if (!GEP) {
11354 Operands[0][Idx] = V;
11355 Operands[1][Idx] = ConstantInt::getNullValue(Ty);
11356 continue;
11357 }
11358 Operands[0][Idx] = GEP->getPointerOperand();
11359 auto *Op = GEP->getOperand(i_nocapture: IndexIdx);
11360 auto *CI = dyn_cast<ConstantInt>(Val: Op);
11361 Operands[1][Idx] = CI ? ConstantFoldIntegerCast(
11362 C: CI, DestTy: Ty, IsSigned: CI->getValue().isSignBitSet(), DL)
11363 : Op;
11364 }
11365 return;
11366 }
11367 case Instruction::Call: {
11368 auto *CI = cast<CallInst>(Val: VL0);
11369 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI: &TLI);
11370 for (unsigned Idx : seq<unsigned>(Size: CI->arg_size())) {
11371 if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: Idx, TTI: &TTI))
11372 continue;
11373 auto &Ops = Operands.emplace_back();
11374 for (Value *V : VL) {
11375 auto *I = dyn_cast<Instruction>(Val: V);
11376 Ops.push_back(Elt: I ? I->getOperand(i: Idx)
11377 : PoisonValue::get(T: VL0->getOperand(i: Idx)->getType()));
11378 }
11379 }
11380 return;
11381 }
11382 default:
11383 break;
11384 }
11385 llvm_unreachable("Unexpected vectorization of the instructions.");
11386 }
11387
11388public:
11389 InstructionsCompatibilityAnalysis(DominatorTree &DT, const DataLayout &DL,
11390 const TargetTransformInfo &TTI,
11391 const TargetLibraryInfo &TLI)
11392 : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {}
11393
11394 InstructionsState
11395 buildInstructionsState(ArrayRef<Value *> VL, const BoUpSLP &R,
11396 bool TryCopyableElementsVectorization,
11397 bool WithProfitabilityCheck = false,
11398 bool SkipSameCodeCheck = false) {
11399 InstructionsState S = (SkipSameCodeCheck || !allSameBlock(VL))
11400 ? InstructionsState::invalid()
11401 : getSameOpcode(VL, TLI);
11402 if (S)
11403 return S;
11404 // Check if series of selects + zext i1 %x to in can be combined into
11405 // selects + select %x, i32 1, i32 0.
11406 Instruction *SelectOp = nullptr;
11407 if (allSameBlock(VL) && all_of(Range&: VL, P: [&](Value *V) {
11408 if (match(V, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()))) {
11409 if (!SelectOp)
11410 SelectOp = cast<Instruction>(Val: V);
11411 return true;
11412 }
11413 auto *ZExt = dyn_cast<ZExtInst>(Val: V);
11414 return (ZExt && ZExt->getSrcTy()->isIntegerTy(Bitwidth: 1)) ||
11415 isa<PoisonValue>(Val: V);
11416 })) {
11417 if (SelectOp)
11418 return InstructionsState(SelectOp, SelectOp);
11419 }
11420 if (!VectorizeCopyableElements || !TryCopyableElementsVectorization)
11421 return S;
11422 findAndSetMainInstruction(VL, R);
11423 if (!MainOp)
11424 return InstructionsState::invalid();
11425 S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true);
11426 if (!WithProfitabilityCheck)
11427 return S;
11428 // Check if it is profitable to vectorize the instruction.
11429 SmallVector<BoUpSLP::ValueList> Operands = buildOperands(S, VL);
11430 auto BuildCandidates =
11431 [](SmallVectorImpl<std::pair<Value *, Value *>> &Candidates, Value *V1,
11432 Value *V2) {
11433 if (V1 != V2 && isa<PHINode>(Val: V1))
11434 return;
11435 auto *I1 = dyn_cast<Instruction>(Val: V1);
11436 auto *I2 = dyn_cast<Instruction>(Val: V2);
11437 if (I1 && I2 && I1->getOpcode() == I2->getOpcode() &&
11438 I1->getParent() != I2->getParent())
11439 return;
11440 Candidates.emplace_back(Args&: V1, Args&: (I1 || I2) ? V2 : V1);
11441 };
11442 if (VL.size() == 2) {
11443 // Check if the operands allow better vectorization.
11444 SmallVector<std::pair<Value *, Value *>, 4> Candidates1, Candidates2;
11445 BuildCandidates(Candidates1, Operands[0][0], Operands[0][1]);
11446 BuildCandidates(Candidates2, Operands[1][0], Operands[1][1]);
11447 bool Res = !Candidates1.empty() && !Candidates2.empty() &&
11448 R.findBestRootPair(Candidates: Candidates1) &&
11449 R.findBestRootPair(Candidates: Candidates2);
11450 if (!Res && isCommutative(I: MainOp)) {
11451 Candidates1.clear();
11452 Candidates2.clear();
11453 BuildCandidates(Candidates1, Operands[0][0], Operands[1][1]);
11454 BuildCandidates(Candidates2, Operands[1][0], Operands[0][1]);
11455 Res = !Candidates1.empty() && !Candidates2.empty() &&
11456 R.findBestRootPair(Candidates: Candidates1) &&
11457 R.findBestRootPair(Candidates: Candidates2);
11458 }
11459 if (!Res)
11460 return InstructionsState::invalid();
11461 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11462 InstructionCost ScalarCost = TTI.getInstructionCost(U: S.getMainOp(), CostKind: Kind);
11463 InstructionCost VectorCost;
11464 FixedVectorType *VecTy =
11465 getWidenedType(ScalarTy: S.getMainOp()->getType(), VF: VL.size());
11466 switch (MainOpcode) {
11467 case Instruction::Add:
11468 case Instruction::Sub:
11469 case Instruction::LShr:
11470 case Instruction::Shl:
11471 case Instruction::SDiv:
11472 case Instruction::UDiv:
11473 case Instruction::And:
11474 case Instruction::Or:
11475 case Instruction::Xor:
11476 case Instruction::FAdd:
11477 case Instruction::FMul:
11478 case Instruction::FSub:
11479 case Instruction::FDiv:
11480 VectorCost = TTI.getArithmeticInstrCost(Opcode: MainOpcode, Ty: VecTy, CostKind: Kind);
11481 break;
11482 default:
11483 llvm_unreachable("Unexpected instruction.");
11484 }
11485 if (VectorCost > ScalarCost)
11486 return InstructionsState::invalid();
11487 return S;
11488 }
11489 assert(Operands.size() == 2 && "Unexpected number of operands!");
11490 unsigned CopyableNum =
11491 count_if(Range&: VL, P: [&](Value *V) { return S.isCopyableElement(V); });
11492 if (CopyableNum < VL.size() / 2)
11493 return S;
11494 // Too many phi copyables - exit.
11495 const unsigned Limit = VL.size() / 24;
11496 if ((CopyableNum >= VL.size() - Limit ||
11497 (CopyableNum >= VL.size() - 1 && VL.size() > 4) ||
11498 CopyableNum >= MaxPHINumOperands) &&
11499 all_of(Range&: VL, P: [&](Value *V) {
11500 return isa<PHINode>(Val: V) || !S.isCopyableElement(V);
11501 }))
11502 return InstructionsState::invalid();
11503 // Check profitability if number of copyables > VL.size() / 2.
11504 // 1. Reorder operands for better matching.
11505 if (isCommutative(I: MainOp)) {
11506 for (auto [OpL, OpR] : zip(t&: Operands.front(), u&: Operands.back())) {
11507 // Make instructions the first operands.
11508 if (!isa<Instruction>(Val: OpL) && isa<Instruction>(Val: OpR)) {
11509 std::swap(a&: OpL, b&: OpR);
11510 continue;
11511 }
11512 // Make constants the second operands.
11513 if ((isa<Constant>(Val: OpL) && !match(V: OpR, P: m_Zero())) ||
11514 match(V: OpL, P: m_Zero())) {
11515 std::swap(a&: OpL, b&: OpR);
11516 continue;
11517 }
11518 }
11519 }
11520 // 2. Check, if operands can be vectorized.
11521 if (count_if(Range&: Operands.back(), P: IsaPred<Instruction>) > 1)
11522 return InstructionsState::invalid();
11523 auto CheckOperand = [&](ArrayRef<Value *> Ops) {
11524 if (allConstant(VL: Ops) || isSplat(VL: Ops))
11525 return true;
11526 // Check if it is "almost" splat, i.e. has >= 4 elements and only single
11527 // one is different.
11528 constexpr unsigned Limit = 4;
11529 if (Operands.front().size() >= Limit) {
11530 SmallDenseMap<const Value *, unsigned> Counters;
11531 for (Value *V : Ops) {
11532 if (isa<UndefValue>(Val: V))
11533 continue;
11534 ++Counters[V];
11535 }
11536 if (Counters.size() == 2 &&
11537 any_of(Range&: Counters, P: [&](const std::pair<const Value *, unsigned> &C) {
11538 return C.second == 1;
11539 }))
11540 return true;
11541 }
11542 // First operand not a constant or splat? Last attempt - check for
11543 // potential vectorization.
11544 InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
11545 InstructionsState OpS = Analysis.buildInstructionsState(
11546 VL: Ops, R, /*TryCopyableElementsVectorization=*/true);
11547 if (!OpS || (OpS.getOpcode() == Instruction::PHI && !allSameBlock(VL: Ops)))
11548 return false;
11549 unsigned CopyableNum =
11550 count_if(Range&: Ops, P: [&](Value *V) { return OpS.isCopyableElement(V); });
11551 return CopyableNum <= VL.size() / 2;
11552 };
11553 if (!CheckOperand(Operands.front()))
11554 return InstructionsState::invalid();
11555
11556 return S;
11557 }
11558
11559 SmallVector<BoUpSLP::ValueList> buildOperands(const InstructionsState &S,
11560 ArrayRef<Value *> VL) {
11561 assert(S && "Invalid state!");
11562 SmallVector<BoUpSLP::ValueList> Operands;
11563 if (S.areInstructionsWithCopyableElements()) {
11564 MainOp = S.getMainOp();
11565 MainOpcode = S.getOpcode();
11566 Operands.assign(NumElts: MainOp->getNumOperands(),
11567 Elt: BoUpSLP::ValueList(VL.size(), nullptr));
11568 for (auto [Idx, V] : enumerate(First&: VL)) {
11569 SmallVector<Value *> OperandsForValue = getOperands(S, V);
11570 for (auto [OperandIdx, Operand] : enumerate(First&: OperandsForValue))
11571 Operands[OperandIdx][Idx] = Operand;
11572 }
11573 } else {
11574 buildOriginalOperands(S, VL, Operands);
11575 }
11576 return Operands;
11577 }
11578};
11579} // namespace
11580
11581BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality(
11582 ArrayRef<Value *> VL, unsigned Depth, const EdgeInfo &UserTreeIdx,
11583 bool TryCopyableElementsVectorization) const {
11584 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
11585
11586 InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
11587 InstructionsState S = Analysis.buildInstructionsState(
11588 VL, R: *this, TryCopyableElementsVectorization,
11589 /*WithProfitabilityCheck=*/true, SkipSameCodeCheck: TryCopyableElementsVectorization);
11590
11591 bool AreScatterAllGEPSameBlock = false;
11592 if (!S) {
11593 SmallVector<unsigned> SortedIndices;
11594 BasicBlock *BB = nullptr;
11595 bool IsScatterVectorizeUserTE =
11596 UserTreeIdx.UserTE &&
11597 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
11598 AreScatterAllGEPSameBlock =
11599 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
11600 VL.size() > 2 &&
11601 all_of(Range&: VL,
11602 P: [&BB](Value *V) {
11603 auto *I = dyn_cast<GetElementPtrInst>(Val: V);
11604 if (!I)
11605 return doesNotNeedToBeScheduled(V);
11606 if (!BB)
11607 BB = I->getParent();
11608 return BB == I->getParent() && I->getNumOperands() == 2;
11609 }) &&
11610 BB &&
11611 sortPtrAccesses(VL, ElemTy: UserTreeIdx.UserTE->getMainOp()->getType(), DL: *DL,
11612 SE&: *SE, SortedIndices));
11613 if (!AreScatterAllGEPSameBlock) {
11614 LLVM_DEBUG(dbgs() << "SLP: Try split and if failed, gathering due to "
11615 "C,S,B,O, small shuffle. \n";
11616 dbgs() << "[";
11617 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11618 dbgs() << "]\n");
11619 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11620 /*TryToFindDuplicates=*/true,
11621 /*TrySplitVectorize=*/true);
11622 }
11623 // Reset S to make it GetElementPtr kind of node.
11624 const auto *It = find_if(Range&: VL, P: IsaPred<GetElementPtrInst>);
11625 assert(It != VL.end() && "Expected at least one GEP.");
11626 S = getSameOpcode(VL: *It, TLI: *TLI);
11627 }
11628 assert(S && "Must be valid.");
11629
11630 // Don't handle vectors.
11631 if (!SLPReVec && getValueType(V: VL.front())->isVectorTy()) {
11632 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
11633 // Do not try to pack to avoid extra instructions here.
11634 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11635 /*TryToFindDuplicates=*/false);
11636 }
11637
11638 // Check that all of the users of the scalars that we want to vectorize are
11639 // schedulable.
11640 BasicBlock *BB = S.getMainOp()->getParent();
11641
11642 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()) ||
11643 !DT->isReachableFromEntry(A: BB)) {
11644 // Don't go into unreachable blocks. They may contain instructions with
11645 // dependency cycles which confuse the final scheduling.
11646 // Do not vectorize EH and non-returning blocks, not profitable in most
11647 // cases.
11648 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
11649 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11650 }
11651
11652 // Don't go into catchswitch blocks, which can happen with PHIs.
11653 // Such blocks can only have PHIs and the catchswitch. There is no
11654 // place to insert a shuffle if we need to, so just avoid that issue.
11655 if (isa<CatchSwitchInst>(Val: BB->getTerminator())) {
11656 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
11657 // Do not try to pack to avoid extra instructions here.
11658 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11659 /*TryToFindDuplicates=*/false);
11660 }
11661
11662 // Don't handle scalable vectors
11663 if (S.getOpcode() == Instruction::ExtractElement &&
11664 isa<ScalableVectorType>(
11665 Val: cast<ExtractElementInst>(Val: S.getMainOp())->getVectorOperandType())) {
11666 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
11667 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11668 }
11669
11670 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
11671 // a load), in which case peek through to include it in the tree, without
11672 // ballooning over-budget.
11673 if (Depth >= RecursionMaxDepth &&
11674 (S.isAltShuffle() || VL.size() < 4 ||
11675 !(match(V: S.getMainOp(), P: m_Load(Op: m_Value())) ||
11676 all_of(Range&: VL, P: [&S](const Value *I) {
11677 return match(V: I,
11678 P: m_OneUse(SubPattern: m_ZExtOrSExt(Op: m_OneUse(SubPattern: m_Load(Op: m_Value()))))) &&
11679 cast<Instruction>(Val: I)->getOpcode() == S.getOpcode();
11680 })))) {
11681 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
11682 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11683 }
11684
11685 // Check if this is a duplicate of another entry.
11686 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp() << ".\n");
11687 for (TreeEntry *E : getTreeEntries(V: S.getMainOp())) {
11688 if (E->isSame(VL)) {
11689 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
11690 << ".\n");
11691 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11692 }
11693 SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
11694 if (all_of(Range&: VL, P: [&](Value *V) {
11695 return isa<PoisonValue>(Val: V) || Values.contains(Ptr: V) ||
11696 (S.getOpcode() == Instruction::PHI && isa<PHINode>(Val: V) &&
11697 LI->getLoopFor(BB: S.getMainOp()->getParent()) &&
11698 isVectorized(V));
11699 })) {
11700 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
11701 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11702 }
11703 }
11704
11705 bool AreAllSameBlock = !AreScatterAllGEPSameBlock;
11706 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
11707 if (!AreAllSameInsts || isSplat(VL) ||
11708 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
11709 Val: S.getMainOp()) &&
11710 !all_of(Range&: VL, P: isVectorLikeInstWithConstOps))) {
11711 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O conditions. \n";
11712 dbgs() << "[";
11713 interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; });
11714 dbgs() << "]\n");
11715 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11716 }
11717
11718 // Don't vectorize ephemeral values.
11719 if (!EphValues.empty()) {
11720 for (Value *V : VL) {
11721 if (EphValues.count(Ptr: V)) {
11722 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
11723 << ") is ephemeral.\n");
11724 // Do not try to pack to avoid extra instructions here.
11725 return ScalarsVectorizationLegality(S, /*IsLegal=*/false,
11726 /*TryToFindDuplicates=*/false);
11727 }
11728 }
11729 }
11730
11731 // We now know that this is a vector of instructions of the same type from
11732 // the same block.
11733
11734 // Check that none of the instructions in the bundle are already in the tree
11735 // and the node may be not profitable for the vectorization as the small
11736 // alternate node.
11737 if (S.isAltShuffle()) {
11738 auto GetNumVectorizedExtracted = [&]() {
11739 APInt Extracted = APInt::getZero(numBits: VL.size());
11740 APInt Vectorized = APInt::getAllOnes(numBits: VL.size());
11741 for (auto [Idx, V] : enumerate(First&: VL)) {
11742 auto *I = dyn_cast<Instruction>(Val: V);
11743 if (!I || doesNotNeedToBeScheduled(V: I) ||
11744 all_of(Range: I->operands(), P: [&](const Use &U) {
11745 return isa<ExtractElementInst>(Val: U.get());
11746 }))
11747 continue;
11748 if (isVectorized(V: I))
11749 Vectorized.clearBit(BitPosition: Idx);
11750 else if (!I->hasOneUser() && !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList))
11751 Extracted.setBit(Idx);
11752 }
11753 return std::make_pair(x&: Vectorized, y&: Extracted);
11754 };
11755 auto [Vectorized, Extracted] = GetNumVectorizedExtracted();
11756 constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
11757 bool PreferScalarize = !Vectorized.isAllOnes() && VL.size() == 2;
11758 if (!Vectorized.isAllOnes() && !PreferScalarize) {
11759 // Rough cost estimation, if the vector code (+ potential extracts) is
11760 // more profitable than the scalar + buildvector.
11761 Type *ScalarTy = VL.front()->getType();
11762 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
11763 InstructionCost VectorizeCostEstimate =
11764 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: {}, CostKind: Kind) +
11765 ::getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Extracted,
11766 /*Insert=*/false, /*Extract=*/true, CostKind: Kind);
11767 InstructionCost ScalarizeCostEstimate = ::getScalarizationOverhead(
11768 TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: Vectorized,
11769 /*Insert=*/true, /*Extract=*/false, CostKind: Kind, /*ForPoisonSrc=*/false);
11770 PreferScalarize = VectorizeCostEstimate > ScalarizeCostEstimate;
11771 }
11772 if (PreferScalarize) {
11773 LLVM_DEBUG(dbgs() << "SLP: The instructions are in tree and alternate "
11774 "node is not profitable.\n");
11775 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11776 }
11777 }
11778
11779 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
11780 if (UserIgnoreList && !UserIgnoreList->empty()) {
11781 for (Value *V : VL) {
11782 if (UserIgnoreList->contains(V)) {
11783 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
11784 return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
11785 }
11786 }
11787 }
11788
11789 return ScalarsVectorizationLegality(S, /*IsLegal=*/true);
11790}
11791
// Recursively builds the SLP vectorization tree for the bundle \p VLRef.
// Checks legality of the bundle, tries split/alternate nodes where the plain
// bundle cannot be vectorized, schedules the bundle, and finally creates a
// per-opcode TreeEntry, recursing into the operands. \p Depth bounds the
// recursion; \p UserTreeIdx identifies the consuming tree node/operand;
// \p InterleaveFactor is forwarded to interleaved load entries.
void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth,
                           const EdgeInfo &UserTreeIdx,
                           unsigned InterleaveFactor) {
  assert((allConstant(VLRef) || allSameType(VLRef)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  // Local copy: VL may be reordered/updated below without touching VLRef.
  SmallVector<Value *> VL(VLRef);

  // Tries to build split node.
  // Splits the bundle into two sub-bundles (Op1/Op2) per LocalState and emits
  // a SplitVectorize entry with one child node per half. Returns false if no
  // profitable split exists.
  auto TrySplitNode = [&](const InstructionsState &LocalState) {
    SmallVector<Value *> Op1, Op2;
    OrdersType ReorderIndices;
    if (!canBuildSplitNode(VL, LocalState, Op1, Op2, ReorderIndices))
      return false;

    auto Invalid = ScheduleBundle::invalid();
    auto *TE = newTreeEntry(VL, EntryState: TreeEntry::SplitVectorize, Bundle&: Invalid, S: LocalState,
                            UserTreeIdx, ReuseShuffleIndices: {}, ReorderIndices);
    LLVM_DEBUG(dbgs() << "SLP: split alternate node.\n"; TE->dump());
    // Creates the child node for one half; Idx selects operand slot 0 or 1.
    auto AddNode = [&](ArrayRef<Value *> Op, unsigned Idx) {
      InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
      if (S && (isa<LoadInst>(Val: S.getMainOp()) ||
                getSameValuesTreeEntry(V: S.getMainOp(), VL: Op, /*SameVF=*/true))) {
        // Build gather node for loads, they will be gathered later.
        TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
                                                    Args: Idx == 0 ? 0 : Op1.size());
        (void)newTreeEntry(VL: Op, EntryState: TreeEntry::NeedToGather, Bundle&: Invalid, S, UserTreeIdx: {TE, Idx});
      } else {
        TE->CombinedEntriesWithIndices.emplace_back(Args: VectorizableTree.size(),
                                                    Args: Idx == 0 ? 0 : Op1.size());
        buildTreeRec(VLRef: Op, Depth, UserTreeIdx: {TE, Idx});
      }
    };
    AddNode(Op1, 0);
    AddNode(Op2, 1);
    return true;
  };

  // True iff VL contains at least one constant and otherwise only PHIs and
  // poison — such bundles are gathered immediately below.
  auto AreOnlyConstsWithPHIs = [](ArrayRef<Value *> VL) {
    bool AreConsts = false;
    for (Value *V : VL) {
      if (isa<PoisonValue>(Val: V))
        continue;
      if (isa<Constant>(Val: V)) {
        AreConsts = true;
        continue;
      }
      if (!isa<PHINode>(Val: V))
        return false;
    }
    return AreConsts;
  };
  if (AreOnlyConstsWithPHIs(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to all constants and PHIs.\n");
    newGatherTreeEntry(VL, S: InstructionsState::invalid(), UserTreeIdx);
    return;
  }

  // First legality attempt without copyable-element vectorization; on failure,
  // optionally try a split node, then retry legality with copyable elements.
  ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality(
      VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false);
  InstructionsState S = Legality.getInstructionsState();
  if (!Legality.isLegal()) {
    if (Legality.trySplitVectorize()) {
      auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL);
      // Last chance to try to vectorize alternate node.
      if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp)))
        return;
    }
    if (!S)
      Legality = getScalarsVectorizationLegality(
          VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true);
    if (!Legality.isLegal()) {
      if (Legality.tryToFindDuplicates())
        tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S,
                            UserTreeIdx);

      // Bundle stays scalar: record a gather entry and stop recursing.
      newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
      return;
    }
    S = Legality.getInstructionsState();
  }

  // FIXME: investigate if there are profitable cases for VL.size() <= 4.
  if (S.isAltShuffle() && TrySplitNode(S))
    return;

  // Check that every instruction appears once in this bundle.
  if (!tryToFindDuplicates(VL, ReuseShuffleIndices, TTI: *TTI, TLI: *TLI, S, UserTreeIdx,
                           /*TryPad=*/true)) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Perform specific checks for each particular instruction kind.
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  StridedPtrInfo SPtrInfo;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps, SPtrInfo);
  if (State == TreeEntry::NeedToGather) {
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    return;
  }

  // Lazily create the per-block scheduler for the bundle's parent block.
  Instruction *VL0 = S.getMainOp();
  BasicBlock *BB = VL0->getParent();
  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(args&: BB);

  BlockScheduling &BS = *BSRef;

  // Schedule only the unique scalars; duplicates are handled via
  // ReuseShuffleIndices.
  SetVector<Value *> UniqueValues(llvm::from_range, VL);
  std::optional<ScheduleBundle *> BundlePtr =
      BS.tryScheduleBundle(VL: UniqueValues.getArrayRef(), SLP: this, S, EI: UserTreeIdx);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants
  BS.verify();
#endif
  if (!BundlePtr || (*BundlePtr && !*BundlePtr.value())) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    // Last chance to try to vectorize alternate node.
    if (S.isAltShuffle() && ReuseShuffleIndices.empty() && TrySplitNode(S))
      return;
    newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices);
    // Remember the failure so later attempts can bail out earlier.
    NonScheduledFirst.insert(Ptr: VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL: ArrayRef(VL));
    return;
  }
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  SmallVector<ValueList> Operands = Analysis.buildOperands(S, VL);
  ScheduleBundle Empty;
  ScheduleBundle &Bundle = BundlePtr.value() ? *BundlePtr.value() : Empty;
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  // Recurses into all operand bundles of TE, visiting non-PHI operands first.
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI nodes creation
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(VL: Op, TLI: *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTreeRec(VLRef: Op, Depth: Depth + 1, UserTreeIdx: {TE, I});
      else
        PHIOps.push_back(Elt: I);
    }
    for (unsigned I : PHIOps)
      buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
  };
  // Create the TreeEntry and recurse into operands per instruction kind.
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    TE->setOperands(Operands);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
      fixupOrderingIndices(Order: CurrentOrder);
    }
    // Insert new order with initial value 0, if it does not exist,
    // otherwise return the iterator to the existing one.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTreeRec() towards the operands.
    TE->setOperands(Operands);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    // Sort inserts by their element index (min-heap on the index) to detect
    // whether the bundle is an identity insertion sequence.
    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(Inst: VL[I]);
      Indices.emplace(args&: Idx, args&: I);
    }
    // NOTE: shadows the outer CurrentOrder; inserts compute their own order.
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices: {}, ReorderIndices: CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperands(Operands);
    // Only the inserted value (operand 1) is vectorized further.
    buildTreeRec(VLRef: TE->getOperand(OpIdx: 1), Depth: Depth + 1, UserTreeIdx: {TE, 1});
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(Order: CurrentOrder);
    // State (computed by getScalarsVectorizationState) selects the load
    // vectorization strategy.
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, ReorderIndices: CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::CompressVectorize:
      // Vectorizing non-consecutive loads with (masked)load + compress.
      TE = newTreeEntry(VL, EntryState: TreeEntry::CompressVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (masked LoadInst + compress).\n";
          TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, EntryState: TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, ReorderIndices: CurrentOrder);
      TreeEntryToStridedPtrInfoMap[TE] = SPtrInfo;
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, EntryState: TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs()
              << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::SplitVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    if (!CurrentOrder.empty() && State != TreeEntry::ScatterVectorize) {
      assert(Operands.size() == 1 && "Expected a single operand only");
      // Reorder the pointer operand to match the jumbled load order.
      SmallVector<int> Mask;
      inversePermutation(Indices: CurrentOrder, Mask);
      reorderScalars(Scalars&: Operands.front(), Mask);
    }
    TE->setOperands(Operands);
    if (State == TreeEntry::ScatterVectorize)
      buildTreeRec(VLRef: PointerOps, Depth: Depth + 1, UserTreeIdx: {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // Track the max/min bit widths seen across ext/trunc casts; used later
    // for bit-width minimization of the tree.
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        u: std::make_pair(x: std::numeric_limits<unsigned>::min(),
                        y: std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          x: std::max<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
                              b: PrevMaxBW),
          y: std::min<unsigned>(
              a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
              b: PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          x: std::max<unsigned>(
              a: DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()),
              b: PrevMaxBW),
          y: std::min<unsigned>(a: DL->getTypeSizeInBits(Ty: VL0->getType()),
                              b: PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth, UserTreeIdx: {TE, I});
    // Record operand nodes that may be candidates for bit-width reduction.
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(Val: VL0->getOperand(i: 0))) {
        APInt Mask = DB->getDemandedBits(I: OpI);
        NumSignBits = std::max(a: NumSignBits, b: Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
        ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    VLOperands Ops(VL, Operands, S, *this);
    if (cast<CmpInst>(Val: VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Operands.front() = Ops.getVL(OpIdx: 0);
      Operands.back() = Ops.getVL(OpIdx: 1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (auto [Idx, V] : enumerate(First&: VL)) {
        if (isa<PoisonValue>(Val: V))
          continue;
        auto *Cmp = cast<CmpInst>(Val: V);
        if (Cmp->getPredicate() != P0)
          std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
      }
    }
    TE->setOperands(Operands);
    buildTreeRec(VLRef: Operands.front(), Depth, UserTreeIdx: {TE, 0});
    buildTreeRec(VLRef: Operands.back(), Depth, UserTreeIdx: {TE, 1});
    // For integer compares, narrow operands may allow bit-width reduction.
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(Op: VL0->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 0)->getType()))
        ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(Op: VL0->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(Ty: VL0->getOperand(i: 1)->getType()))
        ExtraBitWidthNodes.insert(V: getOperandEntry(E: TE, Idx: 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    // For commutative binops, reorder operands to maximize same-opcode sides.
    if (isa<BinaryOperator>(Val: VL0) && isCommutative(I: VL0)) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(OpIdx: 0);
      Operands[1] = Ops.getVL(OpIdx: 1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    TE->setOperands(Operands);

    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTreeRec(VLRef: Operands[I], Depth: Depth + 1, UserTreeIdx: {TE, I});
    return;
  }
  case Instruction::Store: {
    // An empty order means the stores are already consecutive.
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(Order: CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, ReorderIndices: CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
          TE->dump());
    TE->setOperands(Operands);
    // Only the stored value (operand 0) is vectorized further.
    buildTreeRec(VLRef: TE->getOperand(OpIdx: 0), Depth: Depth + 1, UserTreeIdx: {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(Val: VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    if (isCommutative(I: VL0)) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(OpIdx: 0);
      Operands[1] = Ops.getVL(OpIdx: 1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI))
        continue;
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    // Either an alternate-opcode node (mixed main/alt instructions) or, under
    // REVEC, a bundle of actual shufflevector instructions.
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
          TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(Val: VL0);
    if (CI && any_of(Range&: VL, P: [](Value *V) {
          return !isa<PoisonValue>(Val: V) && !cast<CmpInst>(Val: V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(Val: S.getMainOp());
      auto *AltCI = cast<CmpInst>(Val: S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (auto [Idx, V] : enumerate(First&: VL)) {
        if (isa<PoisonValue>(Val: V))
          continue;
        auto *Cmp = cast<CmpInst>(Val: V);

        if (isAlternateInstruction(I: Cmp, MainOp: MainCI, AltOp: AltCI, TLI: *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
            std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(pred: Cmp->getPredicate()))
            std::swap(a&: Operands.front()[Idx], b&: Operands.back()[Idx]);
        }
      }
      TE->setOperands(Operands);
      buildTreeRec(VLRef: Operands.front(), Depth: Depth + 1, UserTreeIdx: {TE, 0});
      buildTreeRec(VLRef: Operands.back(), Depth: Depth + 1, UserTreeIdx: {TE, 1});
      return;
    }

    if (isa<BinaryOperator>(Val: VL0) || CI) {
      VLOperands Ops(VL, Operands, S, *this);
      Ops.reorder();
      Operands[0] = Ops.getVL(OpIdx: 0);
      Operands[1] = Ops.getVL(OpIdx: 1);
    }
    TE->setOperands(Operands);
    for (unsigned I : seq<unsigned>(Size: VL0->getNumOperands()))
      buildTreeRec(VLRef: TE->getOperand(OpIdx: I), Depth: Depth + 1, UserTreeIdx: {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
12336
12337unsigned BoUpSLP::canMapToVector(Type *T) const {
12338 unsigned N = 1;
12339 Type *EltTy = T;
12340
12341 while (isa<StructType, ArrayType, FixedVectorType>(Val: EltTy)) {
12342 if (EltTy->isEmptyTy())
12343 return 0;
12344 if (auto *ST = dyn_cast<StructType>(Val: EltTy)) {
12345 // Check that struct is homogeneous.
12346 for (const auto *Ty : ST->elements())
12347 if (Ty != *ST->element_begin())
12348 return 0;
12349 N *= ST->getNumElements();
12350 EltTy = *ST->element_begin();
12351 } else if (auto *AT = dyn_cast<ArrayType>(Val: EltTy)) {
12352 N *= AT->getNumElements();
12353 EltTy = AT->getElementType();
12354 } else {
12355 auto *VT = cast<FixedVectorType>(Val: EltTy);
12356 N *= VT->getNumElements();
12357 EltTy = VT->getElementType();
12358 }
12359 }
12360
12361 if (!isValidElementType(Ty: EltTy))
12362 return 0;
12363 size_t VTSize = DL->getTypeStoreSizeInBits(Ty: getWidenedType(ScalarTy: EltTy, VF: N));
12364 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
12365 VTSize != DL->getTypeStoreSizeInBits(Ty: T))
12366 return 0;
12367 return N;
12368}
12369
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  // Find the first real extract in the bundle; undefs may appear alongside
  // extracts, so the front element is not necessarily an instruction.
  const auto *It = find_if(Range&: VL, P: IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(Val: *It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(i: 0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    // For extractvalue the source must be a homogeneous aggregate that can be
    // reinterpreted as a vector; canMapToVector returns 0 if it cannot.
    NElts = canMapToVector(T: Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Val: Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(N: VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  // Per-lane extract index; PoisonMaskElem marks lanes that do not constrain
  // the order (non-instructions, undef indices, out-of-range indices).
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(First&: VL)) {
    auto *Inst = dyn_cast<Instruction>(Val: V);
    if (!Inst)
      continue;
    // Every extract must read from the very same source vector.
    if (Inst->getOperand(i: 0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst))
      if (isa<UndefValue>(Val: EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(E: Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  // All used indices must fit into a window of at most E elements.
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  // If the whole window already fits at the front of the vector, anchor it at
  // offset 0 instead of MinIdx.
  if (MaxIdx + 1 <= E)
    return false, void(), true ? (MinIdx = 0, true) : false;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(NumElts: E, Elt: E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    // Duplicate extract index - no valid order exists.
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  // An identity order carries no extra information - drop it.
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
12454
12455bool BoUpSLP::areAllUsersVectorized(
12456 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
12457 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(V: I))) ||
12458 all_of(Range: I->users(), P: [this](User *U) {
12459 return isVectorized(V: U) || isVectorLikeInstWithConstOps(V: U) ||
12460 (isa<ExtractElementInst>(Val: U) && MustGather.contains(Ptr: U));
12461 });
12462}
12463
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  // Builds the shuffle mask that blends the "main" and the "alternate"
  // vectorized values of an alternate-shuffle node: lane I selects element Idx
  // from the main vector or Sz + Idx from the alternate one, honoring the
  // node's reorder and reuse indices.
  unsigned Sz = Scalars.size();
  Mask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  SmallVector<int> OrderMask;
  // Invert the reorder so we can map a mask lane back to its scalar slot.
  if (!ReorderIndices.empty())
    inversePermutation(Indices: ReorderIndices, Mask&: OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    // Poison scalars stay PoisonMaskElem in the mask.
    if (isa<PoisonValue>(Val: Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Val: Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      // Alternate operation: select from the second (alt) source vector.
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(Elt: OpInst);
    } else {
      // Main operation: select from the first source vector.
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(Elt: OpInst);
    }
  }
  // Finally apply the reuse shuffle on top of the computed mask.
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(Range: ReuseShuffleIndices, d_first: NewMask.begin(), F: [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(RHS&: NewMask);
  }
}
12498
12499static bool isMainInstruction(Instruction *I, Instruction *MainOp,
12500 Instruction *AltOp,
12501 const TargetLibraryInfo &TLI) {
12502 return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == MainOp;
12503}
12504
/// Checks whether \p I belongs to the alternate (rather than the main)
/// operation of the (MainOp, AltOp) pair. Compares get special treatment
/// because a predicate may match either directly or in swapped form.
static bool isAlternateInstruction(Instruction *I, Instruction *MainOp,
                                   Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(Val: MainOp)) {
    auto *AltCI = cast<CmpInst>(Val: AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(Val: I);
    // Matching the main compare (directly or swapped) means "not alternate".
    if (isCmpSameOrSwapped(BaseCI: MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(BaseCI: AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(pred: P);

    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    // Anything that does not match the main predicate (in either orientation)
    // is treated as the alternate operation.
    return MainP != P && MainP != SwappedP;
  }
  return InstructionsState(MainOp, AltOp).getMatchingMainOpOrAltOp(I) == AltOp;
}
12528
12529TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) const {
12530 assert(!Ops.empty());
12531 const auto *Op0 = Ops.front();
12532
12533 const bool IsConstant = all_of(Range&: Ops, P: [](Value *V) {
12534 // TODO: We should allow undef elements here
12535 return isConstant(V) && !isa<UndefValue>(Val: V);
12536 });
12537 const bool IsUniform = all_of(Range&: Ops, P: [=](Value *V) {
12538 // TODO: We should allow undef elements here
12539 return V == Op0;
12540 });
12541 const bool IsPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12542 // TODO: We should allow undef elements here
12543 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12544 return CI->getValue().isPowerOf2();
12545 return false;
12546 });
12547 const bool IsNegatedPowerOfTwo = all_of(Range&: Ops, P: [](Value *V) {
12548 // TODO: We should allow undef elements here
12549 if (auto *CI = dyn_cast<ConstantInt>(Val: V))
12550 return CI->getValue().isNegatedPowerOf2();
12551 return false;
12552 });
12553
12554 TTI::OperandValueKind VK = TTI::OK_AnyValue;
12555 if (IsConstant && IsUniform)
12556 VK = TTI::OK_UniformConstantValue;
12557 else if (IsConstant)
12558 VK = TTI::OK_NonUniformConstantValue;
12559 else if (IsUniform)
12560 VK = TTI::OK_UniformValue;
12561
12562 TTI::OperandValueProperties VP = TTI::OP_None;
12563 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
12564 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
12565
12566 return {.Kind: VK, .Properties: VP};
12567}
12568
namespace {
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
protected:
  /// Scalar element type of the shuffled values. With REVEC this may itself
  /// be a FixedVectorType (see getVF below).
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(Val: V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    // Exact-size identity mask.
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
      if (Limit % VF == 0 && all_of(Range: seq<int>(Begin: 0, End: Limit / VF), P: [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(N: Idx * VF, M: VF);
            return all_of(Range&: Slice, P: equal_to(Arg: PoisonMaskElem)) ||
                   ShuffleVectorInst::isIdentityMask(Mask: Slice, NumSrcElts: VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    // Compose the masks: element I of the result selects whatever Mask
    // selects at the position ExtMask[I]. Indices are folded modulo the
    // mask/vector size to map second-operand references into local indices.
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(RHS&: NewMask);
  }

  /// Looks through shuffles trying to reduce final number of shuffles in the
  /// code. The function looks through the previously emitted shuffle
  /// instructions and properly mark indices in mask as undef.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If 2 operands are of different size, the smallest one will be resized and
  /// the mask recalculated properly.
  /// For example, given the code
  /// \code
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  /// \endcode
  /// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
  /// look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations to simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p Mask.
  /// If the better candidate is found, \p V is set to this best candidate
  /// vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is found
  /// during looking-through-shuffles attempt, it is updated accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles procedure
  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false - otherwise.
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    // Walk down the chain of shufflevector definitions, composing masks as we
    // go, until the operand is not a shuffle or both operands are live.
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Op)) {
      // Exit if not a fixed vector type or changing size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(Val: SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, VecTy: SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(Mask: IdentityMask,
                                                    NumSrcElts: IdentityMask.size()))) {
          IdentityOp = SV;
          // Store current mask in the IdentityMask so later we do not lose
          // this info if IdentityOp is selected as the best candidate for the
          // permutation.
          IdentityMask.assign(RHS: Mask);
        }
      }
      // Remember the broadcast mask. If no better candidates are found, this Op
      // and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if need to shuffle the vector with the mask <3, 1, 2, 0>, which is
      // expensive, the analysis finds out, that the source vector is just a
      // broadcast, this original mask can be transformed to identity mask <0,
      // 1, 2, 3>.
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // \endcode
      // may be transformed to
      // \code
      // %0 = shuffle %v, poison, zeroinitalizer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      // \endcode
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(RHS: Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      // Compose the current mask with SV's own mask to see which of SV's
      // operands are actually referenced.
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(First&: Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(Elt: I);
      }
      bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
                            V: SV->getOperand(i_nocapture: 0),
                            UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
                            V: SV->getOperand(i_nocapture: 1),
                            UseMask: buildUseMask(VF: LocalVF, Mask: ExtMask, MaskArg: UseMask::SecondArg))
                            .all();
      // Both operands are live - cannot look through this shuffle; only mark
      // lanes that resolve to poison and stop.
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(Elt: I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      // Exactly one operand is live - fold SV's mask into ours and descend
      // into that operand.
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, Mask&: ShuffleMask, ExtMask: Mask);
      Mask.swap(RHS&: ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(i_nocapture: 0);
      else
        Op = SV->getOperand(i_nocapture: 1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Val: Op->getType());
        !OpTy || !isIdentityMask(Mask, VecTy: OpTy, IsStrict: SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size())) {
      // The final mask is not an identity - fall back to the best remembered
      // identity/broadcast candidate, if any.
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(First&: Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(RHS&: IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(Val: V);
        return SinglePermute &&
               (isIdentityMask(Mask, VecTy: cast<FixedVectorType>(Val: V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts: Mask.size()) &&
                 all_of(Range: enumerate(First&: Mask), P: [&](const auto &P) {
                   return P.value() == PoisonMaskElem ||
                          Shuffle->getShuffleMask()[P.index()] == 0;
                 })));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }

  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction.
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder, Type *ScalarTy) {
    assert(V1 && "Expected at least one vector value.");
    unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
    SmallVector<int> NewMask(Mask);
    // With REVEC, scale scalar mask indices to vector-element granularity.
    if (ScalarTyNumElements != 1) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask&: NewMask);
      Mask = NewMask;
    }
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(Val: V1->getType()))
      VF = FTy->getNumElements();
    // Two-source shuffle: only taken when V2 is actually referenced by the
    // mask (otherwise the single-source path below applies).
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                  V: V2, UseMask: buildUseMask(VF, Mask, MaskArg: UseMask::SecondArg))
                  .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
      // Split the combined mask into per-operand masks.
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      // Iterate to a fixed point: each round may peel one more layer of
      // shuffles off either operand.
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(V&: Op1, Mask&: CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(V&: Op2, Mask&: CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Val: Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Val: Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(First&: CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(Elt: I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                VF: cast<FixedVectorType>(Val: SV1->getOperand(i_nocapture: 1)->getType())
                    ->getNumElements(),
                Mask: ExtMask1, MaskArg: UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(First&: CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(Elt: I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                VF: cast<FixedVectorType>(Val: SV2->getOperand(i_nocapture: 1)->getType())
                    ->getNumElements(),
                Mask: ExtMask2, MaskArg: UseMask::SecondArg);
            // Both are single-source resizing shuffles of same-typed sources:
            // fold their masks and descend into the sources.
            if (SV1->getOperand(i_nocapture: 0)->getType() ==
                    SV2->getOperand(i_nocapture: 0)->getType() &&
                SV1->getOperand(i_nocapture: 0)->getType() != SV1->getType() &&
                isUndefVector(V: SV1->getOperand(i_nocapture: 1), UseMask: UseMask1).all() &&
                isUndefVector(V: SV2->getOperand(i_nocapture: 1), UseMask: UseMask2).all()) {
              Op1 = SV1->getOperand(i_nocapture: 0);
              Op2 = SV2->getOperand(i_nocapture: 0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, Mask&: ShuffleMask1, ExtMask: CombinedMask1);
              CombinedMask1.swap(RHS&: ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Val: Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, Mask&: ShuffleMask2, ExtMask: CombinedMask2);
              CombinedMask2.swap(RHS&: ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(a: cast<VectorType>(Val: Op1->getType())
                           ->getElementCount()
                           .getKnownMinValue(),
                    b: cast<VectorType>(Val: Op2->getType())
                           ->getElementCount()
                           .getKnownMinValue());
      // Merge the two per-operand masks back into a single two-source mask.
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      // If both sides resolved to the same vector and the mask degenerated to
      // an identity (or matching splat), no shuffle is needed at all.
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(Mask: CombinedMask1, NumSrcElts: VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(Mask: CombinedMask1, NumSrcElts: VF) &&
            isa<ShuffleVectorInst>(Val: Op1) &&
            cast<ShuffleVectorInst>(Val: Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(T: Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(Val: V1))
      return Builder.createPoison(
          cast<VectorType>(Val: V1->getType())->getElementType(), Mask.size());
    // Single-source permutation.
    bool IsIdentity = peekThroughShuffles(V&: V1, Mask&: NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }

  /// Transforms mask \p CommonMask per given \p Mask to make proper set after
  /// shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    // After a shuffle is emitted, every live lane holds exactly the value at
    // its own position, so the common mask becomes the identity on those lanes.
    for (unsigned I : seq<unsigned>(Size: CommonMask.size()))
      if (Mask[I] != PoisonMaskElem)
        CommonMask[I] = I;
  }
};
} // namespace
12925
12926/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
/// \returns a pair (scalar pointer-chain cost, vector pointer-chain cost) for
/// the pointers \p Ptrs with base \p BasePtr, used by \p Opcode instructions.
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plane wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // between them.
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
    // loads/stores.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, Base: BasePtr, Info: TTI::PointersChainInfo::getUnitStride(), AccessTy: ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      // The base pointer itself is always needed by the wide load/store.
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(Elt: V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(Elt: V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
      return std::make_pair(x: TTI::TCC_Free, y: TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(Ptrs: PtrsRetainedInVecCode, Base: BasePtr,
                                       Info: TTI::PointersChainInfo::getKnownStride(),
                                       AccessTy: VecTy, CostKind);
  } else {
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    TTI::PointersChainInfo PtrsInfo =
        all_of(Range&: Ptrs,
               P: [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(Val: V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, Base: BasePtr, Info: PtrsInfo, AccessTy: ScalarTy, CostKind);
    // Find some GEP to estimate the vector-side address computation; fall
    // back to any GEP among Ptrs if BasePtr itself is not one.
    auto *BaseGEP = dyn_cast<GEPOperator>(Val: BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Range&: Ptrs, P: IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(Val: *It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(PointeeType: BaseGEP->getSourceElementType(),
                               Ptr: BaseGEP->getPointerOperand(), Operands: Indices, AccessType: VecTy,
                               CostKind);
    }
  }

  return std::make_pair(x&: ScalarCost, y&: VecCost);
}
13007
// Tries to reorder the scalars of gather node \p TE so that equal values (and
// potentially vectorizable clusters) become adjacent, then keeps the
// reordering only if the estimated cost beats plain buildvector emission.
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it small (just 2 elements), all-constant or all
  // instructions have same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(Range&: TE.Scalars, P: isConstant))
    return;

  // Skip if an earlier tree entry already covers the very same scalars.
  if (any_of(Range: seq<unsigned>(Size: TE.Idx), P: [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(VL: TE.Scalars);
      }))
    return;

  // Subkey generator for loads: tries to give related (consecutive or
  // compatible-pointer) loads the same subkey so they cluster together.
  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
    Value *Ptr =
        getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
    if (LoadKeyUsed.contains(V: Key)) {
      auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
      if (LIt != LoadsMap.end()) {
        // Prefer matching a load at a known constant distance.
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
                              ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL: *DL, SE&: *SE,
                              /*StrictCheck=*/true))
            return hash_value(ptr: RLI->getPointerOperand());
        }
        // Next best: loads whose pointers are at least compatible.
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
                                    Ptr2: LI->getPointerOperand(), TLI: *TLI)) {
            hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
            return SubKey;
          }
        }
        // Enough loads in this bucket already - just join it.
        if (LIt->second.size() > 2) {
          hash_code SubKey =
              hash_value(ptr: LIt->second.back()->getPointerOperand());
          return SubKey;
        }
      }
    }
    LoadKeyUsed.insert(V: Key);
    LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr)).first->second.push_back(Elt: LI);
    return hash_value(ptr: LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra vectorized
  // nodes.
  for (auto [I, V] : enumerate(First&: TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(Val: V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(Val: V) &&
        !isDeleted(I: Inst) && !isVectorized(V)) {
      std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
                                        /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    // Detect whether the scalars already appear in clustered order; if any
    // value lands away from the tail of its cluster, a reorder may help.
    if (IsOrdered && !KeyToIndex.contains(Val: V) &&
        !(isa<Constant, ExtractElementInst>(Val: V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Key: Idx) &&
          KeyToIndex.at(Val: Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Key: Idx) &&
          KeyToIndex.at(Val: Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(Elt: V);
    KTI.push_back(Elt: I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    // Materialize the clustered order: rewrite Scalars/ReorderIndices so that
    // values with equal (key, subkey) are contiguous.
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(N: TE.Scalars.size(), NV: TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(Val: V);
          for (auto [K, Idx] : enumerate(First&: Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(Val: P.second.front())) {
          // A multi-element instruction cluster may become a vectorized
          // subvector insert; its lanes no longer need scalar insertion.
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              TTI: *TTI, Ty: TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Args: Cnt - Sz, Args: SubVF);
          for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt - Sz + SubVF))
            DemandedElts.clearBit(BitPosition: I);
        } else if (!P.second.empty() && isConstant(V: P.second.front())) {
          // Constant lanes do not need insertion either.
          for (unsigned I : seq<unsigned>(Begin: Cnt - Sz, End: Cnt))
            DemandedElts.clearBit(BitPosition: I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider it as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, VF: TE.Scalars.size());
  // Cost of the reordered variant: subvector inserts + scalar inserts for the
  // remaining demanded lanes + the final permute back into original order.
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: VecTy, Mask: {}, CostKind,
                             Index: Idx, SubTp: getWidenedType(ScalarTy, VF: Sz));
  }
  Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
                                   /*Insert=*/true,
                                   /*Extract=*/false, CostKind);
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *V = TE.getOrdered(Idx: I);
    if (isa<PoisonValue>(Val: V)) {
      ReorderMask[I] = PoisonMaskElem;
    } else if (isConstant(V) || DemandedElts[I]) {
      ReorderMask[I] = I + TE.ReorderIndices.size();
    }
  }
  Cost += ::getShuffleCost(TTI: *TTI,
                           Kind: any_of(Range&: ReorderMask, P: [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           Tp: VecTy, Mask: ReorderMask);
  // Cost of the straightforward buildvector alternative (no reordering):
  // insert all non-constant lanes, blend constants in with a shuffle.
  DemandedElts = APInt::getAllOnes(numBits: TE.Scalars.size());
  ReorderMask.assign(NumElts: Sz, Elt: PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Size: Sz)) {
    Value *V = TE.getOrdered(Idx: I);
    if (isConstant(V)) {
      DemandedElts.clearBit(BitPosition: I);
      if (!isa<PoisonValue>(Val: V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost =
      getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
                               /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy, Mask: ReorderMask);
  // Keep the reordering only if it is strictly cheaper; otherwise restore the
  // original scalar order.
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(Scalars&: TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
13169
/// Check if we can convert fadd/fsub sequence to FMAD.
/// The conversion is legal only when both the fadd/fsub instructions and the
/// fmul instructions feeding them carry the "contract" fast-math flag; it is
/// considered profitable when the estimated cost of the fused llvm.fmuladd
/// calls is below the cost of the separate fmul + fadd/fsub instructions.
/// \param VL the scalar fadd/fsub-like values being analyzed.
/// \param S instructions state describing main/alternate opcodes of \p VL.
/// \returns Cost of the FMAD, if conversion is possible, invalid cost otherwise.
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
                                       const InstructionsState &S,
                                       DominatorTree &DT, const DataLayout &DL,
                                       TargetTransformInfo &TTI,
                                       const TargetLibraryInfo &TLI) {
  assert(all_of(VL,
                [](Value *V) {
                  return V->getType()->getScalarType()->isFloatingPointTy();
                }) &&
         "Can only convert to FMA for floating point types");
  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");

  // Returns true if the intersection of the fast-math flags of all relevant
  // instructions in the list (skipping non-instructions, copyable elements
  // and instructions not matching the main/alt opcode) allows contraction.
  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
    FastMathFlags FMF;
    FMF.set();
    for (Value *V : VL) {
      auto *I = dyn_cast<Instruction>(Val: V);
      if (!I)
        continue;
      // Copyable elements are modeled operations, not real ones - skip.
      if (S.isCopyableElement(V: I))
        continue;
      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
        continue;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
        FMF &= FPCI->getFastMathFlags();
    }
    return FMF.allowContract();
  };
  if (!CheckForContractable(VL))
    return InstructionCost::getInvalid();
  // fmul also should be contractable
  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);

  // The first operand list must be a uniform (non-alternate) FMul sequence.
  InstructionsState OpS = getSameOpcode(VL: Operands.front(), TLI);
  if (!OpS.valid())
    return InstructionCost::getInvalid();

  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
    return InstructionCost::getInvalid();
  if (!CheckForContractable(Operands.front()))
    return InstructionCost::getInvalid();
  // Compare the costs.
  // Cost of the separate fmul + fadd/fsub scalar instructions.
  InstructionCost FMulPlusFAddCost = 0;
  // Cost of the fused form (fmuladd calls plus any leftover instructions).
  InstructionCost FMACost = 0;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  FastMathFlags FMF;
  FMF.set();
  for (Value *V : VL) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      continue;
    if (!S.isCopyableElement(V: I))
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
        FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
  }
  // Count the lanes that can actually be fused; lanes whose fmul has extra
  // uses (or is copyable/non-instruction) keep their original instructions.
  unsigned NumOps = 0;
  for (auto [V, Op] : zip(t&: VL, u&: Operands.front())) {
    if (S.isCopyableElement(V))
      continue;
    auto *I = dyn_cast<Instruction>(Val: Op);
    if (!I || !I->hasOneUse() || OpS.isCopyableElement(V: I)) {
      // Not fusable: both the add/sub and (if present) the mul stay as-is.
      if (auto *OpI = dyn_cast<Instruction>(Val: V))
        FMACost += TTI.getInstructionCost(U: OpI, CostKind);
      if (I)
        FMACost += TTI.getInstructionCost(U: I, CostKind);
      continue;
    }
    ++NumOps;
    if (auto *FPCI = dyn_cast<FPMathOperator>(Val: I))
      FMF &= FPCI->getFastMathFlags();
    FMulPlusFAddCost += TTI.getInstructionCost(U: I, CostKind);
  }
  Type *Ty = VL.front()->getType();
  // One llvm.fmuladd call per fusable lane, with the intersected FMF.
  IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
  FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
  return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
}
13252
/// Checks whether the given Shl node matches the "pack narrow elements into
/// one wide integer" pattern, i.e. a Shl of (zext i<Stride> to i<Sz>) by the
/// gathered constants (0, Stride, 2 * Stride, ..., N - Stride), possibly
/// permuted, which (combined with the or-reduction costed below) is
/// equivalent to bitcasting the narrow source vector to a single wide
/// integer.
/// \param TE the Shl tree entry to analyze.
/// \param Order [out] the permutation required on the narrow source elements
/// before the bitcast; left empty when the shift amounts are already
/// consecutive or when the reversal is folded into a bswap.
/// \param IsBSwap [out] set when the reversed byte order is modeled as
/// bitcast + llvm.bswap.
/// \param ForLoads [out] set when the zext source is a plain vectorized load
/// node and the pattern is modeled as a single wide scalar load instead.
/// \returns true if the bitcast-based lowering is estimated cheaper than the
/// vectorized shl + zext + or-reduction.
bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
                             bool &IsBSwap, bool &ForLoads) const {
  assert(TE.hasState() && TE.getOpcode() == Instruction::Shl &&
         "Expected Shl node.");
  IsBSwap = false;
  ForLoads = false;
  // Only plain vectorized Shl nodes without reordering/reuses/bitwidth
  // minimization, whose scalars all have a single use, are supported.
  if (TE.State != TreeEntry::Vectorize || !TE.ReorderIndices.empty() ||
      !TE.ReuseShuffleIndices.empty() || MinBWs.contains(Val: &TE) ||
      any_of(Range: TE.Scalars, P: [](Value *V) { return !V->hasOneUse(); }))
    return false;
  Type *ScalarTy = TE.getMainOp()->getType();
  // TODO: Check if same can be done for the vector types.
  if (!ScalarTy->isIntegerTy())
    return false;
  if (ScalarTy->isVectorTy())
    return false;
  const unsigned Sz = DL->getTypeSizeInBits(Ty: ScalarTy);
  const TreeEntry *LhsTE = getOperandEntry(E: &TE, /*Idx=*/0);
  const TreeEntry *RhsTE = getOperandEntry(E: &TE, /*Idx=*/1);
  // Lhs should be zext i<stride> to I<sz>.
  if (!(LhsTE->State == TreeEntry::Vectorize &&
        LhsTE->getOpcode() == Instruction::ZExt &&
        LhsTE->ReorderIndices.empty() && LhsTE->ReuseShuffleIndices.empty() &&
        !MinBWs.contains(Val: LhsTE) &&
        all_of(Range: LhsTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })))
    return false;
  Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
  unsigned Stride = DL->getTypeSizeInBits(Ty: SrcScalarTy);
  // The narrow elements must tile the wide type exactly.
  if (!isPowerOf2_64(Value: Stride) || Stride >= Sz || Sz % Stride != 0 ||
      !isPowerOf2_64(Value: LhsTE->getVectorFactor()))
    return false;
  // Rhs (the shift amounts) must be a plain gather of constants.
  if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
        RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(Val: RhsTE)))
    return false;
  Order.clear();
  unsigned CurrentValue = 0;
  // Rhs should be (0, Stride, 2 * Stride, ..., N-Stride), where N <= Sz.
  if (all_of(Range: RhsTE->Scalars,
             P: [&](Value *V) {
               CurrentValue += Stride;
               if (isa<UndefValue>(Val: V))
                 return true;
               auto *C = dyn_cast<Constant>(Val: V);
               if (!C)
                 return false;
               return C->getUniqueInteger() == CurrentValue - Stride;
             }) &&
      CurrentValue <= Sz) {
    // Shift amounts already consecutive - no reordering required.
    Order.clear();
  } else {
    const unsigned VF = RhsTE->getVectorFactor();
    Order.assign(NumElts: VF, Elt: VF);
    // Track which logical positions we've seen; reject duplicate shift amounts.
    SmallBitVector SeenPositions(VF);
    // Check if need to reorder Rhs to make it in form (0, Stride, 2 * Stride,
    // ..., N-Stride), where N <= Sz.
    if (VF * Stride > Sz)
      return false;
    for (const auto [Idx, V] : enumerate(First: RhsTE->Scalars)) {
      if (isa<UndefValue>(Val: V))
        continue;
      auto *C = dyn_cast<Constant>(Val: V);
      if (!C)
        return false;
      const APInt &Val = C->getUniqueInteger();
      // Shift amount must be a non-negative, in-range multiple of Stride.
      if (Val.isNegative() || Val.uge(RHS: Sz) || Val.getZExtValue() % Stride != 0)
        return false;
      unsigned Pos = Val.getZExtValue() / Stride;
      // TODO: Support Pos >= VF, in this case need to shift the final value.
      if (Order[Idx] != VF || Pos >= VF)
        return false;
      if (SeenPositions.test(Idx: Pos))
        return false;
      SeenPositions.set(Pos);
      Order[Idx] = Pos;
    }
    // One of the indices not set - exit.
    if (is_contained(Range&: Order, Element: VF))
      return false;
  }
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // The wide integer type produced by bitcasting the narrow source vector.
  auto *SrcType = IntegerType::getIntNTy(C&: ScalarTy->getContext(),
                                         N: Stride * LhsTE->getVectorFactor());
  FastMathFlags FMF;
  SmallPtrSet<Value *, 4> CheckedExtracts;
  auto *VecTy = getWidenedType(ScalarTy, VF: TE.getVectorFactor());
  auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
  TTI::CastContextHint CastCtx =
      getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
  // Cost of the vectorized form: or-reduction + vector shl + vector zext.
  InstructionCost VecCost =
      TTI->getArithmeticReductionCost(Opcode: Instruction::Or, Ty: VecTy, FMF, CostKind) +
      TTI->getArithmeticInstrCost(Opcode: Instruction::Shl, Ty: VecTy, CostKind,
                                  Opd1Info: getOperandInfo(Ops: LhsTE->Scalars)) +
      TTI->getCastInstrCost(
          Opcode: Instruction::ZExt, Dst: VecTy,
          Src: getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor()), CCH: CastCtx,
          CostKind);
  InstructionCost BitcastCost = TTI->getCastInstrCost(
      Opcode: Instruction::BitCast, Dst: SrcType, Src: SrcVecTy, CCH: CastCtx, CostKind);
  if (!Order.empty()) {
    fixupOrderingIndices(Order);
    SmallVector<int> Mask;
    inversePermutation(Indices: Order, Mask);
    // Account for the shuffle needed to put the elements in order first.
    BitcastCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: SrcVecTy,
                                    Mask, CostKind);
  }
  // Check if the combination can be modeled as a bitcast+byteswap operation.
  constexpr unsigned ByteSize = 8;
  if (!Order.empty() && isReverseOrder(Order) &&
      DL->getTypeSizeInBits(Ty: SrcScalarTy) == ByteSize) {
    IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
    InstructionCost BSwapCost =
        TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: SrcType, Src: SrcVecTy, CCH: CastCtx,
                              CostKind) +
        TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
    if (BSwapCost <= BitcastCost) {
      // bitcast + bswap subsumes the reversing shuffle.
      BitcastCost = BSwapCost;
      IsBSwap = true;
      Order.clear();
      // Check for loads in the ZExt node.
      const TreeEntry *SrcTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
      if (SrcTE->State == TreeEntry::Vectorize &&
          SrcTE->ReorderIndices.empty() && SrcTE->ReuseShuffleIndices.empty() &&
          SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
          all_of(Range: SrcTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })) {
        auto *LI = cast<LoadInst>(Val: SrcTE->getMainOp());
        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
        // Model as a single wide scalar load + bswap, if even cheaper.
        InstructionCost BSwapCost =
            TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI->getAlign(),
                                 AddressSpace: LI->getPointerAddressSpace(), CostKind) +
            TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
        if (BSwapCost <= BitcastCost) {
          // The vector form keeps its vector load - add it for a fair compare.
          VecCost +=
              TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcVecTy, Alignment: LI->getAlign(),
                                   AddressSpace: LI->getPointerAddressSpace(), CostKind);
          BitcastCost = BSwapCost;
          ForLoads = true;
        }
      }
    }
  } else if (Order.empty() && DL->getTypeSizeInBits(Ty: SrcScalarTy) == ByteSize) {
    // Check for loads in the ZExt node.
    const TreeEntry *SrcTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
    if (SrcTE->State == TreeEntry::Vectorize && SrcTE->ReorderIndices.empty() &&
        SrcTE->ReuseShuffleIndices.empty() &&
        SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
        all_of(Range: SrcTE->Scalars, P: [](Value *V) { return V->hasOneUse(); })) {
      auto *LI = cast<LoadInst>(Val: SrcTE->getMainOp());
      // Bytes already in order: model as a single wide scalar load directly.
      BitcastCost =
          TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI->getAlign(),
                               AddressSpace: LI->getPointerAddressSpace(), CostKind);
      VecCost +=
          TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: SrcVecTy, Alignment: LI->getAlign(),
                               AddressSpace: LI->getPointerAddressSpace(), CostKind);
      ForLoads = true;
    }
  }
  // The combined value may be narrower than the shl result type - account for
  // the final extension.
  if (SrcType != ScalarTy) {
    BitcastCost += TTI->getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
                                         CCH: TTI::CastContextHint::None, CostKind);
  }
  return BitcastCost < VecCost;
}
13416
/// Checks whether the given select node (containing zext scalars) sits on top
/// of an alternate-vectorized compare node where part of the compares use the
/// inversed predicate of the main one, so the whole group can be emitted as a
/// single vector compare (inverting the affected lanes) instead of a
/// buildvector of scalar compares.
/// \param SelectTE the select tree entry to analyze.
/// \param InversedCmpsIndices [out] indices of the compare lanes that use the
/// inversed predicate and therefore need their result flipped.
/// \returns true if the single vector compare is estimated cheaper than
/// building the compare results from scalars.
bool BoUpSLP::matchesInversedZExtSelect(
    const TreeEntry &SelectTE,
    SmallVectorImpl<unsigned> &InversedCmpsIndices) const {
  assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
         "Expected select node.");
  // Collect the zext scalars of the select node; bail out if there are none.
  SmallVector<std::pair<Instruction *, unsigned>> ZExts;
  for (auto [Idx, V] : enumerate(First: SelectTE.Scalars)) {
    auto *Inst = dyn_cast<Instruction>(Val: V);
    if (!Inst || Inst->getOpcode() != Instruction::ZExt)
      continue;
    ZExts.emplace_back(Args&: Inst, Args&: Idx);
  }
  if (ZExts.empty())
    return false;
  const auto *CmpTE = getOperandEntry(E: &SelectTE, Idx: 0);
  const auto *Op1TE = getOperandEntry(E: &SelectTE, Idx: 1);
  const auto *Op2TE = getOperandEntry(E: &SelectTE, Idx: 2);
  // Compares must be alternate vectorized, and other operands must be gathers
  // or copyables.
  // TODO: investigate opportunity for reordered/reused nodes.
  if (CmpTE->State != TreeEntry::Vectorize || !CmpTE->isAltShuffle() ||
      (CmpTE->getOpcode() != Instruction::ICmp &&
       CmpTE->getOpcode() != Instruction::FCmp) ||
      !CmpTE->ReorderIndices.empty() || !CmpTE->ReuseShuffleIndices.empty() ||
      !Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
      !Op2TE->ReorderIndices.empty() || !Op2TE->ReuseShuffleIndices.empty())
    return false;
  // The operands must be buildvectors/copyables.
  if (!Op1TE->isGather() || !Op2TE->isGather())
    return false;
  // TODO: investigate opportunity for the vector nodes with copyables.
  auto *Cmp = CmpTE->getMainOp();
  CmpPredicate Pred;
  auto MatchCmp = m_Cmp(Pred, L: m_Value(), R: m_Value());
  if (!match(V: Cmp, P: MatchCmp))
    return false;
  CmpPredicate MainPred = Pred;
  CmpPredicate InversedPred(CmpInst::getInversePredicate(pred: Pred),
                            Pred.hasSameSign());
  // Every compare lane must use either the main predicate or its exact
  // inverse; record the inversed lanes (which must be single-use so the
  // flipped result is not observed elsewhere).
  for (const auto [Idx, V] : enumerate(First: CmpTE->Scalars)) {
    if (!match(V, P: MatchCmp))
      continue;
    if (CmpPredicate::getMatching(A: MainPred, B: Pred))
      continue;
    if (!CmpPredicate::getMatching(A: InversedPred, B: Pred))
      return false;
    if (!V->hasOneUse())
      return false;
    InversedCmpsIndices.push_back(Elt: Idx);
  }

  if (InversedCmpsIndices.empty())
    return false;
  VectorType *VecTy =
      getWidenedType(ScalarTy: Cmp->getOperand(i: 0)->getType(), VF: CmpTE->getVectorFactor());
  Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: VecTy);

  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  // Cost of the single vector compare with the main predicate.
  InstructionCost VecCost =
      TTI->getCmpSelInstrCost(Opcode: CmpTE->getOpcode(), ValTy: VecTy, CondTy: CmpTy, VecPred: MainPred,
                              CostKind, Op1Info: getOperandInfo(Ops: CmpTE->getOperand(OpIdx: 0)),
                              Op2Info: getOperandInfo(Ops: CmpTE->getOperand(OpIdx: 1)));
  // Cost of the alternative: scalar compares inserted into a vector.
  InstructionCost BVCost =
      ::getScalarizationOverhead(TTI: *TTI, ScalarTy: Cmp->getType(), Ty: cast<VectorType>(Val: CmpTy),
                                 DemandedElts: APInt::getAllOnes(numBits: CmpTE->getVectorFactor()),
                                 /*Insert=*/true, /*Extract=*/false, CostKind);
  for (Value *V : CmpTE->Scalars) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      continue;
    BVCost += TTI->getInstructionCost(U: I, CostKind);
  }
  return VecCost < BVCost;
}
13491
/// Checks whether the given select node represents an or-reduction of
/// select(cond_i, 1 << i, 0) values, which on a little-endian target is
/// equivalent to bitcasting the vector of conditions to a single integer.
/// Requires an active reduction (UserIgnoreList) whose operations are all
/// 'or' instructions, the true-operands to be the powers of two
/// 1, 2, 4, ... (matching their lane index) and the false-operands to be
/// all zeroes.
/// \param SelectTE the select tree entry to analyze.
/// \returns true if the bitcast-based lowering is estimated no more
/// expensive than the vector select + or-reduction.
bool BoUpSLP::matchesSelectOfBits(const TreeEntry &SelectTE) const {
  assert(SelectTE.hasState() && SelectTE.getOpcode() == Instruction::Select &&
         "Expected select node.");
  // The bit-packing trick relies on lane i mapping to bit i - little-endian
  // only.
  if (DL->isBigEndian())
    return false;
  if (!SelectTE.ReorderIndices.empty() || !SelectTE.ReuseShuffleIndices.empty())
    return false;
  // Only applies when vectorizing a reduction.
  if (!UserIgnoreList)
    return false;
  if (any_of(Range: SelectTE.Scalars, P: [](Value *V) { return !V->hasOneUse(); }))
    return false;
  // Check that all reduction operands are or instructions.
  if (any_of(Range: *UserIgnoreList,
             P: [](Value *V) { return !match(V, P: m_Or(L: m_Value(), R: m_Value())); }))
    return false;
  const TreeEntry *Op1TE = getOperandEntry(E: &SelectTE, Idx: 1);
  const TreeEntry *Op2TE = getOperandEntry(E: &SelectTE, Idx: 2);
  if (!Op1TE->isGather() || !Op2TE->isGather())
    return false;
  // No need to check for zeroes reordering.
  if (!Op1TE->ReorderIndices.empty() || !Op1TE->ReuseShuffleIndices.empty() ||
      !Op2TE->ReuseShuffleIndices.empty())
    return false;
  Type *ScalarTy = Op1TE->Scalars.front()->getType();
  if (!ScalarTy->isIntegerTy())
    return false;
  // Check that second operand is all zeroes.
  if (any_of(Range: Op2TE->Scalars, P: [](Value *V) { return !match(V, P: m_ZeroInt()); }))
    return false;
  // Check that first operand is 1,2,4,...
  if (any_of(Range: enumerate(First: Op1TE->Scalars), P: [](const auto &P) {
        uint64_t V;
        return !(match(P.value(), m_ConstantInt(V)) && isPowerOf2_64(Value: V) &&
                 Log2_64(Value: V) == P.index());
      }))
    return false;
  // Check if bitcast is cheaper than select.
  // DstTy is an integer with one bit per select lane.
  auto *DstTy = IntegerType::getIntNTy(C&: ScalarTy->getContext(),
                                       N: SelectTE.getVectorFactor());
  VectorType *OpTy = getWidenedType(ScalarTy: DstTy, VF: SelectTE.getVectorFactor());
  Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: OpTy);
  VectorType *VecTy = getWidenedType(ScalarTy, VF: SelectTE.getVectorFactor());
  // Use the minimized bitwidth for the select cost, if one was computed.
  auto It = MinBWs.find(Val: &SelectTE);
  if (It != MinBWs.end()) {
    auto *EffectiveScalarTy =
        IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
    VecTy = getWidenedType(ScalarTy: EffectiveScalarTy, VF: SelectTE.getVectorFactor());
  }
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost BitcastCost = TTI->getCastInstrCost(
      Opcode: Instruction::BitCast, Dst: DstTy, Src: CmpTy, CCH: TTI::CastContextHint::None, CostKind);
  // Extend the packed bits back to the original scalar width, if narrower.
  if (DstTy != ScalarTy) {
    BitcastCost += TTI->getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: DstTy,
                                         CCH: TTI::CastContextHint::None, CostKind);
  }
  FastMathFlags FMF;
  // Cost of the alternative: vector select + or-reduction.
  InstructionCost SelectCost =
      TTI->getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy, CondTy: CmpTy,
                              VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind,
                              Op1Info: getOperandInfo(Ops: Op1TE->Scalars),
                              Op2Info: getOperandInfo(Ops: Op2TE->Scalars)) +
      TTI->getArithmeticReductionCost(Opcode: Instruction::Or, Ty: VecTy, FMF, CostKind);
  return BitcastCost <= SelectCost;
}
13556
13557void BoUpSLP::transformNodes() {
13558 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13559 BaseGraphSize = VectorizableTree.size();
13560 // Turn graph transforming mode on and off, when done.
13561 class GraphTransformModeRAAI {
13562 bool &SavedIsGraphTransformMode;
13563
13564 public:
13565 GraphTransformModeRAAI(bool &IsGraphTransformMode)
13566 : SavedIsGraphTransformMode(IsGraphTransformMode) {
13567 IsGraphTransformMode = true;
13568 }
13569 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
13570 } TransformContext(IsGraphTransformMode);
13571 // Operands are profitable if they are:
13572 // 1. At least one constant
13573 // or
13574 // 2. Splats
13575 // or
13576 // 3. Results in good vectorization opportunity, i.e. may generate vector
13577 // nodes and reduce cost of the graph.
13578 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
13579 const InstructionsState &S) {
13580 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
13581 for (unsigned Op : seq<unsigned>(Size: S.getMainOp()->getNumOperands()))
13582 Candidates.emplace_back().emplace_back(Args: I1->getOperand(i: Op),
13583 Args: I2->getOperand(i: Op));
13584 return all_of(
13585 Range&: Candidates, P: [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
13586 return all_of(Range&: Cand,
13587 P: [](const std::pair<Value *, Value *> &P) {
13588 return isa<Constant>(Val: P.first) ||
13589 isa<Constant>(Val: P.second) || P.first == P.second;
13590 }) ||
13591 findBestRootPair(Candidates: Cand, Limit: LookAheadHeuristics::ScoreSplatLoads);
13592 });
13593 };
13594
13595 // Try to reorder gather nodes for better vectorization opportunities.
13596 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
13597 TreeEntry &E = *VectorizableTree[Idx];
13598 if (E.isGather())
13599 reorderGatherNode(TE&: E);
13600 }
13601
13602 // Better to use full gathered loads analysis, if there are only 2 loads
13603 // gathered nodes each having less than 16 elements.
13604 constexpr unsigned VFLimit = 16;
13605 bool ForceLoadGather =
13606 count_if(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
13607 return TE->isGather() && TE->hasState() &&
13608 TE->getOpcode() == Instruction::Load &&
13609 TE->getVectorFactor() < VFLimit;
13610 }) == 2;
13611
13612 // Checks if the scalars are used in other node.
13613 auto AreReusedScalars = [&](const TreeEntry *TE, ArrayRef<Value *> VL,
13614 function_ref<bool(Value *)> CheckContainer) {
13615 return TE->isSame(VL) || all_of(Range&: VL, P: [&](Value *V) {
13616 if (isa<PoisonValue>(Val: V))
13617 return true;
13618 auto *I = dyn_cast<Instruction>(Val: V);
13619 if (!I)
13620 return false;
13621 return is_contained(Range: TE->Scalars, Element: I) || CheckContainer(I);
13622 });
13623 };
13624 auto CheckForSameVectorNodes = [&](const TreeEntry &E) {
13625 if (E.hasState()) {
13626 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: E.getMainOp());
13627 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13628 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13629 ArrayRef<TreeEntry *> VTEs = getTreeEntries(V);
13630 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13631 return is_contained(Range&: TEs, Element: TE);
13632 });
13633 });
13634 }))
13635 return true;
13636 ;
13637 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: E.getMainOp());
13638 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13639 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13640 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13641 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13642 return is_contained(Range&: TEs, Element: TE);
13643 });
13644 });
13645 }))
13646 return true;
13647 } else {
13648 // Check if the gather node full copy of split node.
13649 auto *It = find_if(Range: E.Scalars, P: IsaPred<Instruction>);
13650 if (It != E.Scalars.end()) {
13651 if (ArrayRef<TreeEntry *> TEs = getSplitTreeEntries(V: *It);
13652 !TEs.empty() && any_of(Range&: TEs, P: [&](const TreeEntry *TE) {
13653 return AreReusedScalars(TE, E.Scalars, [&](Value *V) {
13654 ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V);
13655 return !VTEs.empty() && any_of(Range&: VTEs, P: [&](const TreeEntry *TE) {
13656 return is_contained(Range&: TEs, Element: TE);
13657 });
13658 });
13659 }))
13660 return true;
13661 }
13662 }
13663 return false;
13664 };
13665 // The tree may grow here, so iterate over nodes, built before.
13666 for (unsigned Idx : seq<unsigned>(Size: BaseGraphSize)) {
13667 TreeEntry &E = *VectorizableTree[Idx];
13668 if (E.isGather()) {
13669 ArrayRef<Value *> VL = E.Scalars;
13670 const unsigned Sz = getVectorElementSize(V: VL.front());
13671 unsigned MinVF = getMinVF(Sz: 2 * Sz);
13672 // Do not try partial vectorization for small nodes (<= 2), nodes with the
13673 // same opcode and same parent block or all constants.
13674 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(key: Idx) ||
13675 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
13676 // We use allSameOpcode instead of isAltShuffle because we don't
13677 // want to use interchangeable instruction here.
13678 !allSameOpcode(VL) || !allSameBlock(VL)) ||
13679 allConstant(VL) || isSplat(VL))
13680 continue;
13681 if (ForceLoadGather && E.hasState() && E.getOpcode() == Instruction::Load)
13682 continue;
13683 // Check if the node is a copy of other vector nodes.
13684 if (CheckForSameVectorNodes(E))
13685 continue;
13686 // Try to find vectorizable sequences and transform them into a series of
13687 // insertvector instructions.
13688 unsigned StartIdx = 0;
13689 unsigned End = VL.size();
13690 SmallBitVector Processed(End);
13691 for (unsigned VF = getFloorFullVectorNumberOfElements(
13692 TTI: *TTI, Ty: VL.front()->getType(), Sz: VL.size() - 1);
13693 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
13694 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
13695 if (StartIdx + VF > End)
13696 continue;
13697 SmallVector<std::pair<unsigned, unsigned>> Slices;
13698 bool AllStrided = true;
13699 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
13700 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: VF);
13701 // If any instruction is vectorized already - do not try again.
13702 // Reuse the existing node, if it fully matches the slice.
13703 if ((Processed.test(Idx: Cnt) || isVectorized(V: Slice.front())) &&
13704 !getSameValuesTreeEntry(V: Slice.front(), VL: Slice, /*SameVF=*/true))
13705 continue;
13706 // Constant already handled effectively - skip.
13707 if (allConstant(VL: Slice))
13708 continue;
13709 // Do not try to vectorize small splats (less than vector register and
13710 // only with the single non-undef element).
13711 bool IsSplat = isSplat(VL: Slice);
13712 bool IsTwoRegisterSplat = true;
13713 if (IsSplat && VF == 2) {
13714 unsigned NumRegs2VF = ::getNumberOfParts(
13715 TTI: *TTI, VecTy: getWidenedType(ScalarTy: Slice.front()->getType(), VF: 2 * VF));
13716 IsTwoRegisterSplat = NumRegs2VF == 2;
13717 }
13718 if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat ||
13719 count(Range&: Slice, Element: Slice.front()) ==
13720 static_cast<long>(isa<UndefValue>(Val: Slice.front()) ? VF - 1
13721 : 1)) {
13722 if (IsSplat)
13723 continue;
13724 InstructionsState S = getSameOpcode(VL: Slice, TLI: *TLI);
13725 if (!S || !allSameOpcode(VL: Slice) || !allSameBlock(VL: Slice) ||
13726 (S.getOpcode() == Instruction::Load &&
13727 areKnownNonVectorizableLoads(VL: Slice)) ||
13728 (S.getOpcode() != Instruction::Load &&
13729 !hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: Slice.front()->getType(), Sz: VF)))
13730 continue;
13731 if (VF == 2) {
13732 // Try to vectorize reduced values or if all users are vectorized.
13733 // For expensive instructions extra extracts might be profitable.
13734 if ((!UserIgnoreList || E.Idx != 0) &&
13735 TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
13736 TTI::TCC_Expensive &&
13737 !all_of(Range&: Slice, P: [&](Value *V) {
13738 if (isa<PoisonValue>(Val: V))
13739 return true;
13740 return areAllUsersVectorized(I: cast<Instruction>(Val: V),
13741 VectorizedVals: UserIgnoreList);
13742 }))
13743 continue;
13744 if (S.getOpcode() == Instruction::Load) {
13745 OrdersType Order;
13746 SmallVector<Value *> PointerOps;
13747 StridedPtrInfo SPtrInfo;
13748 LoadsState Res = canVectorizeLoads(VL: Slice, VL0: Slice.front(), Order,
13749 PointerOps, SPtrInfo);
13750 AllStrided &= Res == LoadsState::StridedVectorize ||
13751 Res == LoadsState::ScatterVectorize ||
13752 Res == LoadsState::Gather;
13753 // Do not vectorize gathers.
13754 if (Res == LoadsState::ScatterVectorize ||
13755 Res == LoadsState::Gather) {
13756 if (Res == LoadsState::Gather) {
13757 registerNonVectorizableLoads(VL: Slice);
13758 // If reductions and the scalars from the root node are
13759 // analyzed - mark as non-vectorizable reduction.
13760 if (UserIgnoreList && E.Idx == 0)
13761 analyzedReductionVals(VL: Slice);
13762 }
13763 continue;
13764 }
13765 } else if (S.getOpcode() == Instruction::ExtractElement ||
13766 (TTI->getInstructionCost(U: S.getMainOp(), CostKind) <
13767 TTI::TCC_Expensive &&
13768 !CheckOperandsProfitability(
13769 S.getMainOp(),
13770 cast<Instruction>(Val: *find_if(Range: reverse(C&: Slice),
13771 P: IsaPred<Instruction>)),
13772 S))) {
              // Do not vectorize extractelements (handled effectively
              // already). Do not vectorize non-profitable instructions
              // (with low cost and non-vectorizable operands).
13776 continue;
13777 }
13778 }
13779 }
13780 Slices.emplace_back(Args&: Cnt, Args: Slice.size());
13781 }
13782 // Do not try to vectorize if all slides are strided or gathered with
13783 // vector factor 2 and there are more than 2 slices. Better to handle
13784 // them in gathered loads analysis, may result in better vectorization.
13785 if (VF == 2 && AllStrided && Slices.size() > 2)
13786 continue;
13787 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
13788 E.CombinedEntriesWithIndices.emplace_back(Args&: Idx, Args&: Cnt);
13789 Processed.set(I: Cnt, E: Cnt + Sz);
13790 if (StartIdx == Cnt)
13791 StartIdx = Cnt + Sz;
13792 if (End == Cnt + Sz)
13793 End = Cnt;
13794 };
13795 for (auto [Cnt, Sz] : Slices) {
13796 ArrayRef<Value *> Slice = VL.slice(N: Cnt, M: Sz);
13797 const TreeEntry *SameTE = nullptr;
13798 if (const auto *It = find_if(Range&: Slice, P: IsaPred<Instruction>);
13799 It != Slice.end()) {
13800 // If any instruction is vectorized already - do not try again.
13801 SameTE = getSameValuesTreeEntry(V: *It, VL: Slice);
13802 }
13803 unsigned PrevSize = VectorizableTree.size();
13804 [[maybe_unused]] unsigned PrevEntriesSize =
13805 LoadEntriesToVectorize.size();
13806 buildTreeRec(VLRef: Slice, Depth: 0, UserTreeIdx: EdgeInfo(&E, UINT_MAX));
13807 if (PrevSize + 1 == VectorizableTree.size() && !SameTE &&
13808 VectorizableTree[PrevSize]->isGather() &&
13809 VectorizableTree[PrevSize]->hasState() &&
13810 VectorizableTree[PrevSize]->getOpcode() !=
13811 Instruction::ExtractElement &&
13812 !isSplat(VL: Slice)) {
13813 if (UserIgnoreList && E.Idx == 0 && VF == 2)
13814 analyzedReductionVals(VL: Slice);
13815 VectorizableTree.pop_back();
13816 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
13817 "LoadEntriesToVectorize expected to remain the same");
13818 continue;
13819 }
13820 AddCombinedNode(PrevSize, Cnt, Sz);
13821 }
13822 }
13823 // Restore ordering, if no extra vectorization happened.
13824 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
13825 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13826 reorderScalars(Scalars&: E.Scalars, Mask);
13827 E.ReorderIndices.clear();
13828 }
13829 }
13830 if (!E.hasState())
13831 continue;
13832 switch (E.getOpcode()) {
13833 case Instruction::Load: {
13834 // No need to reorder masked gather loads, just reorder the scalar
13835 // operands.
13836 if (E.State != TreeEntry::Vectorize)
13837 break;
13838 Type *ScalarTy = E.getMainOp()->getType();
13839 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
13840 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E.Scalars);
13841 // Check if profitable to represent consecutive load + reverse as strided
13842 // load with stride -1.
13843 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
13844 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
13845 SmallVector<int> Mask;
13846 inversePermutation(Indices: E.ReorderIndices, Mask);
13847 auto *BaseLI = cast<LoadInst>(Val: E.Scalars.back());
13848 InstructionCost OriginalVecCost =
13849 TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: VecTy, Alignment: BaseLI->getAlign(),
13850 AddressSpace: BaseLI->getPointerAddressSpace(), CostKind,
13851 OpdInfo: TTI::OperandValueInfo()) +
13852 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
13853 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13854 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
13855 VecTy, BaseLI->getPointerOperand(),
13856 /*VariableMask=*/false, CommonAlignment,
13857 BaseLI),
13858 CostKind);
13859 if (StridedCost < OriginalVecCost || ForceStridedLoads) {
13860 // Strided load is more profitable than consecutive load + reverse -
13861 // transform the node to strided load.
13862 Type *StrideTy = DL->getIndexType(PtrTy: cast<LoadInst>(Val: E.Scalars.front())
13863 ->getPointerOperand()
13864 ->getType());
13865 StridedPtrInfo SPtrInfo;
13866 SPtrInfo.StrideVal = ConstantInt::get(Ty: StrideTy, V: 1);
13867 SPtrInfo.Ty = VecTy;
13868 TreeEntryToStridedPtrInfoMap[&E] = SPtrInfo;
13869 E.State = TreeEntry::StridedVectorize;
13870 }
13871 }
13872 break;
13873 }
13874 case Instruction::Store: {
13875 Type *ScalarTy =
13876 cast<StoreInst>(Val: E.getMainOp())->getValueOperand()->getType();
13877 auto *VecTy = getWidenedType(ScalarTy, VF: E.Scalars.size());
13878 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E.Scalars);
      // Check if profitable to represent reverse + consecutive store as
      // strided store with stride -1.
13881 if (!E.ReorderIndices.empty() && isReverseOrder(Order: E.ReorderIndices) &&
13882 TTI->isLegalStridedLoadStore(DataType: VecTy, Alignment: CommonAlignment)) {
13883 SmallVector<int> Mask;
13884 inversePermutation(Indices: E.ReorderIndices, Mask);
13885 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.back());
13886 InstructionCost OriginalVecCost =
13887 TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
13888 AddressSpace: BaseSI->getPointerAddressSpace(), CostKind,
13889 OpdInfo: TTI::OperandValueInfo()) +
13890 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_Reverse, Tp: VecTy, Mask, CostKind);
13891 InstructionCost StridedCost = TTI->getMemIntrinsicInstrCost(
13892 MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
13893 VecTy, BaseSI->getPointerOperand(),
13894 /*VariableMask=*/false, CommonAlignment,
13895 BaseSI),
13896 CostKind);
13897 if (StridedCost < OriginalVecCost)
13898 // Strided store is more profitable than reverse + consecutive store -
13899 // transform the node to strided store.
13900 E.State = TreeEntry::StridedVectorize;
13901 } else if (!E.ReorderIndices.empty()) {
13902 // Check for interleaved stores.
13903 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
13904 auto *BaseSI = cast<StoreInst>(Val: E.Scalars.front());
13905 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
13906 if (Mask.size() < 4)
13907 return 0u;
13908 for (unsigned Factor : seq<unsigned>(Begin: 2, End: Mask.size() / 2 + 1)) {
13909 if (ShuffleVectorInst::isInterleaveMask(
13910 Mask, Factor, NumInputElts: VecTy->getElementCount().getFixedValue()) &&
13911 TTI.isLegalInterleavedAccessType(
13912 VTy: VecTy, Factor, Alignment: BaseSI->getAlign(),
13913 AddrSpace: BaseSI->getPointerAddressSpace()))
13914 return Factor;
13915 }
13916
13917 return 0u;
13918 };
13919 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
13920 unsigned InterleaveFactor = IsInterleaveMask(Mask);
13921 if (InterleaveFactor != 0)
13922 E.setInterleave(InterleaveFactor);
13923 }
13924 break;
13925 }
13926 case Instruction::Select: {
13927 if (E.State != TreeEntry::Vectorize)
13928 break;
13929 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: E.Scalars);
13930 if (MinMaxID != Intrinsic::not_intrinsic) {
13931 // This node is a minmax node.
13932 E.CombinedOp = TreeEntry::MinMax;
13933 TreeEntry *CondEntry = getOperandEntry(E: &E, Idx: 0);
13934 if (SelectOnly && CondEntry->UserTreeIndex &&
13935 CondEntry->State == TreeEntry::Vectorize) {
13936 // The condition node is part of the combined minmax node.
13937 CondEntry->State = TreeEntry::CombinedVectorize;
13938 }
13939 break;
13940 }
13941 // Check for zext + selects, which can be reordered.
13942 SmallVector<unsigned> InversedCmpsIndices;
13943 if (matchesInversedZExtSelect(SelectTE: E, InversedCmpsIndices)) {
13944 auto *CmpTE = getOperandEntry(E: &E, Idx: 0);
13945 auto *Op1TE = getOperandEntry(E: &E, Idx: 1);
13946 auto *Op2TE = getOperandEntry(E: &E, Idx: 2);
13947 // State now is uniform, not alternate opcode.
13948 CmpTE->setOperations(
13949 InstructionsState(CmpTE->getMainOp(), CmpTE->getMainOp()));
13950 // Update mapping between the swapped values and their internal matching
13951 // nodes.
13952 auto UpdateGatherEntry = [&](TreeEntry *OldTE, TreeEntry *NewTE,
13953 Value *V) {
13954 if (isConstant(V))
13955 return;
13956 auto It = ValueToGatherNodes.find(Val: V);
13957 assert(It != ValueToGatherNodes.end() &&
13958 "Expected to find the value in the map.");
13959 auto &C = It->getSecond();
13960 if (!is_contained(Range&: OldTE->Scalars, Element: V))
13961 C.remove(X: OldTE);
13962 C.insert(X: NewTE);
13963 };
13964 ValueList &Op1 = E.getOperand(OpIdx: 1);
13965 ValueList &Op2 = E.getOperand(OpIdx: 2);
13966 for (const unsigned Idx : InversedCmpsIndices) {
13967 Value *V1 = Op1TE->Scalars[Idx];
13968 Value *V2 = Op2TE->Scalars[Idx];
13969 std::swap(a&: Op1TE->Scalars[Idx], b&: Op2TE->Scalars[Idx]);
13970 std::swap(a&: Op1[Idx], b&: Op2[Idx]);
13971 UpdateGatherEntry(Op1TE, Op2TE, V1);
13972 UpdateGatherEntry(Op2TE, Op1TE, V2);
13973 }
13974 OperandsToTreeEntry.emplace_or_assign(Key: std::make_pair(x: &E, y: 1), Args&: Op1TE);
13975 OperandsToTreeEntry.emplace_or_assign(Key: std::make_pair(x: &E, y: 2), Args&: Op2TE);
13976 // NB: Fallback to check if select can be converted to cmp bitcast.
13977 }
13978 if (matchesSelectOfBits(SelectTE: E)) {
13979 // This node is a (reduced or) cmp bitcast node.
13980 const TreeEntry::CombinedOpcode Code = TreeEntry::ReducedCmpBitcast;
13981 E.CombinedOp = Code;
13982 auto *Op1TE = getOperandEntry(E: &E, Idx: 1);
13983 auto *Op2TE = getOperandEntry(E: &E, Idx: 2);
13984 Op1TE->State = TreeEntry::CombinedVectorize;
13985 Op1TE->CombinedOp = Code;
13986 Op2TE->State = TreeEntry::CombinedVectorize;
13987 Op2TE->CombinedOp = Code;
13988 break;
13989 }
13990 break;
13991 }
13992 case Instruction::FSub:
13993 case Instruction::FAdd: {
13994 // Check if possible to convert (a*b)+c to fma.
13995 if (E.State != TreeEntry::Vectorize ||
13996 !E.getOperations().isAddSubLikeOp())
13997 break;
13998 if (!canConvertToFMA(VL: E.Scalars, S: E.getOperations(), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
13999 .isValid())
14000 break;
14001 // This node is a fmuladd node.
14002 E.CombinedOp = TreeEntry::FMulAdd;
14003 TreeEntry *FMulEntry = getOperandEntry(E: &E, Idx: 0);
14004 if (FMulEntry->UserTreeIndex &&
14005 FMulEntry->State == TreeEntry::Vectorize) {
14006 // The FMul node is part of the combined fmuladd node.
14007 FMulEntry->State = TreeEntry::CombinedVectorize;
14008 }
14009 break;
14010 }
14011 case Instruction::Shl: {
14012 if (E.Idx != 0 || DL->isBigEndian())
14013 break;
14014 if (!UserIgnoreList)
14015 break;
14016 // Check that all reduction operands are disjoint or instructions.
14017 if (any_of(Range: *UserIgnoreList, P: [](Value *V) {
14018 return !match(V, P: m_DisjointOr(L: m_Value(), R: m_Value()));
14019 }))
14020 break;
14021 OrdersType Order;
14022 bool IsBSwap;
14023 bool ForLoads;
14024 if (!matchesShlZExt(TE: E, Order, IsBSwap, ForLoads))
14025 break;
14026 // This node is a (reduced disjoint or) bitcast node.
14027 TreeEntry::CombinedOpcode Code =
14028 IsBSwap ? (ForLoads ? TreeEntry::ReducedBitcastBSwapLoads
14029 : TreeEntry::ReducedBitcastBSwap)
14030 : (ForLoads ? TreeEntry::ReducedBitcastLoads
14031 : TreeEntry::ReducedBitcast);
14032 E.CombinedOp = Code;
14033 E.ReorderIndices = std::move(Order);
14034 TreeEntry *ZExtEntry = getOperandEntry(E: &E, Idx: 0);
14035 assert(ZExtEntry->UserTreeIndex &&
14036 ZExtEntry->State == TreeEntry::Vectorize &&
14037 ZExtEntry->getOpcode() == Instruction::ZExt &&
14038 "Expected ZExt node.");
14039 // The ZExt node is part of the combined node.
14040 ZExtEntry->State = TreeEntry::CombinedVectorize;
14041 ZExtEntry->CombinedOp = Code;
14042 if (ForLoads) {
14043 TreeEntry *LoadsEntry = getOperandEntry(E: ZExtEntry, Idx: 0);
14044 assert(LoadsEntry->UserTreeIndex &&
14045 LoadsEntry->State == TreeEntry::Vectorize &&
14046 LoadsEntry->getOpcode() == Instruction::Load &&
14047 "Expected Load node.");
14048 // The Load node is part of the combined node.
14049 LoadsEntry->State = TreeEntry::CombinedVectorize;
14050 LoadsEntry->CombinedOp = Code;
14051 }
14052 TreeEntry *ConstEntry = getOperandEntry(E: &E, Idx: 1);
14053 assert(ConstEntry->UserTreeIndex && ConstEntry->isGather() &&
14054 "Expected ZExt node.");
14055 // The ConstNode node is part of the combined node.
14056 ConstEntry->State = TreeEntry::CombinedVectorize;
14057 ConstEntry->CombinedOp = Code;
14058 break;
14059 }
14060 default:
14061 break;
14062 }
14063 }
14064
14065 if (LoadEntriesToVectorize.empty()) {
14066 // Single load node - exit.
14067 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
14068 VectorizableTree.front()->getOpcode() == Instruction::Load)
14069 return;
14070 // Small graph with small VF - exit.
14071 constexpr unsigned SmallTree = 3;
14072 constexpr unsigned SmallVF = 2;
14073 if ((VectorizableTree.size() <= SmallTree &&
14074 VectorizableTree.front()->Scalars.size() == SmallVF) ||
14075 (VectorizableTree.size() <= 2 && UserIgnoreList))
14076 return;
14077
14078 if (VectorizableTree.front()->isNonPowOf2Vec() &&
14079 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
14080 getCanonicalGraphSize() <= SmallTree &&
14081 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
14082 P: [](const std::unique_ptr<TreeEntry> &TE) {
14083 return TE->isGather() && TE->hasState() &&
14084 TE->getOpcode() == Instruction::Load &&
14085 !allSameBlock(VL: TE->Scalars);
14086 }) == 1)
14087 return;
14088 }
14089
14090 // A list of loads to be gathered during the vectorization process. We can
14091 // try to vectorize them at the end, if profitable.
14092 SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
14093 SmallVector<SmallVector<std::pair<LoadInst *, int64_t>>>, 8>
14094 GatheredLoads;
14095
14096 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
14097 TreeEntry &E = *TE;
14098 if (E.isGather() &&
14099 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
14100 (!E.hasState() && any_of(Range&: E.Scalars,
14101 P: [&](Value *V) {
14102 return isa<LoadInst>(Val: V) &&
14103 !isVectorized(V) &&
14104 !isDeleted(I: cast<Instruction>(Val: V));
14105 }))) &&
14106 !isSplat(VL: E.Scalars)) {
14107 for (Value *V : E.Scalars) {
14108 auto *LI = dyn_cast<LoadInst>(Val: V);
14109 if (!LI)
14110 continue;
14111 if (isDeleted(I: LI) || isVectorized(V: LI) || !LI->isSimple())
14112 continue;
14113 gatherPossiblyVectorizableLoads(
14114 R: *this, VL: V, DL: *DL, SE&: *SE, TTI: *TTI,
14115 GatheredLoads&: GatheredLoads[std::make_tuple(
14116 args: LI->getParent(),
14117 args: getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth),
14118 args: LI->getType())]);
14119 }
14120 }
14121 }
14122 // Try to vectorize gathered loads if this is not just a gather of loads.
14123 if (!GatheredLoads.empty())
14124 tryToVectorizeGatheredLoads(GatheredLoads);
14125}
14126
/// Estimates the cost of merging shuffle masks and emitting the final shuffle
/// instruction, if required. It supports shuffling of 2 input vectors. It
/// implements lazy shuffle-cost accounting: the cost of a (virtual) shuffle is
/// added only when the shuffle is actually required. Otherwise, the cost
/// estimation is delayed till the end of the process, to reduce the number of
/// accounted shuffles and enable further analysis/transformations.
14133class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
14134 bool IsFinalized = false;
14135 SmallVector<int> CommonMask;
14136 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
14137 const TargetTransformInfo &TTI;
14138 InstructionCost Cost = 0;
14139 SmallDenseSet<Value *> VectorizedVals;
14140 BoUpSLP &R;
14141 SmallPtrSetImpl<Value *> &CheckedExtracts;
14142 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
14143 /// While set, still trying to estimate the cost for the same nodes and we
14144 /// can delay actual cost estimation (virtual shuffle instruction emission).
14145 /// May help better estimate the cost if same nodes must be permuted + allows
14146 /// to move most of the long shuffles cost estimation to TTI.
14147 bool SameNodesEstimated = true;
14148
14149 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
14150 if (Ty->getScalarType()->isPointerTy()) {
14151 Constant *Res = ConstantExpr::getIntToPtr(
14152 C: ConstantInt::getAllOnesValue(
14153 Ty: IntegerType::get(C&: Ty->getContext(),
14154 NumBits: DL.getTypeStoreSizeInBits(Ty: Ty->getScalarType()))),
14155 Ty: Ty->getScalarType());
14156 if (auto *VTy = dyn_cast<VectorType>(Val: Ty))
14157 Res = ConstantVector::getSplat(EC: VTy->getElementCount(), Elt: Res);
14158 return Res;
14159 }
14160 return Constant::getAllOnesValue(Ty);
14161 }
14162
14163 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
14164 if ((!Root && allConstant(VL)) || all_of(Range&: VL, P: IsaPred<UndefValue>))
14165 return TTI::TCC_Free;
14166 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
14167 InstructionCost GatherCost = 0;
14168 SmallVector<Value *> Gathers(VL);
14169 if (!Root && isSplat(VL)) {
14170 // Found the broadcasting of the single scalar, calculate the cost as
14171 // the broadcast.
14172 const auto *It = find_if_not(Range&: VL, P: IsaPred<UndefValue>);
14173 assert(It != VL.end() && "Expected at least one non-undef value.");
14174 // Add broadcast for non-identity shuffle only.
14175 bool NeedShuffle =
14176 count(Range&: VL, Element: *It) > 1 &&
14177 (VL.front() != *It || !all_of(Range: VL.drop_front(), P: IsaPred<UndefValue>));
14178 if (!NeedShuffle) {
14179 if (isa<FixedVectorType>(Val: ScalarTy)) {
14180 assert(SLPReVec && "FixedVectorType is not expected.");
14181 return TTI.getShuffleCost(
14182 Kind: TTI::SK_InsertSubvector, DstTy: VecTy, SrcTy: VecTy, Mask: {}, CostKind,
14183 Index: std::distance(first: VL.begin(), last: It) * getNumElements(Ty: ScalarTy),
14184 SubTp: cast<FixedVectorType>(Val: ScalarTy));
14185 }
14186 return TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy,
14187 CostKind, Index: std::distance(first: VL.begin(), last: It),
14188 Op0: PoisonValue::get(T: VecTy), Op1: *It);
14189 }
14190
14191 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
14192 transform(Range&: VL, d_first: ShuffleMask.begin(), F: [](Value *V) {
14193 return isa<PoisonValue>(Val: V) ? PoisonMaskElem : 0;
14194 });
14195 InstructionCost InsertCost =
14196 TTI.getVectorInstrCost(Opcode: Instruction::InsertElement, Val: VecTy, CostKind, Index: 0,
14197 Op0: PoisonValue::get(T: VecTy), Op1: *It);
14198 return InsertCost + ::getShuffleCost(TTI,
14199 Kind: TargetTransformInfo::SK_Broadcast,
14200 Tp: VecTy, Mask: ShuffleMask, CostKind,
14201 /*Index=*/0, /*SubTp=*/nullptr,
14202 /*Args=*/*It);
14203 }
14204 return GatherCost +
14205 (all_of(Range&: Gathers, P: IsaPred<UndefValue>)
14206 ? TTI::TCC_Free
14207 : R.getGatherCost(VL: Gathers, ForPoisonSrc: !Root && VL.equals(RHS: Gathers),
14208 ScalarTy));
14209 };
14210
  /// Compute the cost of creating a vector containing the extracted values from
  /// \p VL.
  /// \p Mask indexes into the (possibly multiple) source vectors of the
  /// extractelements in \p VL; \p ShuffleKinds holds one shuffle kind per
  /// register-sized part (parts with std::nullopt are skipped, i.e. free);
  /// \p NumParts is the number of vector registers the mask is split into.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    // NumElts is the widest source-vector width among the extractelement
    // operands in VL; values that are not extracts of fixed vectors do not
    // contribute.
    unsigned NumElts =
        std::accumulate(first: VL.begin(), last: VL.end(), init: 0, binary_op: [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(Val: V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(Val: EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(a: Sz, b: VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(Size: VL.size(), NumParts);
    // Checks whether one register-sized sub-mask reads from at most two
    // source registers. On success, rewrites Mask in place to register-local
    // indices, records the base offset(s) in Indices (plus the subvector
    // sizes in SubVecSizes) and returns the per-register shuffle kind.
    // Returns std::nullopt when the whole source fits in a single register or
    // more than two registers are touched.
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices,
                                        SmallVectorImpl<unsigned> &SubVecSizes)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      // Base offset of the first source register: the minimal used mask
      // index, rounded down to a register boundary.
      int OffsetReg0 =
          alignDown(Value: std::accumulate(first: Mask.begin(), last: Mask.end(), INT_MAX,
                                    binary_op: [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(a: S, b: I);
                                    }),
                    Align: EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that if trying to permute same single/2 input vectors.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(NumElts: 1, Elt: OffsetReg0);
      for (auto [Pos, I] : enumerate(First&: Mask)) {
        if (I == PoisonMaskElem)
          continue;
        // Identify which physical register this element comes from.
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(V: RegId);
        // More than two distinct source registers - give up on a
        // per-register shuffle for this part.
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            // First time a second register shows up: compute its base offset
            // from the minimal mask index not belonging to the first register.
            OffsetReg1 = alignDown(
                Value: std::accumulate(
                    first: std::next(x: Mask.begin(), n: Pos), last: Mask.end(), INT_MAX,
                    binary_op: [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(a: S, b: I);
                    }),
                Align: EltsPerVector);
            unsigned Index = OffsetReg1 % NumElts;
            Indices.push_back(Elt: Index);
            SubVecSizes.push_back(Elt: std::min(a: NumElts - Index, b: EltsPerVector));
          }
          Idx = I - OffsetReg1;
        }
        // Rewrite the element to a register-local index; lanes taken from the
        // second register are offset by EltsPerVector.
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source vector
    // operand can be re-used directly. If not, add the cost of creating a
    // shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          N: Part * EltsPerVector, M: getNumElems(Size: Mask.size(), PartNumElems: EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(Range&: MaskSlice, Out: SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      SmallVector<unsigned, 2> SubVecSizes;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices, SubVecSizes);
      if (!RegShuffleKind) {
        // Could not map this part onto one or two registers: charge a
        // full-width shuffle unless the slice is already an identity.
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                Mask: MaskSlice, NumSrcElts: std::max<unsigned>(a: NumElts, b: MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, Kind: *ShuffleKinds[Part],
                               Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: MaskSlice);
        continue;
      }
      // Per-register shuffle (skipped when it is a register-local identity).
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(Mask: SubMask, NumSrcElts: EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, Kind: *RegShuffleKind,
                             Tp: getWidenedType(ScalarTy, VF: EltsPerVector), Mask: SubMask);
      }
      // Add the cost of extracting each used subvector out of the full-width
      // source.
      const unsigned BaseVF = getFullVectorNumberOfElements(
          TTI: *R.TTI, Ty: VL.front()->getType(), Sz: alignTo(Value: NumElts, Align: EltsPerVector));
      for (const auto [Idx, SubVecSize] : zip(t&: Indices, u&: SubVecSizes)) {
        assert((Idx + SubVecSize) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, Kind: TTI::SK_ExtractSubvector,
                                 Tp: getWidenedType(ScalarTy, VF: BaseVF), Mask: {}, CostKind,
                                 Index: Idx, SubTp: getWidenedType(ScalarTy, VF: SubVecSize));
      }
      // Second attempt to check, if just a permute is better estimated than
      // subvector extract.
      SubMask.assign(NumElts, Elt: PoisonMaskElem);
      copy(Range&: MaskSlice, Out: SubMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, Kind: *ShuffleKinds[Part], Tp: getWidenedType(ScalarTy, VF: NumElts), Mask: SubMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
  /// mask \p Mask, register number \p Part, that includes \p SliceSize
  /// elements.
  /// The cost is accumulated lazily: while the same node pair keeps being
  /// reshuffled, only CommonMask is extended; the (virtual) shuffle cost is
  /// charged once a different pair of nodes shows up.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before, no
      // need to estimate another cost with the sub-Mask, instead include this
      // sub-Mask into the CommonMask to estimate it later and avoid double cost
      // estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(Val&: InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(Val&: InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(Val&: InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        // Same nodes as before: merge this part's sub-mask into CommonMask
        // and defer the actual cost estimation.
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(N: Part * SliceSize, M: Limit);
        copy(Range&: SubMask, Out: std::next(x: CommonMask.begin(), n: SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // and transform mask.
      Cost += createShuffle(P1: InVectors.front(),
                            P2: InVectors.size() == 1 ? nullptr : InVectors.back(),
                            Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else if (InVectors.size() == 2) {
      // Two inputs already accumulated - flush their shuffle cost before
      // switching to the new node pair.
      Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      // Shuffle the single accumulated input with E1: E1's lanes are appended
      // after the larger of the two vector factors.
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = dyn_cast<Value *>(Val&: InVectors.front())) {
        VF = std::max(a: VF, b: getVF(V: V1));
      } else {
        const auto *E = cast<const TreeEntry *>(Val&: InVectors.front());
        VF = std::max(a: VF, b: E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(P1: InVectors.front(), P2: &E1, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else {
      // Shuffle E1 with E2 first, then combine the result with the previously
      // accumulated input (saved in P before the mask is rewritten).
      auto P = InVectors.front();
      Cost += createShuffle(P1: &E1, P2: E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = dyn_cast<Value *>(Val&: P)) {
        VF = std::max(a: VF,
                      b: getNumElements(Ty: V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(Val&: P);
        VF = std::max(a: VF, b: E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P1: P, P2: InVectors.front(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
  }
14405
14406 class ShuffleCostBuilder {
14407 const TargetTransformInfo &TTI;
14408
14409 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
14410 int Index = -1;
14411 return Mask.empty() ||
14412 (VF == Mask.size() &&
14413 ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) ||
14414 (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: VF, Index) &&
14415 Index == 0);
14416 }
14417
14418 public:
14419 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
14420 ~ShuffleCostBuilder() = default;
14421 InstructionCost createShuffleVector(Value *V1, Value *,
14422 ArrayRef<int> Mask) const {
14423 // Empty mask or identity mask are free.
14424 unsigned VF =
14425 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14426 if (isEmptyOrIdentity(Mask, VF))
14427 return TTI::TCC_Free;
14428 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
14429 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14430 }
14431 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
14432 // Empty mask or identity mask are free.
14433 unsigned VF =
14434 cast<VectorType>(Val: V1->getType())->getElementCount().getKnownMinValue();
14435 if (isEmptyOrIdentity(Mask, VF))
14436 return TTI::TCC_Free;
14437 return ::getShuffleCost(TTI, Kind: TTI::SK_PermuteSingleSrc,
14438 Tp: cast<VectorType>(Val: V1->getType()), Mask);
14439 }
14440 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
14441 InstructionCost createPoison(Type *Ty, unsigned VF) const {
14442 return TTI::TCC_Free;
14443 }
14444 void resizeToMatch(Value *&, Value *&) const {}
14445 };
14446
14447 /// Smart shuffle instruction emission, walks through shuffles trees and
14448 /// tries to find the best matching vector for the actual shuffle
14449 /// instruction.
14450 InstructionCost
14451 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
14452 const PointerUnion<Value *, const TreeEntry *> &P2,
14453 ArrayRef<int> Mask) {
14454 ShuffleCostBuilder Builder(TTI);
14455 SmallVector<int> CommonMask(Mask);
14456 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
14457 unsigned CommonVF = Mask.size();
14458 InstructionCost ExtraCost = 0;
14459 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
14460 unsigned VF) -> InstructionCost {
14461 if (E.isGather() && allConstant(VL: E.Scalars))
14462 return TTI::TCC_Free;
14463 Type *EScalarTy = E.Scalars.front()->getType();
14464 bool IsSigned = true;
14465 if (auto It = R.MinBWs.find(Val: &E); It != R.MinBWs.end()) {
14466 EScalarTy = IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
14467 IsSigned = It->second.second;
14468 }
14469 if (EScalarTy != ScalarTy) {
14470 unsigned CastOpcode = Instruction::Trunc;
14471 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
14472 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
14473 if (DstSz > SrcSz)
14474 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14475 return TTI.getCastInstrCost(Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF),
14476 Src: getWidenedType(ScalarTy: EScalarTy, VF),
14477 CCH: TTI::CastContextHint::None, CostKind);
14478 }
14479 return TTI::TCC_Free;
14480 };
14481 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
14482 if (isa<Constant>(Val: V))
14483 return TTI::TCC_Free;
14484 auto *VecTy = cast<VectorType>(Val: V->getType());
14485 Type *EScalarTy = VecTy->getElementType();
14486 if (EScalarTy != ScalarTy) {
14487 bool IsSigned = !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL));
14488 unsigned CastOpcode = Instruction::Trunc;
14489 unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
14490 unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
14491 if (DstSz > SrcSz)
14492 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
14493 return TTI.getCastInstrCost(
14494 Opcode: CastOpcode, Dst: VectorType::get(ElementType: ScalarTy, EC: VecTy->getElementCount()),
14495 Src: VecTy, CCH: TTI::CastContextHint::None, CostKind);
14496 }
14497 return TTI::TCC_Free;
14498 };
14499 if (!V1 && !V2 && !P2.isNull()) {
14500 // Shuffle 2 entry nodes.
14501 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
14502 unsigned VF = E->getVectorFactor();
14503 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
14504 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
14505 assert(all_of(Mask,
14506 [=](int Idx) {
14507 return Idx < 2 * static_cast<int>(CommonVF);
14508 }) &&
14509 "All elements in mask must be less than 2 * CommonVF.");
14510 if (E->Scalars.size() == E2->Scalars.size()) {
14511 SmallVector<int> EMask = E->getCommonMask();
14512 SmallVector<int> E2Mask = E2->getCommonMask();
14513 if (!EMask.empty() || !E2Mask.empty()) {
14514 for (int &Idx : CommonMask) {
14515 if (Idx == PoisonMaskElem)
14516 continue;
14517 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
14518 Idx = EMask[Idx];
14519 else if (Idx >= static_cast<int>(CommonVF))
14520 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
14521 E->Scalars.size();
14522 }
14523 }
14524 CommonVF = E->Scalars.size();
14525 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
14526 GetNodeMinBWAffectedCost(*E2, CommonVF);
14527 } else {
14528 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
14529 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
14530 }
14531 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14532 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14533 } else if (!V1 && P2.isNull()) {
14534 // Shuffle single entry node.
14535 const TreeEntry *E = cast<const TreeEntry *>(Val: P1);
14536 unsigned VF = E->getVectorFactor();
14537 CommonVF = VF;
14538 assert(
14539 all_of(Mask,
14540 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14541 "All elements in mask must be less than CommonVF.");
14542 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
14543 SmallVector<int> EMask = E->getCommonMask();
14544 assert(!EMask.empty() && "Expected non-empty common mask.");
14545 for (int &Idx : CommonMask) {
14546 if (Idx != PoisonMaskElem)
14547 Idx = EMask[Idx];
14548 }
14549 CommonVF = E->Scalars.size();
14550 } else if (unsigned Factor = E->getInterleaveFactor();
14551 Factor > 0 && E->Scalars.size() != Mask.size() &&
14552 ShuffleVectorInst::isDeInterleaveMaskOfFactor(Mask: CommonMask,
14553 Factor)) {
14554 // Deinterleaved nodes are free.
14555 std::iota(first: CommonMask.begin(), last: CommonMask.end(), value: 0);
14556 }
14557 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
14558 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14559 // Not identity/broadcast? Try to see if the original vector is better.
14560 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
14561 CommonVF == CommonMask.size() &&
14562 any_of(Range: enumerate(First&: CommonMask),
14563 P: [](const auto &&P) {
14564 return P.value() != PoisonMaskElem &&
14565 static_cast<unsigned>(P.value()) != P.index();
14566 }) &&
14567 any_of(Range&: CommonMask,
14568 P: [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
14569 SmallVector<int> ReorderMask;
14570 inversePermutation(Indices: E->ReorderIndices, Mask&: ReorderMask);
14571 ::addMask(Mask&: CommonMask, SubMask: ReorderMask);
14572 }
14573 } else if (V1 && P2.isNull()) {
14574 // Shuffle single vector.
14575 ExtraCost += GetValueMinBWAffectedCost(V1);
14576 CommonVF = getVF(V: V1);
14577 assert(
14578 all_of(Mask,
14579 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
14580 "All elements in mask must be less than CommonVF.");
14581 } else if (V1 && !V2) {
14582 // Shuffle vector and tree node.
14583 unsigned VF = getVF(V: V1);
14584 const TreeEntry *E2 = cast<const TreeEntry *>(Val: P2);
14585 CommonVF = std::max(a: VF, b: E2->getVectorFactor());
14586 assert(all_of(Mask,
14587 [=](int Idx) {
14588 return Idx < 2 * static_cast<int>(CommonVF);
14589 }) &&
14590 "All elements in mask must be less than 2 * CommonVF.");
14591 if (E2->Scalars.size() == VF && VF != CommonVF) {
14592 SmallVector<int> E2Mask = E2->getCommonMask();
14593 assert(!E2Mask.empty() && "Expected non-empty common mask.");
14594 for (int &Idx : CommonMask) {
14595 if (Idx == PoisonMaskElem)
14596 continue;
14597 if (Idx >= static_cast<int>(CommonVF))
14598 Idx = E2Mask[Idx - CommonVF] + VF;
14599 }
14600 CommonVF = VF;
14601 }
14602 ExtraCost += GetValueMinBWAffectedCost(V1);
14603 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14604 ExtraCost += GetNodeMinBWAffectedCost(
14605 *E2, std::min(a: CommonVF, b: E2->getVectorFactor()));
14606 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14607 } else if (!V1 && V2) {
14608 // Shuffle vector and tree node.
14609 unsigned VF = getVF(V: V2);
14610 const TreeEntry *E1 = cast<const TreeEntry *>(Val: P1);
14611 CommonVF = std::max(a: VF, b: E1->getVectorFactor());
14612 assert(all_of(Mask,
14613 [=](int Idx) {
14614 return Idx < 2 * static_cast<int>(CommonVF);
14615 }) &&
14616 "All elements in mask must be less than 2 * CommonVF.");
14617 if (E1->Scalars.size() == VF && VF != CommonVF) {
14618 SmallVector<int> E1Mask = E1->getCommonMask();
14619 assert(!E1Mask.empty() && "Expected non-empty common mask.");
14620 for (int &Idx : CommonMask) {
14621 if (Idx == PoisonMaskElem)
14622 continue;
14623 if (Idx >= static_cast<int>(CommonVF))
14624 Idx = E1Mask[Idx - CommonVF] + VF;
14625 else
14626 Idx = E1Mask[Idx];
14627 }
14628 CommonVF = VF;
14629 }
14630 ExtraCost += GetNodeMinBWAffectedCost(
14631 *E1, std::min(a: CommonVF, b: E1->getVectorFactor()));
14632 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14633 ExtraCost += GetValueMinBWAffectedCost(V2);
14634 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14635 } else {
14636 assert(V1 && V2 && "Expected both vectors.");
14637 unsigned VF = getVF(V: V1);
14638 CommonVF = std::max(a: VF, b: getVF(V: V2));
14639 assert(all_of(Mask,
14640 [=](int Idx) {
14641 return Idx < 2 * static_cast<int>(CommonVF);
14642 }) &&
14643 "All elements in mask must be less than 2 * CommonVF.");
14644 ExtraCost +=
14645 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
14646 if (V1->getType() != V2->getType()) {
14647 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14648 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14649 } else {
14650 if (cast<VectorType>(Val: V1->getType())->getElementType() != ScalarTy)
14651 V1 = Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonVF));
14652 if (cast<VectorType>(Val: V2->getType())->getElementType() != ScalarTy)
14653 V2 = getAllOnesValue(DL: *R.DL, Ty: getWidenedType(ScalarTy, VF: CommonVF));
14654 }
14655 }
14656 InVectors.front() =
14657 Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
14658 if (InVectors.size() == 2)
14659 InVectors.pop_back();
14660 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
14661 V1, V2, Mask: CommonMask, Builder, ScalarTy);
14662 }
14663
14664public:
  /// Constructs the cost estimator.
  /// \param ScalarTy scalar element type of the node being costed (forwarded
  ///        to the base shuffle analysis).
  /// \param TTI target cost model used for all cost queries.
  /// \param VectorizedVals values already known to be vectorized; copied into
  ///        a local list.
  /// \param R the owning SLP vectorizer instance.
  /// \param CheckedExtracts shared set of extractelements whose cost was
  ///        already adjusted, to avoid double-counting across nodes.
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  /// Adjusts the cost for a gather node \p E built from extractelement
  /// instructions: extracts that become dead after vectorization get their
  /// scalarization cost subtracted. Returns the vector the scalars are
  /// extracted from (a single base, or a widened null-value placeholder when
  /// multiple bases are combined and \p UseVecBaseAsInput is set), or nullptr
  /// if \p Mask is empty.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    // Work on the scalars in their (possibly reordered) vectorized order.
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(Scalars&: VL, Mask: ReorderMask);
    }
    // Check if it can be considered reused if same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        Range: ArrayRef(R.VectorizableTree).take_front(N: E->Idx),
        P: [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(Range: enumerate(First&: TE->Scalars), P: [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
    // Per-base-vector bitmap of extracted lanes, used to subtract
    // scalarization overhead once per base below.
    SmallDenseMap<Value *, APInt, 4> VectorOpsToExtracts;
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
      for (auto [I, V] :
           enumerate(First: ArrayRef(VL).slice(N: Part * SliceSize, M: Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(Val: V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // instruction as dead and remove its cost from the final cost of the
        // vectorized tree.
        // Also, avoid adjusting the cost for extractelements with multiple uses
        // in different graph entries.
        auto *EE = cast<ExtractElementInst>(Val: V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(Ptr: VecBase);
        ArrayRef<TreeEntry *> VEs = R.getTreeEntries(V);
        if (!CheckedExtracts.insert(Ptr: V).second ||
            !R.areAllUsersVectorized(I: cast<Instruction>(Val: V), VectorizedVals: &VectorizedVals) ||
            any_of(Range&: VEs,
                   P: [&](const TreeEntry *TE) {
                     return R.DeletedNodes.contains(Ptr: TE) ||
                            R.TransformedToGatherNodes.contains(Val: TE);
                   }) ||
            (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
             !R.isVectorized(V: EE) &&
             count_if(Range: E->Scalars, P: [&](Value *V) { return V == EE; }) !=
                 count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
                          P: [&](Value *V) { return V == EE; })) ||
            any_of(Range: EE->users(),
                   P: [&](User *U) {
                     return isa<GetElementPtrInst>(Val: U) &&
                            !R.areAllUsersVectorized(I: cast<Instruction>(Val: U),
                                                     VectorizedVals: &VectorizedVals);
                   }) ||
            (!VEs.empty() && !is_contained(Range&: VEs, Element: E)))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(E: EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Val: Ext) &&
              all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: EE->getVectorOperandType(),
                Index: Idx, CostKind);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: EE->getType(),
                CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
            continue;
          }
        }
        // Record the extracted lane for this base vector; the overhead is
        // subtracted in bulk after the loop.
        APInt &DemandedElts =
            VectorOpsToExtracts
                .try_emplace(Key: VecBase,
                             Args: APInt::getZero(numBits: getNumElements(Ty: VecBase->getType())))
                .first->getSecond();
        DemandedElts.setBit(Idx);
      }
    }
    for (const auto &[Vec, DemandedElts] : VectorOpsToExtracts)
      Cost -= TTI.getScalarizationOverhead(Ty: cast<VectorType>(Val: Vec->getType()),
                                           DemandedElts, /*Insert=*/false,
                                           /*Extract=*/true, CostKind);
    // Check that gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // Found the bunch of extractelement instructions that must be gathered
    // into a vector and can be represented as a permutation elements in a
    // single input vector or of 2 input vectors.
    // Done for reused if same extractelements were vectorized already.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    // Seed the builder state with this node and the (normalized) mask.
    InVectors.assign(NumElts: 1, Elt: E);
    CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
    transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      // Several base vectors combined per-part: callers must use the widened
      // vector itself as the shuffle input.
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(Ty: getWidenedType(ScalarTy, VF: CommonMask.size()));
    }
    return VecBase;
  }
14793 /// Checks if the specified entry \p E needs to be delayed because of its
14794 /// dependency nodes.
14795 std::optional<InstructionCost>
14796 needToDelay(const TreeEntry *,
14797 ArrayRef<SmallVector<const TreeEntry *>>) const {
14798 // No need to delay the cost estimation during analysis.
14799 return std::nullopt;
14800 }
14801 /// Reset the builder to handle perfect diamond match.
14802 void resetForSameNode() {
14803 IsFinalized = false;
14804 CommonMask.clear();
14805 InVectors.clear();
14806 Cost = 0;
14807 VectorizedVals.clear();
14808 SameNodesEstimated = true;
14809 }
  /// Adds two tree entries and the mask that shuffles their vectorized values.
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      // Both operands are the same node - degenerate to the single-entry
      // overload; the mask must only reference the first vector.
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      // First inputs: just record entries and mask, no cost accounted yet.
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(IL: {&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
    unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
    // Locate the register-sized part holding the first defined mask element.
    const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
    unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
    estimateNodesPermuteCost(E1, E2: &E2, Mask, Part, SliceSize);
  }
  /// Adds a single tree entry and the mask for shuffling its vectorized value.
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      // First input: just record the entry and mask, no cost accounted yet.
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(NumElts: 1, Elt: &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, VF: Mask.size());
    unsigned NumParts = ::getNumberOfParts(TTI, VecTy: MaskVecTy, Limit: Mask.size());
    unsigned SliceSize = getPartNumElems(Size: Mask.size(), NumParts);
    // Locate the register-sized part holding the first defined mask element.
    const auto *It = find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem));
    unsigned Part = std::distance(first: Mask.begin(), last: It) / SliceSize;
    estimateNodesPermuteCost(E1, E2: nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(Args: &E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    // Deliberately a no-op at runtime: the assert only validates that every
    // defined lane of CommonMask extracts from V1 or V2.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      // First input: record vector and mask, no cost accounted yet.
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      InVectors.assign(NumElts: 1, Elt: V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled them in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V: V1);
    if (InVectors.size() == 2) {
      // Two inputs already queued: fold them into one shuffle first, then
      // treat the result as a single input of CommonMask's width.
      Cost += createShuffle(P1: InVectors.front(), P2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      VF = std::max<unsigned>(a: VF, b: CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(a: VF, b: InTE->getVectorFactor());
    } else {
      VF = std::max(
          a: VF, b: cast<FixedVectorType>(Val: cast<Value *>(Val&: InVectors.front())->getType())
                  ->getNumElements());
    }
    InVectors.push_back(Elt: V1);
    // Merge Mask into CommonMask, offsetting new-vector lanes by VF.
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  /// Accounts for the cost of building a vector from \p VL and returns a
  /// constant placeholder standing in for the gathered vector (real IR is not
  /// emitted during cost estimation). Poison/undef scalars are kept as
  /// poison/undef lanes; all other lanes become null values.
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      // MaskVF, if given, caps the number of lanes in the placeholder.
      if (MaskVF != 0)
        VF = std::min(a: VF, b: MaskVF);
      Type *VLScalarTy = VL.front()->getType();
      for (Value *V : VL.take_front(N: VF)) {
        Type *ScalarTy = VLScalarTy->getScalarType();
        if (isa<PoisonValue>(Val: V)) {
          Vals.push_back(Elt: PoisonValue::get(T: ScalarTy));
          continue;
        }
        if (isa<UndefValue>(Val: V)) {
          Vals.push_back(Elt: UndefValue::get(T: ScalarTy));
          continue;
        }
        Vals.push_back(Elt: Constant::getNullValue(Ty: ScalarTy));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Val: VLScalarTy)) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, we need to expand vector types into scalar
        // types.
        Vals = replicateMask(Val: Vals, VF: VecTy->getNumElements());
      }
      return ConstantVector::get(V: Vals);
    }
    // With a root given, return an all-ones splat matching the root's width.
    return ConstantVector::getSplat(
        EC: ElementCount::getFixed(
            MinVal: cast<FixedVectorType>(Val: Root->getType())->getNumElements()),
        Elt: getAllOnesValue(DL: *R.DL, Ty: ScalarTy->getScalarType()));
  }
14949 InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  /// Flushes all pending inputs into shuffle costs, accounts for inserting the
  /// \p SubVectors (with optional \p SubVectorsMask permutation), applies the
  /// external mask \p ExtMask, and returns the total accumulated cost.
  /// \p Action, if provided, post-processes the intermediate vector (e.g. for
  /// resizing); \p VF must be non-zero in that case.
  InstructionCost finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &,
                        function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
          Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Flush pending inputs into a single shuffle before running the action.
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
      else
        Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Val: Vec);
      Action(V, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        Cost += createShuffle(P1: V1, P2: V2, Mask);
        return V1;
      });
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      // Flush pending inputs first, then cost the subvector insertions.
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(P1: Vec, P2: InVectors.back(), Mask: CommonMask);
      else
        Cost += createShuffle(P1: Vec, P2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        // Build a two-source mask: subvector lanes from the first source,
        // common-mask lanes shifted to the second source.
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(Range&: SubVectorsMask, Out: SVMask.begin());
        for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, Kind: TTI::SK_PermuteTwoSrc,
                                 Tp: getWidenedType(ScalarTy, VF: CommonMask.size()),
                                 Mask: SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(Val: E); It != R.MinBWs.end()) {
          // The subvector was computed in a narrowed type; remember its
          // effective type and signedness for the cast cost below.
          EScalarTy =
              IntegerType::get(C&: EScalarTy->getContext(), NumBits: It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          // Account for the cast needed to match this node's element type.
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(Ty: ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(Ty: EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              Opcode: CastOpcode, Dst: getWidenedType(ScalarTy, VF: E->getVectorFactor()),
              Src: getWidenedType(ScalarTy: EScalarTy, VF: E->getVectorFactor()),
              CCH: TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, Kind: TTI::SK_InsertSubvector,
            Tp: getWidenedType(ScalarTy, VF: CommonMask.size()), Mask: {}, CostKind, Index: Idx,
            SubTp: getWidenedType(ScalarTy, VF: E->getVectorFactor()));
        if (!CommonMask.empty()) {
          // Mark inserted lanes as identity in the common mask.
          std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
                    last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
                    value: Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
      } else {
        // Compose the external mask on top of the common mask.
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(RHS&: NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    // Emit the final shuffle of the remaining one or two inputs.
    return Cost +
           createShuffle(P1: InVectors.front(),
                         P2: InVectors.size() == 2 ? InVectors.back() : nullptr,
                         Mask: CommonMask);
  }
15051
  /// Verifies that finalize() was called whenever any mask state was built up.
  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
15056};
15057
// Returns the tree entry recorded for operand \p Idx of node \p E. The entry
// is required to exist (at() asserts/throws on a missing key) and must match
// the operand's scalars.
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  TreeEntry *Op = OperandsToTreeEntry.at(Val: {E, Idx});
  assert(Op->isSame(E->getOperand(Idx)) && "Operands mismatch!");
  return Op;
}
15064
// Maps a tree entry's vectorization state to the cast-context hint used for
// TTI cast cost queries: scatter/strided loads -> GatherScatter, compressed
// loads -> Masked, reversed vectorized loads -> Reversed, plain vectorized
// loads -> Normal, everything else -> None.
TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::CompressVectorize)
    return TTI::CastContextHint::Masked;
  if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
      !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    // Check whether the reorder amounts to a pure reversal of the loaded
    // elements.
    SmallVector<int> Mask;
    inversePermutation(Indices: TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts: Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
15082
15083InstructionCost
15084BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
15085 SmallPtrSetImpl<Value *> &CheckedExtracts) {
15086 ArrayRef<Value *> VL = E->Scalars;
15087
15088 Type *ScalarTy = getValueType(V: VL[0]);
15089 if (!isValidElementType(Ty: ScalarTy))
15090 return InstructionCost::getInvalid();
15091 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
15092
15093 // If we have computed a smaller type for the expression, update VecTy so
15094 // that the costs will be accurate.
15095 auto It = MinBWs.find(Val: E);
15096 Type *OrigScalarTy = ScalarTy;
15097 if (It != MinBWs.end()) {
15098 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
15099 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
15100 if (VecTy)
15101 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
15102 } else if (E->Idx == 0 && isReducedBitcastRoot()) {
15103 const TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
15104 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
15105 }
15106 auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
15107 unsigned EntryVF = E->getVectorFactor();
15108 auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
15109
15110 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
15111 if (allConstant(VL))
15112 return 0;
15113 if (isa<InsertElementInst>(Val: VL[0]))
15114 return InstructionCost::getInvalid();
15115 if (isa<CmpInst>(Val: VL.front()))
15116 ScalarTy = VL.front()->getType();
15117 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
15118 E, ScalarTy, Params&: *TTI, Params&: VectorizedVals, Params&: *this, Params&: CheckedExtracts);
15119 }
15120 if (E->State == TreeEntry::SplitVectorize) {
15121 assert(E->CombinedEntriesWithIndices.size() == 2 &&
15122 "Expected exactly 2 combined entries.");
15123 assert(E->ReuseShuffleIndices.empty() && "Expected empty reuses mask.");
15124 InstructionCost VectorCost = 0;
15125 if (E->ReorderIndices.empty()) {
15126 VectorCost = ::getShuffleCost(
15127 TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: FinalVecTy, Mask: {}, CostKind,
15128 Index: E->CombinedEntriesWithIndices.back().second,
15129 SubTp: getWidenedType(
15130 ScalarTy,
15131 VF: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15132 ->getVectorFactor()));
15133 } else {
15134 unsigned CommonVF =
15135 std::max(a: VectorizableTree[E->CombinedEntriesWithIndices.front().first]
15136 ->getVectorFactor(),
15137 b: VectorizableTree[E->CombinedEntriesWithIndices.back().first]
15138 ->getVectorFactor());
15139 VectorCost = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc,
15140 Tp: getWidenedType(ScalarTy, VF: CommonVF),
15141 Mask: E->getSplitMask(), CostKind);
15142 }
15143 LLVM_DEBUG(dumpTreeCosts(E, 0, VectorCost, 0, "Calculated costs for Tree"));
15144 return VectorCost;
15145 }
15146 InstructionCost CommonCost = 0;
15147 SmallVector<int> Mask;
15148 if (!E->ReorderIndices.empty() && E->State != TreeEntry::CompressVectorize &&
15149 (E->State != TreeEntry::StridedVectorize ||
15150 !isReverseOrder(Order: E->ReorderIndices))) {
15151 SmallVector<int> NewMask;
15152 if (E->getOpcode() == Instruction::Store) {
15153 // For stores the order is actually a mask.
15154 NewMask.resize(N: E->ReorderIndices.size());
15155 copy(Range: E->ReorderIndices, Out: NewMask.begin());
15156 } else {
15157 inversePermutation(Indices: E->ReorderIndices, Mask&: NewMask);
15158 }
15159 ::addMask(Mask, SubMask: NewMask);
15160 }
15161 if (!E->ReuseShuffleIndices.empty())
15162 ::addMask(Mask, SubMask: E->ReuseShuffleIndices);
15163 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
15164 CommonCost =
15165 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
15166 assert((E->State == TreeEntry::Vectorize ||
15167 E->State == TreeEntry::ScatterVectorize ||
15168 E->State == TreeEntry::StridedVectorize ||
15169 E->State == TreeEntry::CompressVectorize) &&
15170 "Unhandled state");
15171 assert(E->getOpcode() &&
15172 ((allSameType(VL) && allSameBlock(VL)) ||
15173 (E->getOpcode() == Instruction::GetElementPtr &&
15174 E->getMainOp()->getType()->isPointerTy()) ||
15175 E->hasCopyableElements()) &&
15176 "Invalid VL");
15177 Instruction *VL0 = E->getMainOp();
15178 unsigned ShuffleOrOp =
15179 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15180 if (E->CombinedOp != TreeEntry::NotCombinedOp)
15181 ShuffleOrOp = E->CombinedOp;
15182 SmallSetVector<Value *, 16> UniqueValues;
15183 SmallVector<unsigned, 16> UniqueIndexes;
15184 for (auto [Idx, V] : enumerate(First&: VL))
15185 if (UniqueValues.insert(X: V))
15186 UniqueIndexes.push_back(Elt: Idx);
15187 const unsigned Sz = UniqueValues.size();
15188 SmallBitVector UsedScalars(Sz, false);
15189 for (unsigned I = 0; I < Sz; ++I) {
15190 if (isa<Instruction>(Val: UniqueValues[I]) &&
15191 !E->isCopyableElement(V: UniqueValues[I]) &&
15192 getTreeEntries(V: UniqueValues[I]).front() == E)
15193 continue;
15194 UsedScalars.set(I);
15195 }
15196 auto GetCastContextHint = [&](Value *V) {
15197 if (ArrayRef<TreeEntry *> OpTEs = getTreeEntries(V); OpTEs.size() == 1)
15198 return getCastContextHint(TE: *OpTEs.front());
15199 InstructionsState SrcState = getSameOpcode(VL: E->getOperand(OpIdx: 0), TLI: *TLI);
15200 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
15201 !SrcState.isAltShuffle())
15202 return TTI::CastContextHint::GatherScatter;
15203 return TTI::CastContextHint::None;
15204 };
15205 auto GetCostDiff =
15206 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
15207 function_ref<InstructionCost(InstructionCost)> VectorCost) {
15208 // Calculate the cost of this instruction.
15209 InstructionCost ScalarCost = 0;
15210 if (isa<CastInst, CallInst>(Val: VL0)) {
15211 // For some of the instructions no need to calculate cost for each
15212 // particular instruction, we can use the cost of the single
15213 // instruction x total number of scalar instructions.
15214 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
15215 } else {
15216 for (unsigned I = 0; I < Sz; ++I) {
15217 if (UsedScalars.test(Idx: I))
15218 continue;
15219 ScalarCost += ScalarEltCost(I);
15220 }
15221 }
15222
15223 InstructionCost VecCost = VectorCost(CommonCost);
15224 // Check if the current node must be resized, if the parent node is not
15225 // resized.
15226 if (It != MinBWs.end() && !UnaryInstruction::isCast(Opcode: E->getOpcode()) &&
15227 E->Idx != 0 &&
15228 (E->getOpcode() != Instruction::Load || E->UserTreeIndex)) {
15229 const EdgeInfo &EI = E->UserTreeIndex;
15230 if (!EI.UserTE->hasState() ||
15231 EI.UserTE->getOpcode() != Instruction::Select ||
15232 EI.EdgeIdx != 0) {
15233 auto UserBWIt = MinBWs.find(Val: EI.UserTE);
15234 Type *UserScalarTy =
15235 (EI.UserTE->isGather() ||
15236 EI.UserTE->State == TreeEntry::SplitVectorize)
15237 ? EI.UserTE->Scalars.front()->getType()
15238 : EI.UserTE->getOperand(OpIdx: EI.EdgeIdx).front()->getType();
15239 if (UserBWIt != MinBWs.end())
15240 UserScalarTy = IntegerType::get(C&: ScalarTy->getContext(),
15241 NumBits: UserBWIt->second.first);
15242 if (ScalarTy != UserScalarTy) {
15243 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
15244 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: UserScalarTy);
15245 unsigned VecOpcode;
15246 auto *UserVecTy = getWidenedType(ScalarTy: UserScalarTy, VF: E->Scalars.size());
15247 if (BWSz > SrcBWSz)
15248 VecOpcode = Instruction::Trunc;
15249 else
15250 VecOpcode =
15251 It->second.second ? Instruction::SExt : Instruction::ZExt;
15252 TTI::CastContextHint CCH = GetCastContextHint(VL0);
15253 VecCost += TTI->getCastInstrCost(Opcode: VecOpcode, Dst: UserVecTy, Src: VecTy, CCH,
15254 CostKind);
15255 }
15256 }
15257 }
15258 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
15259 ScalarCost, "Calculated costs for Tree"));
15260 return VecCost - ScalarCost;
15261 };
15262 // Calculate cost difference from vectorizing set of GEPs.
15263 // Negative value means vectorizing is profitable.
15264 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
15265 assert((E->State == TreeEntry::Vectorize ||
15266 E->State == TreeEntry::StridedVectorize ||
15267 E->State == TreeEntry::CompressVectorize) &&
15268 "Entry state expected to be Vectorize, StridedVectorize or "
15269 "MaskedLoadCompressVectorize here.");
15270 InstructionCost ScalarCost = 0;
15271 InstructionCost VecCost = 0;
15272 std::tie(args&: ScalarCost, args&: VecCost) = getGEPCosts(
15273 TTI: *TTI, Ptrs, BasePtr, Opcode: E->getOpcode(), CostKind, ScalarTy: OrigScalarTy, VecTy);
15274 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
15275 "Calculated GEPs cost for Tree"));
15276
15277 return VecCost - ScalarCost;
15278 };
15279
15280 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
15281 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VL: VI ? VI : VL);
15282 if (MinMaxID == Intrinsic::not_intrinsic)
15283 return InstructionCost::getInvalid();
15284 Type *CanonicalType = Ty;
15285 if (CanonicalType->isPtrOrPtrVectorTy())
15286 CanonicalType = CanonicalType->getWithNewType(EltTy: IntegerType::get(
15287 C&: CanonicalType->getContext(),
15288 NumBits: DL->getTypeSizeInBits(Ty: CanonicalType->getScalarType())));
15289
15290 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
15291 {CanonicalType, CanonicalType});
15292 InstructionCost IntrinsicCost =
15293 TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
15294 // If the selects are the only uses of the compares, they will be
15295 // dead and we can adjust the cost by removing their cost.
15296 if (VI && SelectOnly) {
15297 assert((!Ty->isVectorTy() || SLPReVec) &&
15298 "Expected only for scalar type.");
15299 auto *CI = cast<CmpInst>(Val: VI->getOperand(i: 0));
15300 IntrinsicCost -= TTI->getCmpSelInstrCost(
15301 Opcode: CI->getOpcode(), ValTy: Ty, CondTy: Builder.getInt1Ty(), VecPred: CI->getPredicate(),
15302 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
15303 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I: CI);
15304 }
15305 return IntrinsicCost;
15306 };
15307 auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
15308 Instruction *VI) {
15309 InstructionCost Cost = canConvertToFMA(VL: VI, S, DT&: *DT, DL: *DL, TTI, TLI: *TLI);
15310 return Cost;
15311 };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    // Vector PHIs are free; only account for extra scalar cost coming from
    // operand entries that are reused via shuffles.
    // Count reused scalars.
    InstructionCost ScalarCost = 0;
    SmallPtrSet<const TreeEntry *, 4> CountedOps;
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(Val: V);
      if (!PHI)
        continue;

      ValueList Operands(PHI->getNumIncomingValues(), nullptr);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        Value *Op = PHI->getIncomingValue(i: I);
        Operands[I] = Op;
      }
      // Charge TCC_Basic per extra (reused) lane of the operand entry; each
      // operand entry is counted only once via CountedOps.
      if (const TreeEntry *OpTE =
              getSameValuesTreeEntry(V: Operands.front(), VL: Operands))
        if (CountedOps.insert(Ptr: OpTE).second &&
            !OpTE->ReuseShuffleIndices.empty())
          ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                          OpTE->Scalars.size());
    }

    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    // Extract lanes are collected into DemandedElts and charged in bulk as
    // scalarization overhead (credited against the vector cost below).
    APInt DemandedElts;
    VectorType *SrcVecTy = nullptr;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      auto *I = cast<Instruction>(Val: UniqueValues[Idx]);
      // Lazily derive the source vector type from the first real scalar:
      // either the extractelement's vector operand or a widened type sized
      // by the aggregate's element count for extractvalue.
      if (!SrcVecTy) {
        if (ShuffleOrOp == Instruction::ExtractElement) {
          auto *EE = cast<ExtractElementInst>(Val: I);
          SrcVecTy = EE->getVectorOperandType();
        } else {
          auto *EV = cast<ExtractValueInst>(Val: I);
          Type *AggregateTy = EV->getAggregateOperand()->getType();
          unsigned NumElts;
          if (auto *ATy = dyn_cast<ArrayType>(Val: AggregateTy))
            NumElts = ATy->getNumElements();
          else
            NumElts = AggregateTy->getStructNumElements();
          SrcVecTy = getWidenedType(ScalarTy: OrigScalarTy, VF: NumElts);
        }
      }
      if (I->hasOneUse()) {
        Instruction *Ext = I->user_back();
        if ((isa<SExtInst>(Val: Ext) || isa<ZExtInst>(Val: Ext)) &&
            all_of(Range: Ext->users(), P: IsaPred<GetElementPtrInst>)) {
          // Use getExtractWithExtendCost() to calculate the cost of
          // extractelement/ext pair.
          InstructionCost Cost = TTI->getExtractWithExtendCost(
              Opcode: Ext->getOpcode(), Dst: Ext->getType(), VecTy: SrcVecTy, Index: *getExtractIndex(E: I),
              CostKind);
          // Subtract the cost of s|zext which is subtracted separately.
          Cost -= TTI->getCastInstrCost(
              Opcode: Ext->getOpcode(), Dst: Ext->getType(), Src: I->getType(),
              CCH: TTI::getCastContextHint(I: Ext), CostKind, I: Ext);
          return Cost;
        }
      }
      // Record this lane for the bulk scalarization-overhead credit.
      if (DemandedElts.isZero())
        DemandedElts = APInt::getZero(numBits: getNumElements(Ty: SrcVecTy));
      DemandedElts.setBit(*getExtractIndex(E: I));
      return InstructionCost(TTI::TCC_Free);
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      return CommonCost - (DemandedElts.isZero()
                               ? TTI::TCC_Free
                               : TTI.getScalarizationOverhead(
                                     Ty: SrcVecTy, DemandedElts, /*Insert=*/false,
                                     /*Extract=*/true, CostKind));
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(Val: VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();

    unsigned NumOfParts = ::getNumberOfParts(TTI: *TTI, VecTy: SrcVecTy);

    // Compute the range [OffsetBeg, OffsetEnd] of destination lanes and an
    // InsertMask mapping each destination lane to its source scalar (+1; 0
    // means "not written by this bundle").
    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    unsigned OffsetBeg = *getElementIndex(Inst: VL.front());
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
    for (auto [I, V] : enumerate(First: VL.drop_front())) {
      unsigned Idx = *getElementIndex(Inst: V);
      if (OffsetBeg > Idx)
        OffsetBeg = Idx;
      else if (OffsetEnd < Idx)
        OffsetEnd = Idx;
      InsertMask[Idx] = I + 1;
    }
    // Estimate the per-register chunk size used to place the subvector.
    unsigned VecScalarsSz = PowerOf2Ceil(A: NumElts);
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil(A: (NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
                     VecScalarsSz;
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        a: PowerOf2Ceil(A: OffsetEnd - OffsetBeg + 1),
        b: ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    // Check if we can safely insert a subvector. If it is not possible, just
    // generate a whole-sized vector and shuffle the source vector and the new
    // subvector.
    if (OffsetBeg + InsertVecSz > VecSz) {
      // Align OffsetBeg to generate correct mask.
      OffsetBeg = alignDown(Value: OffsetBeg, Align: VecSz, Skew: Offset);
      InsertVecSz = VecSz;
    }

    APInt DemandedElts = APInt::getZero(numBits: NumElts);
    // TODO: Add support for Instruction::InsertValue.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      inversePermutation(Indices: E->ReorderIndices, Mask);
      Mask.append(NumInputs: InsertVecSz - Mask.size(), Elt: PoisonMaskElem);
    } else {
      Mask.assign(NumElts: VecSz, Elt: PoisonMaskElem);
      std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: InsertVecSz), value: 0);
    }
    // Rebuild Mask relative to OffsetBeg and record the demanded lanes;
    // IsIdentity tracks whether the scalars already land in order.
    bool IsIdentity = true;
    SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
    Mask.swap(RHS&: PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      unsigned InsertIdx = *getElementIndex(Inst: VL[PrevMask[I]]);
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Credit: the scalar insertelements are replaced by building the
    // demanded lanes of the vector directly.
    InstructionCost Cost = 0;
    Cost -=
        getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: SrcVecTy, DemandedElts,
                                 /*Insert*/ true, /*Extract*/ false, CostKind);

    // First cost - resize to actual vector size if not identity shuffle or
    // need to shift the vector.
    // Do not calculate the cost if the actual size is the register size and
    // we can merge this shuffle with the following SK_Select.
    auto *InsertVecTy = getWidenedType(ScalarTy, VF: InsertVecSz);
    if (!IsIdentity)
      Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc,
                               Tp: InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: [E](Value *V) {
      return !is_contained(Range: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
    }));
    // Second cost - permutation with subvector, if some elements are from the
    // initial vector or inserting a subvector.
    // TODO: Implement the analysis of the FirstInsert->getOperand(0)
    // subvector of ActualVecTy.
    SmallBitVector InMask =
        isUndefVector(V: FirstInsert->getOperand(i: 0),
                      UseMask: buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        auto *ActualVecTy = getWidenedType(ScalarTy, VF: VecSz);
        Cost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_InsertSubvector, Tp: ActualVecTy, Mask: {},
                                 CostKind, Index: OffsetBeg - Offset, SubTp: InsertVecTy);
      } else {
        // Build a two-source permute: lanes outside [OffsetBeg, OffsetEnd]
        // come from the original vector, lanes inside come from the new
        // subvector (offset by VecSz).
        for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
          Mask[I] = InMask.test(Idx: I) ? PoisonMaskElem : I;
        for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
             I <= End; ++I)
          if (Mask[I] != PoisonMaskElem)
            Mask[I] = I + VecSz;
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
          Mask[I] =
              ((I >= InMask.size()) || InMask.test(Idx: I)) ? PoisonMaskElem : I;
        Cost +=
            ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: InsertVecTy, Mask);
      }
    }
    return Cost;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    // When minimum-bitwidth analysis (MinBWs) demoted this entry or its
    // operand entry, the vectorized cast opcode may differ from the scalar
    // one (e.g. a zext may become a trunc, bitcast, or disappear).
    auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
    Type *SrcScalarTy = VL0->getOperand(i: 0)->getType();
    auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size());
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
        unsigned SrcScalarTyNumElements = getNumElements(Ty: SrcScalarTy);
        SrcScalarTy = IntegerType::get(C&: F->getContext(), NumBits: SrcBWSz);
        SrcVecTy =
            getWidenedType(ScalarTy: SrcScalarTy, VF: VL.size() * SrcScalarTyNumElements);
      }
      // Pick the vector opcode from the relative demoted bitwidths.
      unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      // Source was demoted as unsigned, so the int-to-fp cast becomes UIToFP.
      VecOpcode = Instruction::UIToFP;
    }
    auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
      assert(Idx == 0 && "Expected 0 index only");
      return TTI->getCastInstrCost(Opcode, Dst: VL0->getType(),
                                   Src: VL0->getOperand(i: 0)->getType(),
                                   CCH: TTI::getCastContextHint(I: VL0), CostKind, I: VL0);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // Do not count cost here if minimum bitwidth is in effect and it is just
      // a bitcast (here it is just a noop).
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
        return CommonCost;
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(i: 0));

      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
          all_of(Range: *UserIgnoreList, P: [](Value *V) {
            auto *I = cast<Instruction>(Val: V);
            return is_contained(Set: {Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
                                 Instruction::Xor},
                                Element: I->getOpcode());
          });
      // An extend at the root of an arithmetic reduction is expected to fold
      // into the reduction, so it is not charged here.
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
        return CommonCost;
      return CommonCost +
             TTI->getCastInstrCost(Opcode: VecOpcode, Dst: VecTy, Src: SrcVecTy, CCH, CostKind,
                                   I: VecOpcode == Opcode ? VI : nullptr);
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
    // Derive the common predicate (and its swapped form) from VL0; if VL0 is
    // not a cmp or select-of-cmp, fall back to a "bad" sentinel predicate.
    CmpPredicate VecPred, SwappedVecPred;
    auto MatchCmp = m_Cmp(Pred&: VecPred, L: m_Value(), R: m_Value());
    if (match(V: VL0, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) ||
        match(V: VL0, P: MatchCmp))
      SwappedVecPred = CmpInst::getSwappedPredicate(pred: VecPred);
    else
      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      if (!isa<SelectInst>(Val: UniqueValues[Idx]))
        return TTI->getInstructionCost(U: cast<Instruction>(Val: UniqueValues[Idx]),
                                       CostKind);

      auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
      CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
                                     ? CmpInst::BAD_FCMP_PREDICATE
                                     : CmpInst::BAD_ICMP_PREDICATE;
      auto MatchCmp = m_Cmp(Pred&: CurrentPred, L: m_Value(), R: m_Value());
      // If this lane's predicate disagrees with the common (or swapped)
      // predicate, demote the bundle-wide predicate to the "bad" sentinel.
      if ((!match(V: VI, P: m_Select(C: MatchCmp, L: m_Value(), R: m_Value())) &&
           !match(V: VI, P: MatchCmp)) ||
          (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
           CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
                                       ? CmpInst::BAD_FCMP_PREDICATE
                                       : CmpInst::BAD_ICMP_PREDICATE;

      InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
          Opcode: E->getOpcode(), ValTy: OrigScalarTy, CondTy: Builder.getInt1Ty(), VecPred: CurrentPred,
          CostKind, Op1Info: getOperandInfo(Ops: VI->getOperand(i: 0)),
          Op2Info: getOperandInfo(Ops: VI->getOperand(i: 1)), I: VI);
      // Prefer the min/max intrinsic cost when the cmp+select converts.
      InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
      if (IntrinsicCost.isValid())
        ScalarCost = IntrinsicCost;

      return ScalarCost;
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());

      InstructionCost VecCost =
          TTI->getCmpSelInstrCost(Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred,
                                  CostKind, Op1Info: getOperandInfo(Ops: E->getOperand(OpIdx: 0)),
                                  Op2Info: getOperandInfo(Ops: E->getOperand(OpIdx: 1)), I: VL0);
      if (auto *SI = dyn_cast<SelectInst>(Val: VL0)) {
        auto *CondType =
            getWidenedType(ScalarTy: SI->getCondition()->getType(), VF: VL.size());
        unsigned CondNumElements = CondType->getNumElements();
        unsigned VecTyNumElements = getNumElements(Ty: VecTy);
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is fixed vector type, we
          // need to duplicate the condition value.
          VecCost += ::getShuffleCost(
              TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: CondType,
              Mask: createReplicatedMask(ReplicationFactor: VecTyNumElements / CondNumElements,
                                   VF: CondNumElements));
        }
      }
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    // Combined node: both scalar and vector sides are costed as the min/max
    // intrinsic form (scalar on OrigScalarTy, vector on VecTy).
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecCost = GetMinMaxCost(VecTy);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::FMulAdd: {
    // Combined node: fmul + fadd/fsub pairs costed as the fmuladd intrinsic.
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      return GetFMulAddCost(E->getOperations(),
                            cast<Instruction>(Val: UniqueValues[Idx]));
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      // Intersect fast-math flags across all scalars and their fmul
      // operands so the vector intrinsic is costed with the common flags.
      FastMathFlags FMF;
      FMF.set();
      for (Value *V : E->Scalars) {
        if (auto *FPCI = dyn_cast<FPMathOperator>(Val: V)) {
          FMF &= FPCI->getFastMathFlags();
          if (auto *FPCIOp = dyn_cast<FPMathOperator>(Val: FPCI->getOperand(i: 0)))
            FMF &= FPCIOp->getFastMathFlags();
        }
      }
      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
                                  {VecTy, VecTy, VecTy}, FMF);
      InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
      return VecCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::ReducedBitcast:
  case TreeEntry::ReducedBitcastBSwap: {
    // Combined node: shl(zext(...)) chains replaced by a vector-to-integer
    // bitcast (plus a bswap for the byte-reversed variant).
    auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *Shl = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
      if (!Shl)
        return InstructionCost(TTI::TCC_Free);
      // Scalar side: the shl plus its feeding zext (when present).
      InstructionCost ScalarCost = TTI.getInstructionCost(U: Shl, CostKind);
      auto *ZExt = dyn_cast<Instruction>(Val: Shl->getOperand(i: 0));
      if (!ZExt)
        return ScalarCost;
      ScalarCost += TTI.getInstructionCost(U: ZExt, CostKind);
      return ScalarCost;
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
      TTI::CastContextHint CastCtx =
          getCastContextHint(TE: *getOperandEntry(E: LhsTE, /*Idx=*/0));
      Type *SrcScalarTy = cast<ZExtInst>(Val: LhsTE->getMainOp())->getSrcTy();
      auto *SrcVecTy = getWidenedType(ScalarTy: SrcScalarTy, VF: LhsTE->getVectorFactor());
      InstructionCost BitcastCost = TTI.getCastInstrCost(
          Opcode: Instruction::BitCast, Dst: ScalarTy, Src: SrcVecTy, CCH: CastCtx, CostKind);
      if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
        // Byte-reversed variant also needs a bswap on the packed integer,
        // plus a widening zext if the packed type is narrower than ScalarTy.
        auto *SrcType = IntegerType::getIntNTy(
            C&: ScalarTy->getContext(),
            N: DL->getTypeSizeInBits(Ty: SrcScalarTy) * EntryVF);
        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
        InstructionCost IntrinsicCost =
            TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
        BitcastCost += IntrinsicCost;
        if (SrcType != ScalarTy) {
          BitcastCost +=
              TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
                                   CCH: TTI::CastContextHint::None, CostKind);
        }
      }
      return BitcastCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::ReducedBitcastLoads:
  case TreeEntry::ReducedBitcastBSwapLoads: {
    // Same as ReducedBitcast(BSwap), but the scalars load their inputs, so
    // the vector side is costed as one wide integer load instead of a cast.
    auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *Shl = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
      if (!Shl)
        return InstructionCost(TTI::TCC_Free);
      // Scalar side: shl + zext + load chain (whichever parts are present).
      InstructionCost ScalarCost = TTI.getInstructionCost(U: Shl, CostKind);
      auto *ZExt = dyn_cast<Instruction>(Val: Shl->getOperand(i: 0));
      if (!ZExt)
        return ScalarCost;
      ScalarCost += TTI.getInstructionCost(U: ZExt, CostKind);
      auto *Load = dyn_cast<Instruction>(Val: ZExt->getOperand(i: 0));
      if (!Load)
        return ScalarCost;
      ScalarCost += TTI.getInstructionCost(U: Load, CostKind);
      return ScalarCost;
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      const TreeEntry *LhsTE = getOperandEntry(E, /*Idx=*/0);
      const TreeEntry *LoadTE = getOperandEntry(E: LhsTE, /*Idx=*/0);
      auto *LI0 = cast<LoadInst>(Val: LoadTE->getMainOp());
      auto *SrcType = IntegerType::getIntNTy(
          C&: ScalarTy->getContext(),
          N: DL->getTypeSizeInBits(Ty: LI0->getType()) * EntryVF);
      InstructionCost LoadCost =
          TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: SrcType, Alignment: LI0->getAlign(),
                              AddressSpace: LI0->getPointerAddressSpace(), CostKind);
      if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
        IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
        InstructionCost IntrinsicCost =
            TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
        LoadCost += IntrinsicCost;
        if (SrcType != ScalarTy) {
          LoadCost +=
              TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: SrcType,
                                   CCH: TTI::CastContextHint::None, CostKind);
        }
      }
      return LoadCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::ReducedCmpBitcast: {
    // Combined node: a bundle of selects replaced by bitcasting the vector
    // cmp result mask to an integer (plus a zext if narrower than ScalarTy).
    auto GetScalarCost = [&, &TTI = *TTI](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);
      auto *Sel = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
      if (!Sel)
        return InstructionCost(TTI::TCC_Free);
      InstructionCost ScalarCost = TTI.getInstructionCost(U: Sel, CostKind);
      return ScalarCost;
    };
    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
      Type *CmpTy = CmpInst::makeCmpResultType(opnd_type: VecTy);
      auto *DstTy =
          IntegerType::getIntNTy(C&: ScalarTy->getContext(), N: E->getVectorFactor());
      InstructionCost BitcastCost =
          TTI.getCastInstrCost(Opcode: Instruction::BitCast, Dst: DstTy, Src: CmpTy,
                               CCH: TTI::CastContextHint::None, CostKind);
      if (DstTy != ScalarTy) {
        BitcastCost +=
            TTI.getCastInstrCost(Opcode: Instruction::ZExt, Dst: ScalarTy, Src: DstTy,
                                 CCH: TTI::CastContextHint::None, CostKind);
      }
      return BitcastCost + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(Val: UniqueValues[Idx]))
        return InstructionCost(TTI::TCC_Free);

      // We cannot retrieve the operand from UniqueValues[Idx] because an
      // interchangeable instruction may be used. The order and the actual
      // operand might differ from what is retrieved from UniqueValues[Idx].
      unsigned Lane = UniqueIndexes[Idx];
      Value *Op1 = E->getOperand(OpIdx: 0)[Lane];
      Value *Op2;
      SmallVector<const Value *, 2> Operands(1, Op1);
      if (isa<UnaryOperator>(Val: UniqueValues[Idx])) {
        Op2 = Op1;
      } else {
        Op2 = E->getOperand(OpIdx: 1)[Lane];
        Operands.push_back(Elt: Op2);
      }
      TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(V: Op1);
      TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(V: Op2);
      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
          Opcode: ShuffleOrOp, Ty: OrigScalarTy, CostKind, Opd1Info: Op1Info, Opd2Info: Op2Info, Args: Operands);
      // An fadd/fsub fed by an fmul may fold into fmuladd; use that cost
      // when the conversion is possible (valid result).
      if (auto *I = dyn_cast<Instruction>(Val: UniqueValues[Idx]);
          I && (ShuffleOrOp == Instruction::FAdd ||
                ShuffleOrOp == Instruction::FSub)) {
        InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
        if (IntrinsicCost.isValid())
          ScalarCost = IntrinsicCost;
      }
      return ScalarCost;
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // An 'and' with a constant that keeps all demoted bits (enough
      // trailing ones) is a no-op after minimum-bitwidth demotion.
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
          ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
          if (all_of(Range&: Ops, P: [&](Value *Op) {
                auto *CI = dyn_cast<ConstantInt>(Val: Op);
                return CI && CI->getValue().countr_one() >= It->second.first;
              }))
            return CommonCost;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(Val: VL0) ? 0 : 1;
      TTI::OperandValueInfo Op1Info = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
      TTI::OperandValueInfo Op2Info = getOperandInfo(Ops: E->getOperand(OpIdx));
      return TTI->getArithmeticInstrCost(Opcode: ShuffleOrOp, Ty: VecTy, CostKind, Opd1Info: Op1Info,
                                         Opd2Info: Op2Info, Args: {}, CxtI: nullptr, TLibInfo: TLI) +
             CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    // GEP bundles are costed purely as the scalar-vs-vector GEP difference.
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    // Scalar side: one scalar load per unique value.
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(Val: UniqueValues[Idx]);
      return TTI->getMemoryOpCost(Opcode: Instruction::Load, Src: OrigScalarTy,
                                  Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
                                  CostKind, OpdInfo: TTI::OperandValueInfo(), I: VI);
    };
    auto *LI0 = cast<LoadInst>(Val: VL0);
    // Vector side depends on how this entry will be materialized:
    // consecutive/interleaved load, strided load, compressed (masked)
    // load, or masked gather.
    auto GetVectorCost = [&](InstructionCost CommonCost) {
      InstructionCost VecLdCost;
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Opcode: Instruction::Load, VecTy, Factor, Indices: {}, Alignment: LI0->getAlign(),
              AddressSpace: LI0->getPointerAddressSpace(), CostKind);

        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Opcode: Instruction::Load, Src: VecTy, Alignment: LI0->getAlign(),
              AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
        }
        break;
      case TreeEntry::StridedVectorize: {
        // Strided load via vp.strided.load on the recorded strided type,
        // plus a bitcast back to VecTy if the types differ.
        const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
        FixedVectorType *StridedLoadTy = SPtrInfo.Ty;
        assert(StridedLoadTy && "Missing StridedPointerInfo for tree entry.");
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_load,
                                       StridedLoadTy, LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        if (StridedLoadTy != VecTy)
          VecLdCost +=
              TTI->getCastInstrCost(Opcode: Instruction::BitCast, Dst: VecTy, Src: StridedLoadTy,
                                    CCH: getCastContextHint(TE: *E), CostKind);

        break;
      }
      case TreeEntry::CompressVectorize: {
        // Re-run the masked-load-compress analysis (on reorder-adjusted
        // scalars) to get the load type, mask, and interleave factor, and
        // cache the result for codegen in CompressEntryToData.
        bool IsMasked;
        unsigned InterleaveFactor;
        SmallVector<int> CompressMask;
        VectorType *LoadVecTy;
        SmallVector<Value *> Scalars(VL);
        if (!E->ReorderIndices.empty()) {
          SmallVector<int> Mask(E->ReorderIndices.begin(),
                                E->ReorderIndices.end());
          reorderScalars(Scalars, Mask);
        }
        SmallVector<Value *> PointerOps(Scalars.size());
        for (auto [I, V] : enumerate(First&: Scalars))
          PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
        [[maybe_unused]] bool IsVectorized = isMaskedLoadCompress(
            VL: Scalars, PointerOps, Order: E->ReorderIndices, TTI: *TTI, DL: *DL, SE&: *SE, AC&: *AC, DT: *DT,
            TLI: *TLI, AreAllUsersVectorized: [](Value *) { return true; }, IsMasked, InterleaveFactor,
            CompressMask, LoadVecTy);
        assert(IsVectorized && "Failed to vectorize load");
        CompressEntryToData.try_emplace(Key: E, Args&: CompressMask, Args&: LoadVecTy,
                                        Args&: InterleaveFactor, Args&: IsMasked);
        Align CommonAlignment = LI0->getAlign();
        if (InterleaveFactor) {
          VecLdCost = TTI->getInterleavedMemoryOpCost(
              Opcode: Instruction::Load, VecTy: LoadVecTy, Factor: InterleaveFactor, Indices: {},
              Alignment: CommonAlignment, AddressSpace: LI0->getPointerAddressSpace(), CostKind);
        } else if (IsMasked) {
          VecLdCost = TTI->getMemIntrinsicInstrCost(
              MICA: MemIntrinsicCostAttributes(Intrinsic::masked_load, LoadVecTy,
                                         CommonAlignment,
                                         LI0->getPointerAddressSpace()),
              CostKind);
          // TODO: include this cost into CommonCost.
          VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
                                        Tp: LoadVecTy, Mask: CompressMask, CostKind);
        } else {
          VecLdCost = TTI->getMemoryOpCost(
              Opcode: Instruction::Load, Src: LoadVecTy, Alignment: CommonAlignment,
              AddressSpace: LI0->getPointerAddressSpace(), CostKind, OpdInfo: TTI::OperandValueInfo());
          // TODO: include this cost into CommonCost.
          VecLdCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
                                        Tp: LoadVecTy, Mask: CompressMask, CostKind);
        }
        break;
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(VL: UniqueValues.getArrayRef());
        VecLdCost = TTI->getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::masked_gather, VecTy,
                                       LI0->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
        break;
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::SplitVectorize:
      case TreeEntry::NeedToGather:
        llvm_unreachable("Unexpected vectorization state.");
      }
      return VecLdCost + CommonCost;
    };

    InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
    // If this node generates masked gather load then it is not a terminal node.
    // Hence address operand cost is estimated separately.
    if (E->State == TreeEntry::ScatterVectorize)
      return Cost;

    // Estimate cost of GEPs since this tree node is a terminator.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(First&: VL))
      PointerOps[I] = cast<LoadInst>(Val: V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    // Scalar side: one scalar store per lane.
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(Val: VL[Idx]);
      TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: VI->getValueOperand());
      return TTI->getMemoryOpCost(Opcode: Instruction::Store, Src: OrigScalarTy,
                                  Alignment: VI->getAlign(), AddressSpace: VI->getPointerAddressSpace(),
                                  CostKind, OpdInfo: OpInfo, I: VI);
    };
    // The base store: the first lane in (reorder-adjusted) memory order.
    auto *BaseSI =
        cast<StoreInst>(Val: IsReorder ? VL[E->ReorderIndices.front()] : VL0);
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      // We know that we can merge the stores. Calculate the cost.
      InstructionCost VecStCost;
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(VL: UniqueValues.getArrayRef());
        VecStCost = TTI->getMemIntrinsicInstrCost(
            MICA: MemIntrinsicCostAttributes(Intrinsic::experimental_vp_strided_store,
                                       VecTy, BaseSI->getPointerOperand(),
                                       /*VariableMask=*/false, CommonAlignment),
            CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
          // Interleaved store cost already covers the shuffle: drop the
          // shared shuffle cost so it is not counted twice.
          CommonCost = 0;
          VecStCost = TTI->getInterleavedMemoryOpCost(
              Opcode: Instruction::Store, VecTy, Factor, Indices: {}, Alignment: BaseSI->getAlign(),
              AddressSpace: BaseSI->getPointerAddressSpace(), CostKind);
        } else {
          TTI::OperandValueInfo OpInfo = getOperandInfo(Ops: E->getOperand(OpIdx: 0));
          VecStCost = TTI->getMemoryOpCost(
              Opcode: Instruction::Store, Src: VecTy, Alignment: BaseSI->getAlign(),
              AddressSpace: BaseSI->getPointerAddressSpace(), CostKind, OpdInfo: OpInfo);
        }
      }
      return VecStCost + CommonCost;
    };
    // Collect pointer operands in memory order for the GEP cost estimate.
    SmallVector<Value *> PointerOps(VL.size());
    for (auto [I, V] : enumerate(First&: VL)) {
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(Val: V)->getPointerOperand();
    }

    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    // Scalar side: intrinsic cost when the call maps to a vectorizable
    // intrinsic, otherwise a plain call cost.
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(Val: UniqueValues[Idx]);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      if (ID != Intrinsic::not_intrinsic) {
        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
        return TTI->getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
      }
      return TTI->getCallInstrCost(F: CI->getCalledFunction(),
                                   RetTy: CI->getFunctionType()->getReturnType(),
                                   Tys: CI->getFunctionType()->params(), CostKind);
    };
    auto GetVectorCost = [=](InstructionCost CommonCost) {
      auto *CI = cast<CallInst>(Val: VL0);
      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VF: VecTy->getNumElements(),
          MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
      // Take the cheaper of the vector-intrinsic and vector-library costs.
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      return std::min(a: VecCallCosts.first, b: VecCallCosts.second) + CommonCost;
    };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
16052 case Instruction::ShuffleVector: {
16053 if (!SLPReVec || E->isAltShuffle())
16054 assert(E->isAltShuffle() &&
16055 ((Instruction::isBinaryOp(E->getOpcode()) &&
16056 Instruction::isBinaryOp(E->getAltOpcode())) ||
16057 (Instruction::isCast(E->getOpcode()) &&
16058 Instruction::isCast(E->getAltOpcode())) ||
16059 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16060 "Invalid Shuffle Vector Operand");
16061 // Try to find the previous shuffle node with the same operands and same
16062 // main/alternate ops.
16063 auto TryFindNodeWithEqualOperands = [=]() {
16064 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16065 if (TE.get() == E)
16066 break;
16067 if (TE->hasState() && TE->isAltShuffle() &&
16068 ((TE->getOpcode() == E->getOpcode() &&
16069 TE->getAltOpcode() == E->getAltOpcode()) ||
16070 (TE->getOpcode() == E->getAltOpcode() &&
16071 TE->getAltOpcode() == E->getOpcode())) &&
16072 TE->hasEqualOperands(TE: *E))
16073 return true;
16074 }
16075 return false;
16076 };
16077 auto GetScalarCost = [&](unsigned Idx) {
16078 if (isa<PoisonValue>(Val: UniqueValues[Idx]))
16079 return InstructionCost(TTI::TCC_Free);
16080
16081 auto *VI = cast<Instruction>(Val: UniqueValues[Idx]);
16082 assert(E->getMatchingMainOpOrAltOp(VI) &&
16083 "Unexpected main/alternate opcode");
16084 (void)E;
16085 return TTI->getInstructionCost(U: VI, CostKind);
16086 };
16087 // Need to clear CommonCost since the final shuffle cost is included into
16088 // vector cost.
16089 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
16090 // VecCost is equal to sum of the cost of creating 2 vectors
16091 // and the cost of creating shuffle.
16092 InstructionCost VecCost = 0;
16093 if (TryFindNodeWithEqualOperands()) {
16094 LLVM_DEBUG({
16095 dbgs() << "SLP: diamond match for alternate node found.\n";
16096 E->dump();
16097 });
16098 // No need to add new vector costs here since we're going to reuse
16099 // same main/alternate vector ops, just do different shuffling.
16100 } else if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
16101 VecCost =
16102 TTIRef.getArithmeticInstrCost(Opcode: E->getOpcode(), Ty: VecTy, CostKind);
16103 VecCost +=
16104 TTIRef.getArithmeticInstrCost(Opcode: E->getAltOpcode(), Ty: VecTy, CostKind);
16105 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
16106 auto *MaskTy = getWidenedType(ScalarTy: Builder.getInt1Ty(), VF: VL.size());
16107 VecCost = TTIRef.getCmpSelInstrCost(
16108 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy, VecPred: CI0->getPredicate(), CostKind,
16109 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
16110 I: VL0);
16111 VecCost += TTIRef.getCmpSelInstrCost(
16112 Opcode: E->getOpcode(), ValTy: VecTy, CondTy: MaskTy,
16113 VecPred: cast<CmpInst>(Val: E->getAltOp())->getPredicate(), CostKind,
16114 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
16115 I: E->getAltOp());
16116 } else {
16117 Type *SrcSclTy = E->getMainOp()->getOperand(i: 0)->getType();
16118 auto *SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
16119 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
16120 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
16121 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
16122 unsigned SrcBWSz =
16123 DL->getTypeSizeInBits(Ty: E->getMainOp()->getOperand(i: 0)->getType());
16124 if (SrcIt != MinBWs.end()) {
16125 SrcBWSz = SrcIt->second.first;
16126 SrcSclTy = IntegerType::get(C&: SrcSclTy->getContext(), NumBits: SrcBWSz);
16127 SrcTy = getWidenedType(ScalarTy: SrcSclTy, VF: VL.size());
16128 }
16129 if (BWSz <= SrcBWSz) {
16130 if (BWSz < SrcBWSz)
16131 VecCost =
16132 TTIRef.getCastInstrCost(Opcode: Instruction::Trunc, Dst: VecTy, Src: SrcTy,
16133 CCH: TTI::CastContextHint::None, CostKind);
16134 LLVM_DEBUG({
16135 dbgs()
16136 << "SLP: alternate extension, which should be truncated.\n";
16137 E->dump();
16138 });
16139 return VecCost;
16140 }
16141 }
16142 VecCost = TTIRef.getCastInstrCost(Opcode: E->getOpcode(), Dst: VecTy, Src: SrcTy,
16143 CCH: TTI::CastContextHint::None, CostKind);
16144 VecCost +=
16145 TTIRef.getCastInstrCost(Opcode: E->getAltOpcode(), Dst: VecTy, Src: SrcTy,
16146 CCH: TTI::CastContextHint::None, CostKind);
16147 }
16148 SmallVector<int> Mask;
16149 E->buildAltOpShuffleMask(
16150 IsAltOp: [&](Instruction *I) {
16151 assert(E->getMatchingMainOpOrAltOp(I) &&
16152 "Unexpected main/alternate opcode");
16153 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
16154 TLI: *TLI);
16155 },
16156 Mask);
16157 VecCost += ::getShuffleCost(TTI: TTIRef, Kind: TargetTransformInfo::SK_PermuteTwoSrc,
16158 Tp: FinalVecTy, Mask, CostKind);
16159 // Patterns like [fadd,fsub] can be combined into a single instruction
16160 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
16161 // need to take into account their order when looking for the most used
16162 // order.
16163 unsigned Opcode0 = E->getOpcode();
16164 unsigned Opcode1 = E->getAltOpcode();
16165 SmallBitVector OpcodeMask(
16166 getAltInstrMask(VL: E->Scalars, ScalarTy, Opcode0, Opcode1));
16167 // If this pattern is supported by the target then we consider the
16168 // order.
16169 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
16170 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
16171 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
16172 return AltVecCost < VecCost ? AltVecCost : VecCost;
16173 }
16174 // TODO: Check the reverse order too.
16175 return VecCost;
16176 };
16177 if (SLPReVec && !E->isAltShuffle())
16178 return GetCostDiff(
16179 GetScalarCost, [&](InstructionCost) -> InstructionCost {
16180 // If a group uses mask in order, the shufflevector can be
16181 // eliminated by instcombine. Then the cost is 0.
16182 assert(isa<ShuffleVectorInst>(VL.front()) &&
16183 "Not supported shufflevector usage.");
16184 auto *SV = cast<ShuffleVectorInst>(Val: VL.front());
16185 unsigned SVNumElements =
16186 cast<FixedVectorType>(Val: SV->getOperand(i_nocapture: 0)->getType())
16187 ->getNumElements();
16188 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
16189 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
16190 ArrayRef<Value *> Group = VL.slice(N: I, M: GroupSize);
16191 int NextIndex = 0;
16192 if (!all_of(Range&: Group, P: [&](Value *V) {
16193 assert(isa<ShuffleVectorInst>(V) &&
16194 "Not supported shufflevector usage.");
16195 auto *SV = cast<ShuffleVectorInst>(Val: V);
16196 int Index;
16197 [[maybe_unused]] bool IsExtractSubvectorMask =
16198 SV->isExtractSubvectorMask(Index);
16199 assert(IsExtractSubvectorMask &&
16200 "Not supported shufflevector usage.");
16201 if (NextIndex != Index)
16202 return false;
16203 NextIndex += SV->getShuffleMask().size();
16204 return true;
16205 }))
16206 return ::getShuffleCost(
16207 TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteSingleSrc, Tp: VecTy,
16208 Mask: calculateShufflevectorMask(VL: E->Scalars));
16209 }
16210 return TTI::TCC_Free;
16211 });
16212 return GetCostDiff(GetScalarCost, GetVectorCost);
16213 }
16214 case Instruction::Freeze:
16215 return CommonCost;
16216 default:
16217 llvm_unreachable("Unknown instruction");
16218 }
16219}
16220
/// Returns true if the whole (tiny, height 1 or 2) tree is profitable to
/// vectorize without further cost analysis. \p ForReduction relaxes the
/// check for trees that feed a horizontal reduction.
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
  LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
                    << VectorizableTree.size() << " is fully vectorizable .\n");

  // Returns true if \p TE is a gather node that is cheap to materialize:
  // no ephemeral values, and either all-constant/splat, small (< Limit),
  // a shuffle of extractelements from fixed vectors, a non-alternate load
  // bundle, or it at least contains loads.
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    SmallVector<int> Mask;
    return TE->isGather() &&
           !any_of(Range: TE->Scalars,
                   P: [this](Value *V) { return EphValues.contains(Ptr: V); }) &&
           (allConstant(VL: TE->Scalars) || isSplat(VL: TE->Scalars) ||
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(Range: TE->Scalars, P: IsaPred<ExtractElementInst, UndefValue>)) &&
             isFixedVectorShuffle(VL: TE->Scalars, Mask, AC)) ||
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(Range: TE->Scalars, P: IsaPred<LoadInst>));
  };

  // We only handle trees of heights 1 and 2.
  // Height 1: a single vectorized node (or, for reductions, a cheap gather
  // with vector factor > 2).
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       VectorizableTree[0]->State == TreeEntry::CompressVectorize ||
       (ForReduction &&
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
    return true;

  if (VectorizableTree.size() != 2)
    return false;

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have less scalar operands rather than
  // the initial tree element (may be profitable to shuffle the second gather)
  // or they are extractelements, which form shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
    return true;

  // Gathering cost would be too much for tiny trees.
  // A gathered root, or a gathered second node under a plain (non-scatter,
  // non-strided, non-compress) vectorized root, kills profitability.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize &&
       VectorizableTree[0]->State != TreeEntry::CompressVectorize))
    return false;

  return true;
}
16274
/// Returns true if the built tree is too small / unprofitable to vectorize.
/// This is a sequence of early-out heuristics; each one recognizes a known
/// unprofitable tree shape and bails out before the full cost model runs.
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
  // Debug counter allows bisecting which graphs get vectorized.
  if (!DebugCounter::shouldExecute(Counter&: VectorizedGraphs))
    return true;

  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

    return true;
  }

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VL: VectorizableTree[1]->Scalars) ||
         allConstant(VL: VectorizableTree[1]->Scalars))))
    return true;

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold is
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      !VectorizableTree.empty() &&
      all_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
    return true;

  // Do not vectorize small tree of phis only, if all vector phis are also
  // gathered.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() <= Limit &&
      all_of(Range: VectorizableTree,
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               return (TE->isGather() &&
                       (!TE->hasState() ||
                        TE->getOpcode() != Instruction::ExtractElement) &&
                       count_if(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>) <=
                           Limit) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::InsertElement ||
                        (TE->getOpcode() == Instruction::PHI &&
                         all_of(Range&: TE->Scalars, P: [&](Value *V) {
                           return isa<PoisonValue>(Val: V) || MustGather.contains(Ptr: V);
                         }))));
             }) &&
      any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->State == TreeEntry::Vectorize &&
               TE->getOpcode() == Instruction::PHI;
      }))
    return true;

  // If the tree contains only phis, buildvectors, split nodes and
  // small nodes with reuses, we can skip it.
  // Load/store nodes are collected separately: they may still justify
  // vectorization unless heavily outnumbered by the rest of the tree.
  SmallVector<const TreeEntry *> StoreLoadNodes;
  unsigned NumGathers = 0;
  constexpr int LimitTreeSize = 36;
  if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
      all_of(Range: VectorizableTree,
             P: [&](const std::unique_ptr<TreeEntry> &TE) {
               if (!TE->isGather() && TE->hasState() &&
                   (TE->getOpcode() == Instruction::Load ||
                    TE->getOpcode() == Instruction::Store)) {
                 StoreLoadNodes.push_back(Elt: TE.get());
                 return true;
               }
               if (TE->isGather())
                 ++NumGathers;
               return TE->State == TreeEntry::SplitVectorize ||
                      (TE->Idx == 0 && TE->Scalars.size() == 2 &&
                       TE->hasState() && TE->getOpcode() == Instruction::ICmp &&
                       VectorizableTree.size() > LimitTreeSize) ||
                      (TE->isGather() &&
                       none_of(Range&: TE->Scalars, P: IsaPred<ExtractElementInst>)) ||
                      (TE->hasState() &&
                       (TE->getOpcode() == Instruction::PHI ||
                        (TE->hasCopyableElements() &&
                         static_cast<unsigned>(count_if(
                             Range&: TE->Scalars, P: IsaPred<PHINode, Constant>)) >=
                             TE->Scalars.size() / 2) ||
                        ((!TE->ReuseShuffleIndices.empty() ||
                          !TE->ReorderIndices.empty() || TE->isAltShuffle()) &&
                         TE->Scalars.size() == 2)));
             }) &&
      (StoreLoadNodes.empty() ||
       (VectorizableTree.size() > LimitTreeSize * StoreLoadNodes.size() &&
        (NumGathers > 0 || none_of(Range&: StoreLoadNodes, P: [&](const TreeEntry *TE) {
           return TE->getOpcode() == Instruction::Store ||
                  all_of(Range: TE->Scalars, P: [&](Value *V) {
                    return !isa<LoadInst>(Val: V) ||
                           areAllUsersVectorized(I: cast<Instruction>(Val: V));
                  });
         })))))
    return true;

  // If the tree contains only buildvector, 2 non-buildvectors (with root user
  // tree node) and other buildvectors, we can skip it.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
      VectorizableTree.size() >= Limit &&
      count_if(Range: ArrayRef(VectorizableTree).drop_front(),
               P: [&](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather() && TE->UserTreeIndex.UserTE &&
                        TE->UserTreeIndex.UserTE->Idx == 0;
               }) == 2)
    return true;

  // If the tree contains only vectorization of the phi node from the
  // buildvector - skip it.
  if (!ForReduction && SLPCostThreshold.getNumOccurrences() &&
      VectorizableTree.size() > 2 &&
      VectorizableTree.front()->State == TreeEntry::Vectorize &&
      VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
      VectorizableTree[1]->State == TreeEntry::Vectorize &&
      VectorizableTree[1]->getOpcode() == Instruction::PHI &&
      all_of(
          Range: ArrayRef(VectorizableTree).drop_front(N: 2),
          P: [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); }))
    return true;

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.
  if (VectorizableTree.size() >= MinTreeSize)
    return false;

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))
    return false;

  // Check if any of the gather node forms an insertelement buildvector
  // somewhere.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
       allSameBlock(VL: VectorizableTree.front()->Scalars));
  if (any_of(Range: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(Range&: TE->Scalars, P: [&](Value *V) {
                 return isa<ExtractElementInst, Constant>(Val: V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(N: UsesLimit) &&
                         any_of(Range: V->users(), P: IsaPred<InsertElementInst>));
               });
      }))
    return false;

  // A trailing alternate-opcode gather may still be worth keeping if the
  // scalarization (insertion) overhead alone already beats the threshold.
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      allSameBlock(VL: VectorizableTree.back()->Scalars) &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      TTI->getScalarizationOverhead(
          Ty: getWidenedType(ScalarTy: VectorizableTree.back()->Scalars.front()->getType(),
                          VF: VectorizableTree.back()->getVectorFactor()),
          DemandedElts: APInt::getAllOnes(numBits: VectorizableTree.back()->getVectorFactor()),
          /*Insert=*/true, /*Extract=*/false,
          CostKind: TTI::TCK_RecipThroughput) > -SLPCostThreshold)
    return false;

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  // vectorizable.
  return true;
}
16450
16451bool BoUpSLP::isTreeNotExtendable() const {
16452 if (getCanonicalGraphSize() != getTreeSize()) {
16453 constexpr unsigned SmallTree = 3;
16454 if (VectorizableTree.front()->isNonPowOf2Vec() &&
16455 getCanonicalGraphSize() <= SmallTree &&
16456 count_if(Range: ArrayRef(VectorizableTree).drop_front(N: getCanonicalGraphSize()),
16457 P: [](const std::unique_ptr<TreeEntry> &TE) {
16458 return TE->isGather() && TE->hasState() &&
16459 TE->getOpcode() == Instruction::Load &&
16460 !allSameBlock(VL: TE->Scalars);
16461 }) == 1)
16462 return true;
16463 return false;
16464 }
16465 bool Res = false;
16466 for (unsigned Idx : seq<unsigned>(Size: getTreeSize())) {
16467 TreeEntry &E = *VectorizableTree[Idx];
16468 if (E.State == TreeEntry::SplitVectorize)
16469 return false;
16470 if (!E.isGather())
16471 continue;
16472 if ((E.hasState() && E.getOpcode() != Instruction::Load) ||
16473 (!E.hasState() &&
16474 all_of(Range&: E.Scalars, P: IsaPred<ExtractElementInst, LoadInst>)) ||
16475 (isa<ExtractElementInst>(Val: E.Scalars.front()) &&
16476 getSameOpcode(VL: ArrayRef(E.Scalars).drop_front(), TLI: *TLI).valid()))
16477 return false;
16478 if (isSplat(VL: E.Scalars) || allConstant(VL: E.Scalars))
16479 continue;
16480 Res = true;
16481 }
16482 return Res;
16483}
16484
/// Estimates the register-spill cost implied by keeping vector values live
/// across non-vectorized calls between tree entries and their operands.
InstructionCost BoUpSLP::getSpillCost() {
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).

  const TreeEntry *Root = VectorizableTree.front().get();
  if (Root->isGather())
    return 0;

  InstructionCost Cost = 0;
  // Maps a user entry to the list of its operand entries.
  SmallDenseMap<const TreeEntry *, SmallVector<const TreeEntry *>>
      EntriesToOperands;
  // Maps each vectorized entry to the last instruction in its bundle.
  SmallDenseMap<const TreeEntry *, Instruction *> EntriesToLastInstruction;
  // The set of all such last instructions, for fast membership tests.
  SmallPtrSet<const Instruction *, 8> LastInstructions;
  // Combined-op entries that stay scalar / are pseudo nodes; they carry no
  // vector live value of their own and are skipped during cost accounting.
  SmallPtrSet<const TreeEntry *, 8> ScalarOrPseudoEntries;
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->CombinedOp == TreeEntry::ReducedBitcast ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastLoads ||
        TEPtr->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
        TEPtr->CombinedOp == TreeEntry::ReducedCmpBitcast) {
      ScalarOrPseudoEntries.insert(Ptr: TEPtr.get());
      continue;
    }
    if (!TEPtr->isGather()) {
      Instruction *LastInst = &getLastInstructionInBundle(E: TEPtr.get());
      EntriesToLastInstruction.try_emplace(Key: TEPtr.get(), Args&: LastInst);
      LastInstructions.insert(Ptr: LastInst);
    }
    if (TEPtr->UserTreeIndex)
      EntriesToOperands[TEPtr->UserTreeIndex.UserTE].push_back(Elt: TEPtr.get());
  }

  // Returns true if \p I is an intrinsic that will be lowered inline (no
  // actual call), so it cannot force a spill. Assume-like intrinsics never
  // lower to calls; otherwise compare intrinsic cost vs. call cost.
  auto NoCallIntrinsic = [this](const Instruction *I) {
    const auto *II = dyn_cast<IntrinsicInst>(Val: I);
    if (!II)
      return false;
    if (II->isAssumeLikeIntrinsic())
      return true;
    IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II);
    InstructionCost IntrCost =
        TTI->getIntrinsicInstrCost(ICA, CostKind: TTI::TCK_RecipThroughput);
    InstructionCost CallCost = TTI->getCallInstrCost(
        F: nullptr, RetTy: II->getType(), Tys: ICA.getArgTypes(), CostKind: TTI::TCK_RecipThroughput);
    return IntrCost < CallCost;
  };

  // Maps last instruction in the entry to the last instruction for the one of
  // operand entries and the flag. If the flag is true, there are no calls in
  // between these instructions.
  SmallDenseMap<const Instruction *, PointerIntPair<const Instruction *, 1>>
      CheckedInstructions;
  // Shared scan budget: bounds the total number of instructions inspected so
  // the analysis stays linear-ish even for huge blocks.
  unsigned Budget = 0;
  const unsigned BudgetLimit =
      ScheduleRegionSizeBudget / VectorizableTree.size();
  // Scans backwards from \p Last (exclusive of \p First) within one basic
  // block; returns true if no non-vectorized call was found within budget.
  // Results are memoized in CheckedInstructions.
  auto CheckForNonVecCallsInSameBlock = [&](Instruction *First,
                                            const Instruction *Last) {
    assert(First->getParent() == Last->getParent() &&
           "Expected instructions in same block.");
    if (auto It = CheckedInstructions.find(Val: Last);
        It != CheckedInstructions.end()) {
      const Instruction *Checked = It->second.getPointer();
      if (Checked == First || Checked->comesBefore(Other: First))
        return It->second.getInt() != 0;
      // Only the tail [Checked, Last] is known; keep scanning from Checked.
      Last = Checked;
    } else if (Last == First || Last->comesBefore(Other: First)) {
      // Empty range - trivially call-free.
      return true;
    }
    BasicBlock::const_reverse_iterator InstIt =
                                           ++First->getIterator().getReverse(),
                                       PrevInstIt =
                                           Last->getIterator().getReverse();
    // Bundle-end instructions encountered while scanning; they get memoized
    // once the scan result is known.
    SmallVector<const Instruction *> LastInstsInRange;
    while (InstIt != PrevInstIt && Budget <= BudgetLimit) {
      // Debug information does not impact spill cost.
      // Vectorized calls, represented as vector intrinsics, do not impact spill
      // cost.
      if (const auto *CB = dyn_cast<CallBase>(Val: &*PrevInstIt);
          CB && !NoCallIntrinsic(CB) && !isVectorized(V: CB)) {
        for (const Instruction *LastInst : LastInstsInRange)
          CheckedInstructions.try_emplace(Key: LastInst, Args: &*PrevInstIt, Args: 0);
        return false;
      }
      if (LastInstructions.contains(Ptr: &*PrevInstIt))
        LastInstsInRange.push_back(Elt: &*PrevInstIt);

      ++PrevInstIt;
      ++Budget;
    }
    for (const Instruction *LastInst : LastInstsInRange)
      CheckedInstructions.try_emplace(
          Key: LastInst, Args: PrevInstIt == InstIt ? First : &*PrevInstIt,
          Args: Budget <= BudgetLimit ? 1 : 0);
    // Running out of budget conservatively counts as "call found".
    return Budget <= BudgetLimit;
  };
  // Charges the spill/fill cost for keeping \p Op's vector value live across
  // a call. For revec (vector-of-vectors) scalars, subtract the cost the
  // now-dead scalar vectors would have paid anyway.
  auto AddCosts = [&](const TreeEntry *Op) {
    if (ScalarOrPseudoEntries.contains(Ptr: Op))
      return;
    Type *ScalarTy = Op->Scalars.front()->getType();
    auto It = MinBWs.find(Val: Op);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
    auto *VecTy = getWidenedType(ScalarTy, VF: Op->getVectorFactor());
    Cost += TTI->getCostOfKeepingLiveOverCall(Tys: VecTy);
    if (ScalarTy->isVectorTy()) {
      // Handle revec dead vector instructions.
      Cost -= Op->Scalars.size() * TTI->getCostOfKeepingLiveOverCall(Tys: ScalarTy);
    }
  };
  // Memoize the relationship between blocks, i.e. if there is (at least one)
  // non-vectorized call between the blocks. This allows to skip the analysis of
  // the same block paths multiple times.
  SmallDenseMap<std::pair<const BasicBlock *, const BasicBlock *>, bool>
      ParentOpParentToPreds;
  // Walks predecessors from \p Root (or from the specific \p Pred for PHIs)
  // up to \p OpParent; returns true if every path is free of non-vectorized
  // calls (within budget).
  auto CheckPredecessors = [&](BasicBlock *Root, BasicBlock *Pred,
                               BasicBlock *OpParent) {
    auto Key = std::make_pair(x&: Root, y&: OpParent);
    if (auto It = ParentOpParentToPreds.find(Val: Key);
        It != ParentOpParentToPreds.end())
      return It->second;
    SmallVector<BasicBlock *> Worklist;
    if (Pred)
      Worklist.push_back(Elt: Pred);
    else
      Worklist.append(in_start: pred_begin(BB: Root), in_end: pred_end(BB: Root));
    SmallPtrSet<const BasicBlock *, 16> Visited;
    SmallDenseSet<std::pair<const BasicBlock *, const BasicBlock *>>
        ParentsPairsToAdd;
    bool Res = false;
    // On every exit path, record the final result for all block pairs seen
    // during this walk.
    llvm::scope_exit Cleanup([&]() {
      for (const auto &KeyPair : ParentsPairsToAdd) {
        assert(!ParentOpParentToPreds.contains(KeyPair) &&
               "Should not have been added before.");
        ParentOpParentToPreds.try_emplace(Key: KeyPair, Args&: Res);
      }
    });
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      if (BB == OpParent || !Visited.insert(Ptr: BB).second)
        continue;
      auto Pair = std::make_pair(x&: BB, y&: OpParent);
      if (auto It = ParentOpParentToPreds.find(Val: Pair);
          It != ParentOpParentToPreds.end()) {
        Res = It->second;
        return Res;
      }
      ParentsPairsToAdd.insert(V: Pair);
      unsigned BlockSize = BB->size();
      // Huge blocks (or exhausted budget) conservatively fail the walk.
      if (BlockSize > static_cast<unsigned>(ScheduleRegionSizeBudget))
        return Res;
      Budget += BlockSize;
      if (Budget > BudgetLimit)
        return Res;
      if (!isa<CatchSwitchInst>(Val: BB->getTerminator()) &&
          !CheckForNonVecCallsInSameBlock(&*BB->getFirstNonPHIOrDbgOrAlloca(),
                                          BB->getTerminator()))
        return Res;
      Worklist.append(in_start: pred_begin(BB), in_end: pred_end(BB));
    }
    Res = true;
    return Res;
  };
  SmallVector<const TreeEntry *> LiveEntries(1, Root);
  // For a scalar/pseudo entry, climbs the user chain to the nearest real
  // vectorized ancestor (or nullptr if the whole chain is scalar/pseudo).
  auto FindNonScalarParentEntry = [&](const TreeEntry *E) -> const TreeEntry * {
    assert(ScalarOrPseudoEntries.contains(E) &&
           "Expected scalar or pseudo entry.");
    const TreeEntry *Entry = E;
    while (Entry->UserTreeIndex) {
      Entry = Entry->UserTreeIndex.UserTE;
      if (!ScalarOrPseudoEntries.contains(Ptr: Entry))
        return Entry;
    }
    return nullptr;
  };
  // Top-down worklist walk over user->operand edges; charge each operand that
  // may be live across a non-vectorized call on any path to its user.
  while (!LiveEntries.empty()) {
    const TreeEntry *Entry = LiveEntries.pop_back_val();
    SmallVector<const TreeEntry *> Operands = EntriesToOperands.lookup(Val: Entry);
    if (Operands.empty())
      continue;
    if (ScalarOrPseudoEntries.contains(Ptr: Entry)) {
      Entry = FindNonScalarParentEntry(Entry);
      if (!Entry) {
        // No vectorized ancestor - just keep traversing the operands.
        for (const TreeEntry *Op : Operands) {
          if (!Op->isGather())
            LiveEntries.push_back(Elt: Op);
        }
        continue;
      }
    }
    Instruction *LastInst = EntriesToLastInstruction.at(Val: Entry);
    BasicBlock *Parent = LastInst->getParent();
    for (const TreeEntry *Op : Operands) {
      if (!Op->isGather())
        LiveEntries.push_back(Elt: Op);
      if (ScalarOrPseudoEntries.contains(Ptr: Op))
        continue;
      // Split nodes, gathers under non-PHI users and all-constant gathers
      // never carry a live vector value across calls.
      if (Entry->State == TreeEntry::SplitVectorize ||
          (Entry->getOpcode() != Instruction::PHI && Op->isGather()) ||
          (Op->isGather() && allConstant(VL: Op->Scalars)))
        continue;
      // Reset the scan budget per user/operand pair.
      Budget = 0;
      BasicBlock *Pred = nullptr;
      if (auto *Phi = dyn_cast<PHINode>(Val: Entry->getMainOp()))
        Pred = Phi->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
      BasicBlock *OpParent;
      Instruction *OpLastInst;
      if (Op->isGather()) {
        assert(Entry->getOpcode() == Instruction::PHI &&
               "Expected phi node only.");
        // For a gathered PHI operand, anchor at the incoming block's
        // terminator, or at the first vectorized scalar if one exists.
        OpParent = cast<PHINode>(Val: Entry->getMainOp())
                       ->getIncomingBlock(i: Op->UserTreeIndex.EdgeIdx);
        OpLastInst = OpParent->getTerminator();
        for (Value *V : Op->Scalars) {
          auto *Inst = dyn_cast<Instruction>(Val: V);
          if (!Inst)
            continue;
          if (isVectorized(V)) {
            OpParent = Inst->getParent();
            OpLastInst = Inst;
            break;
          }
        }
      } else {
        OpLastInst = EntriesToLastInstruction.at(Val: Op);
        OpParent = OpLastInst->getParent();
      }
      // Check the call instructions within the same basic blocks.
      if (OpParent == Parent) {
        if (Entry->getOpcode() == Instruction::PHI) {
          // PHI users: the value is live from LastInst to the operand.
          if (!CheckForNonVecCallsInSameBlock(LastInst, OpLastInst))
            AddCosts(Op);
          continue;
        }
        if (!CheckForNonVecCallsInSameBlock(OpLastInst, LastInst))
          AddCosts(Op);
        continue;
      }
      // Check for call instruction in between blocks.
      // 1. Check entry's block to the head.
      if (Entry->getOpcode() != Instruction::PHI &&
          !CheckForNonVecCallsInSameBlock(
              &*Parent->getFirstNonPHIOrDbgOrAlloca(), LastInst)) {
        AddCosts(Op);
        continue;
      }
      // 2. Check op's block from the end.
      if (!CheckForNonVecCallsInSameBlock(OpLastInst,
                                          OpParent->getTerminator())) {
        AddCosts(Op);
        continue;
      }
      // 3. Check the predecessors of entry's block till op's block.
      if (!CheckPredecessors(Parent, Pred, OpParent)) {
        AddCosts(Op);
        continue;
      }
    }
  }

  return Cost;
}
16747
/// Checks if the \p IE1 instruction is followed by \p IE2 instruction in the
/// buildvector sequence, i.e. returns true if \p IE1 comes first. Both
/// instructions must belong to the same insertelement chain; walks the two
/// chains towards the chain base simultaneously until one meets the other's
/// starting point.
static bool isFirstInsertElement(const InsertElementInst *IE1,
                                 const InsertElementInst *IE2) {
  if (IE1 == IE2)
    return false;
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  const InsertElementInst *PrevI1;
  const InsertElementInst *PrevI2;
  unsigned Idx1 = *getElementIndex(Inst: IE1);
  unsigned Idx2 = *getElementIndex(Inst: IE2);
  do {
    // IE2's chain reached IE1: IE1 precedes IE2 in the buildvector.
    if (I2 == IE1)
      return true;
    // IE1's chain reached IE2: IE2 precedes IE1.
    if (I1 == IE2)
      return false;
    PrevI1 = I1;
    PrevI2 = I2;
    // Step I1 down its chain, but stop on a multi-use insert (chain head of
    // another buildvector) or on an insert into the other chain's index.
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
        getElementIndex(Inst: I1).value_or(u&: Idx2) != Idx2)
      I1 = dyn_cast<InsertElementInst>(Val: I1->getOperand(i_nocapture: 0));
    // Same stepping rule for I2, symmetrically.
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
        getElementIndex(Inst: I2).value_or(u&: Idx1) != Idx1)
      I2 = dyn_cast<InsertElementInst>(Val: I2->getOperand(i_nocapture: 0));
    // Continue while at least one chain still made progress.
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
  llvm_unreachable("Two different buildvectors not expected.");
}
16776
16777namespace {
16778/// Returns incoming Value *, if the requested type is Value * too, or a default
16779/// value, otherwise.
16780struct ValueSelect {
16781 template <typename U>
16782 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
16783 return V;
16784 }
16785 template <typename U>
16786 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
16787 return U();
16788 }
16789};
16790} // namespace
16791
/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps.
/// 1. If the Base vector is not undef vector, resizing the very first mask to
/// have common VF and perform action for 2 input vectors (including non-undef
/// Base). Other shuffle masks are combined with the resulting after the 1 stage
/// and processed as a shuffle of 2 elements.
/// 2. If the Base is undef vector and have only 1 shuffle mask, perform the
/// action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combing the masks properly between the steps.
///
/// \param ShuffleMask list of (vector, mask) pairs to combine.
/// \param Base the initial vector the shuffled elements are inserted into;
///        may be (partially) undef/poison.
/// \param GetVF returns the vector factor of the given vector.
/// \param ResizeAction resizes the given vector to the common VF; the bool in
///        the result reports that the accompanying mask is an identity mask.
/// \param Action performs the actual 1- or 2-source shuffle.
/// \returns the result of the last performed action.
template <typename T>
static T *performExtractsShuffleAction(
    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
    function_ref<unsigned(T *)> GetVF,
    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  SmallVector<int> Mask(ShuffleMask.begin()->second);
  auto VMIt = std::next(ShuffleMask.begin());
  T *Prev = nullptr;
  SmallBitVector UseMask =
      buildUseMask(VF: Mask.size(), Mask, MaskArg: UseMask::UndefsAsMask);
  SmallBitVector IsBaseUndef = isUndefVector(V: Base, UseMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    SmallBitVector IsBasePoison = isUndefVector<true>(V: Base, UseMask);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      if (Mask[Idx] == PoisonMaskElem)
        // Lane comes from Base (the first shuffle source); keep it poison if
        // the corresponding Base lane is poison.
        Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
      else
        // Lane comes from the (possibly identity-resized) first vector, which
        // is the second shuffle source, hence the +VF bias.
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    // nullptr as the first operand stands for the Base vector itself.
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only for
    // single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    if (Res.second)
      // Identity mask is found.
      Prev = Res.first;
    else
      Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining shuffle between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size, we
      // can shuffle them directly.
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (SecMask[I] != PoisonMaskElem) {
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          // Bias second-vector lanes by the first vector's VF.
          Mask[I] = SecMask[I] + Vec1VF;
        }
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      ArrayRef<int> SecMask = VMIt->second;
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        if (Mask[I] != PoisonMaskElem) {
          // Lane comes from the first (resized) vector; rewrite to identity
          // index if the resize already produced an identity mask.
          assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          if (Res1.second)
            Mask[I] = I;
        } else if (SecMask[I] != PoisonMaskElem) {
          // Lane comes from the second (resized) vector - bias by VF.
          assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
        }
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    ArrayRef<int> SecMask = VMIt->second;
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      if (SecMask[I] != PoisonMaskElem) {
        assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
      } else if (Mask[I] != PoisonMaskElem) {
        // Keep the lane already produced by the previous combining step.
        Mask[I] = I;
      }
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
  return Prev;
}
16899
/// Computes the total vectorization cost of the current tree and then
/// repeatedly trims subtrees whose aggregated cost exceeds the cost of simply
/// gathering (re-building) their scalars: trimmed nodes are recorded in
/// TransformedToGatherNodes / DeletedNodes. Returns the final (possibly
/// reduced) tree cost, or an invalid cost when the trimmed tree degenerates
/// into a plain buildvector (to avoid endless re-vectorization attempts).
InstructionCost BoUpSLP::calculateTreeCostAndTrimNonProfitable(
    ArrayRef<Value *> VectorizedVals) {
  // Per-node vectorization costs, filled during the initial cost walk.
  SmallDenseMap<const TreeEntry *, InstructionCost> NodesCosts;
  SmallPtrSet<Value *, 4> CheckedExtracts;
  // Root nodes of subtrees built for the final gathered-loads vectorization
  // attempt (nodes with Idx > 0 and no user).
  SmallSetVector<TreeEntry *, 4> GatheredLoadsNodes;
  // Estimated cost of extracting externally used scalars, per node.
  SmallDenseMap<const TreeEntry *, InstructionCost> ExtractCosts;
  LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
                    << VectorizableTree.size() << ".\n");
  // Returns true if scalar V of vector node TE is used outside of the tree,
  // i.e. an extractelement would be required for it.
  auto IsExternallyUsed = [&](const TreeEntry &TE, Value *V) {
    assert(TE.hasState() && !TE.isGather() &&
           TE.State != TreeEntry::SplitVectorize && "Expected vector node.");
    if (V->hasOneUse() || V->getType()->isVoidTy())
      return false;
    if (TE.hasCopyableElements() && TE.isCopyableElement(V))
      return false;
    // More users than there can be in-tree scalars - definitely external.
    const size_t NumVectScalars = ScalarToTreeEntries.size() + 1;
    if (V->hasNUsesOrMore(N: NumVectScalars))
      return true;
    auto *I = dyn_cast<Instruction>(Val: V);
    // Check if any user is used outside of the tree.
    return I && any_of(Range: I->users(), P: [&](const User *U) {
             // store/insertelt v, [cast]U will likely be vectorized.
             if (match(V: U, P: m_InsertElt(Val: m_Value(),
                                        Elt: m_OneUse(SubPattern: m_CastOrSelf(Op: m_Specific(V: I))),
                                        Idx: m_ConstantInt())))
               return false;
             if (match(V: U,
                       P: m_InsertElt(Val: m_Value(), Elt: m_Specific(V: I), Idx: m_ConstantInt())))
               return false;
             if (match(V: U, P: m_Store(ValueOp: m_OneUse(SubPattern: m_CastOrSelf(Op: m_Specific(V: I))),
                                   PointerOp: m_Value())))
               return false;
             if (match(V: U, P: m_Store(ValueOp: m_Specific(V: I), PointerOp: m_Value())))
               return false;
             // A user outside of any tree entry and not gathered - external.
             ArrayRef<TreeEntry *> Entries = getTreeEntries(V: U);
             if (Entries.empty() && !MustGather.contains(Ptr: U))
               return true;
             // Users in deleted (trimmed) nodes count as external too.
             if (any_of(Range&: Entries, P: [&](TreeEntry *TE) {
                   return DeletedNodes.contains(Ptr: TE);
                 }))
               return true;
             return any_of(Range: ValueToGatherNodes.lookup(Val: U),
                           P: [&](const TreeEntry *TE) {
                             return DeletedNodes.contains(Ptr: TE);
                           });
           });
  };
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  // Step 1: accumulate the cost of every node and estimate extract costs for
  // externally used scalars.
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      LLVM_DEBUG(
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      NodesCosts.try_emplace(Key: &TE);
      continue;
    }
    if (TE.hasState() &&
        (TE.isGather() || TE.State == TreeEntry::SplitVectorize)) {
      if (const TreeEntry *E =
              getSameValuesTreeEntry(V: TE.getMainOp(), VL: TE.Scalars);
          E && E->getVectorFactor() == TE.getVectorFactor()) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
                          << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                          << "SLP: Current total cost = " << Cost << "\n");
        NodesCosts.try_emplace(Key: &TE);
        continue;
      }
    }

    // Exclude cost of gather loads nodes which are not used. These nodes were
    // built as part of the final attempt to vectorize gathered loads.
    assert((!TE.isGather() || TE.Idx == 0 || TE.UserTreeIndex) &&
           "Expected gather nodes with users only.");

    InstructionCost C = getEntryCost(E: &TE, VectorizedVals, CheckedExtracts);
    Cost += C;
    NodesCosts.try_emplace(Key: &TE, Args&: C);
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
                      << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
    // Add gathered loads nodes to the set for later processing.
    if (TE.Idx > 0 && !TE.UserTreeIndex && TE.hasState() &&
        TE.getOpcode() == Instruction::Load)
      GatheredLoadsNodes.insert(X: &TE);
    if (!TE.isGather() && TE.State != TreeEntry::SplitVectorize &&
        !(TE.Idx == 0 && (TE.getOpcode() == Instruction::InsertElement ||
                          TE.getOpcode() == Instruction::Store))) {
      // Calculate costs of external uses.
      APInt DemandedElts = APInt::getZero(numBits: TE.getVectorFactor());
      for (Value *V : TE.Scalars) {
        if (IsExternallyUsed(TE, V))
          DemandedElts.setBit(TE.findLaneForValue(V));
      }
      if (!DemandedElts.isZero()) {
        // Cost of extracting the externally used lanes from the vector;
        // account MinBWs-narrowed element type if the node was demoted.
        Type *ScalarTy = TE.Scalars.front()->getType();
        auto It = MinBWs.find(Val: &TE);
        if (It != MinBWs.end())
          ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
        auto *VecTy = getWidenedType(ScalarTy, VF: TE.getVectorFactor());
        InstructionCost ExtCost = ::getScalarizationOverhead(
            TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts, /*Insert=*/false,
            /*Extract=*/true, CostKind);
        ExtractCosts.try_emplace(Key: &TE, Args&: ExtCost);
      }
    }
  }
  // Bail out if the cost threshold is negative and cost already below it.
  if (SLPCostThreshold.getNumOccurrences() > 0 && SLPCostThreshold < 0 &&
      Cost < -SLPCostThreshold)
    return Cost;
  // The narrow non-profitable tree in loop? Skip, may cause regressions.
  constexpr unsigned PartLimit = 2;
  const unsigned Sz =
      getVectorElementSize(V: VectorizableTree.front()->Scalars.front());
  const unsigned MinVF = getMinVF(Sz);
  if (Cost >= -SLPCostThreshold &&
      VectorizableTree.front()->Scalars.size() * PartLimit <= MinVF &&
      (!VectorizableTree.front()->hasState() ||
       (VectorizableTree.front()->getOpcode() != Instruction::Store &&
        LI->getLoopFor(BB: VectorizableTree.front()->getMainOp()->getParent()))))
    return Cost;
  // Store the cost + external uses estimation as the first element of the
  // tuple, just the cost as the second element of the tuple. Required to return
  // correct cost estimation for the tree, extracts are calculated separately.
  // Extracts, calculated here, are just quick estimations.
  SmallVector<
      std::tuple<InstructionCost, InstructionCost, SmallVector<unsigned>>>
      SubtreeCosts(VectorizableTree.size());
  // Propagates node TE's costs up the chain of its users so that each entry
  // of SubtreeCosts holds the aggregated cost of its whole subtree; if
  // AddToList is set, TE is also recorded as a child of every user.
  auto UpdateParentNodes =
      [&](const TreeEntry *UserTE, const TreeEntry *TE,
          InstructionCost TotalCost, InstructionCost Cost,
          SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
              &VisitedUser,
          bool AddToList = true) {
        while (UserTE &&
               VisitedUser.insert(V: std::make_pair(x&: TE, y&: UserTE)).second) {
          std::get<0>(t&: SubtreeCosts[UserTE->Idx]) += TotalCost;
          std::get<1>(t&: SubtreeCosts[UserTE->Idx]) += Cost;
          if (AddToList)
            std::get<2>(t&: SubtreeCosts[UserTE->Idx]).push_back(Elt: TE->Idx);
          UserTE = UserTE->UserTreeIndex.UserTE;
        }
      };
  // Step 2: compute aggregated subtree costs for every node.
  for (const std::unique_ptr<TreeEntry> &Ptr : VectorizableTree) {
    TreeEntry &TE = *Ptr;
    InstructionCost C = NodesCosts.at(Val: &TE);
    InstructionCost ExtractCost = ExtractCosts.lookup(Val: &TE);
    std::get<0>(t&: SubtreeCosts[TE.Idx]) += C + ExtractCost;
    std::get<1>(t&: SubtreeCosts[TE.Idx]) += C;
    if (const TreeEntry *UserTE = TE.UserTreeIndex.UserTE) {
      SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4>
          VisitedUser;
      UpdateParentNodes(UserTE, &TE, C + ExtractCost, C, VisitedUser);
    }
  }
  SmallDenseSet<std::pair<const TreeEntry *, const TreeEntry *>, 4> Visited;
  // Attribute gathered-loads subtree costs to the buildvector nodes that
  // consume their scalars (without listing them as children).
  for (TreeEntry *TE : GatheredLoadsNodes) {
    InstructionCost TotalCost = std::get<0>(t&: SubtreeCosts[TE->Idx]);
    InstructionCost Cost = std::get<1>(t&: SubtreeCosts[TE->Idx]);
    for (Value *V : TE->Scalars) {
      for (const TreeEntry *BVTE : ValueToGatherNodes.lookup(Val: V))
        UpdateParentNodes(BVTE, TE, TotalCost, Cost, Visited,
                          /*AddToList=*/false);
    }
  }
  Visited.clear();
  using CostIndicesTy =
      std::pair<TreeEntry *, std::tuple<InstructionCost, InstructionCost,
                                        SmallVector<unsigned>>>;
  // Orders the worklist so the most expensive subtree is on top; ties are
  // broken towards the node with the larger index.
  struct FirstGreater {
    bool operator()(const CostIndicesTy &LHS, const CostIndicesTy &RHS) const {
      return std::get<0>(t: LHS.second) < std::get<0>(t: RHS.second) ||
             (std::get<0>(t: LHS.second) == std::get<0>(t: RHS.second) &&
              LHS.first->Idx < RHS.first->Idx);
    }
  };
  PriorityQueue<CostIndicesTy, SmallVector<CostIndicesTy>, FirstGreater>
      Worklist;
  for (const auto [Idx, P] : enumerate(First&: SubtreeCosts))
    Worklist.emplace(args: VectorizableTree[Idx].get(), args&: P);

  // Narrow store trees with non-profitable immediate values - exit.
  if (!UserIgnoreList && VectorizableTree.front()->getVectorFactor() < MinVF &&
      VectorizableTree.front()->hasState() &&
      VectorizableTree.front()->getOpcode() == Instruction::Store &&
      (Worklist.top().first->Idx == 0 || Worklist.top().first->Idx == 1))
    return Cost;

  bool Changed = false;
  // Step 3: greedily inspect subtrees from the most expensive one downwards
  // and turn them into gathers when that is cheaper than vectorizing them.
  while (!Worklist.empty() && std::get<0>(t: Worklist.top().second) > 0) {
    TreeEntry *TE = Worklist.top().first;
    if (TE->isGather() || TE->Idx == 0 || DeletedNodes.contains(Ptr: TE) ||
        // Exit early if the parent node is split node and any of scalars is
        // used in other split nodes.
        (TE->UserTreeIndex &&
         TE->UserTreeIndex.UserTE->State == TreeEntry::SplitVectorize &&
         any_of(Range&: TE->Scalars, P: [&](Value *V) {
           ArrayRef<TreeEntry *> Entries = getSplitTreeEntries(V);
           return Entries.size() > 1;
         }))) {
      Worklist.pop();
      continue;
    }
    // Skip inversed compare nodes, they cannot be transformed to buildvectors.
    if (TE->State == TreeEntry::Vectorize && !TE->isAltShuffle() &&
        (TE->getOpcode() == Instruction::ICmp ||
         TE->getOpcode() == Instruction::FCmp) &&
        any_of(Range&: TE->Scalars, P: [&](Value *V) {
          auto *I = dyn_cast<CmpInst>(Val: V);
          if (!I)
            return false;
          return I->getPredicate() !=
                 cast<CmpInst>(Val: TE->getMainOp())->getPredicate();
        })) {
      Worklist.pop();
      continue;
    }

    // Calculate the gather cost of the root node.
    InstructionCost TotalSubtreeCost = std::get<0>(t: Worklist.top().second);
    InstructionCost SubtreeCost = std::get<1>(t: Worklist.top().second);
    if (TotalSubtreeCost < TE->Scalars.size()) {
      Worklist.pop();
      continue;
    }
    // Re-adjust the subtree cost for children that were already turned into
    // gathers on previous iterations.
    if (!TransformedToGatherNodes.empty()) {
      for (unsigned Idx : std::get<2>(t: Worklist.top().second)) {
        auto It = TransformedToGatherNodes.find(Val: VectorizableTree[Idx].get());
        if (It != TransformedToGatherNodes.end()) {
          TotalSubtreeCost -= std::get<0>(t&: SubtreeCosts[Idx]);
          SubtreeCost -= std::get<1>(t&: SubtreeCosts[Idx]);
          TotalSubtreeCost += It->second;
          SubtreeCost += It->second;
        }
      }
    }
    if (TotalSubtreeCost < 0 || TotalSubtreeCost < TE->Scalars.size()) {
      Worklist.pop();
      continue;
    }
    const unsigned Sz = TE->Scalars.size();
    // Only non-constant lanes need an actual insert in the buildvector.
    APInt DemandedElts = APInt::getAllOnes(numBits: Sz);
    for (auto [Idx, V] : enumerate(First&: TE->Scalars)) {
      if (isConstant(V))
        DemandedElts.clearBit(BitPosition: Idx);
    }

    Type *ScalarTy = getValueType(V: TE->Scalars.front());
    auto It = MinBWs.find(Val: TE);
    if (It != MinBWs.end())
      ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
    // Compares keep their original (i1) result type regardless of MinBWs.
    if (isa<CmpInst>(Val: TE->Scalars.front()))
      ScalarTy = TE->Scalars.front()->getType();
    auto *VecTy = getWidenedType(ScalarTy, VF: Sz);
    const unsigned EntryVF = TE->getVectorFactor();
    auto *FinalVecTy = getWidenedType(ScalarTy, VF: EntryVF);
    InstructionCost GatherCost = ::getScalarizationOverhead(
        TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
        /*Insert=*/true, /*Extract=*/false, CostKind);
    SmallVector<int> Mask;
    // Account a reorder/reuse shuffle that would be needed on top of the
    // plain buildvector to match the node's lane order.
    if (!TE->ReorderIndices.empty() &&
        TE->State != TreeEntry::CompressVectorize &&
        (TE->State != TreeEntry::StridedVectorize ||
         !isReverseOrder(Order: TE->ReorderIndices))) {
      SmallVector<int> NewMask;
      if (TE->getOpcode() == Instruction::Store) {
        // For stores the order is actually a mask.
        NewMask.resize(N: TE->ReorderIndices.size());
        copy(Range&: TE->ReorderIndices, Out: NewMask.begin());
      } else {
        inversePermutation(Indices: TE->ReorderIndices, Mask&: NewMask);
      }
      ::addMask(Mask, SubMask: NewMask);
    }
    if (!TE->ReuseShuffleIndices.empty())
      ::addMask(Mask, SubMask: TE->ReuseShuffleIndices);
    if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: EntryVF))
      GatherCost +=
          ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FinalVecTy, Mask);
    // If all scalars are reused in gather node(s) or other vector nodes, there
    // might be extra cost for inserting them.
    if ((!TE->hasState() || !TE->isAltShuffle()) &&
        all_of(Range&: TE->Scalars, P: [&](Value *V) {
          return (TE->hasCopyableElements() && TE->isCopyableElement(V)) ||
                 isConstant(V) || isGathered(V) || getTreeEntries(V).size() > 1;
        }))
      GatherCost *= 2;
    // Erase subtree if it is non-profitable.
    if (TotalSubtreeCost > GatherCost) {
      // If the remaining tree is just a buildvector - exit, it will cause
      // endless attempts to vectorize.
      if (VectorizableTree.front()->hasState() &&
          VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
          TE->Idx == 1)
        return InstructionCost::getInvalid();

      LLVM_DEBUG(dbgs() << "SLP: Trimming unprofitable subtree at node "
                        << TE->Idx << " with cost "
                        << std::get<0>(Worklist.top().second)
                        << " and gather cost " << GatherCost << ".\n");
      if (TE->UserTreeIndex) {
        // The node has a user - turn it into a gather (buildvector) node.
        TransformedToGatherNodes.try_emplace(Key: TE, Args&: GatherCost);
        NodesCosts.erase(Val: TE);
      } else {
        // No user - the node (and, below, its children) is deleted outright.
        DeletedNodes.insert(Ptr: TE);
        TransformedToGatherNodes.erase(Val: TE);
        NodesCosts.erase(Val: TE);
      }
      // Every child of the trimmed subtree is deleted.
      for (unsigned Idx : std::get<2>(t: Worklist.top().second)) {
        TreeEntry &ChildTE = *VectorizableTree[Idx];
        DeletedNodes.insert(Ptr: &ChildTE);
        TransformedToGatherNodes.erase(Val: &ChildTE);
        NodesCosts.erase(Val: &ChildTE);
      }
      Changed = true;
    }
    Worklist.pop();
  }
  if (!Changed)
    return std::get<1>(t&: SubtreeCosts.front());

  SmallPtrSet<TreeEntry *, 4> GatheredLoadsToDelete;
  InstructionCost LoadsExtractsCost = 0;
  // Check if all loads of gathered loads nodes are marked for deletion. In this
  // case the whole gathered loads subtree must be deleted.
  // Also, try to account for extracts, which might be required, if only part of
  // gathered load must be vectorized. Keep partially vectorized nodes, if
  // extracts are cheaper than gathers.
  for (TreeEntry *TE : GatheredLoadsNodes) {
    if (DeletedNodes.contains(Ptr: TE) || TransformedToGatherNodes.contains(Val: TE))
      continue;
    GatheredLoadsToDelete.insert(Ptr: TE);
    APInt DemandedElts = APInt::getZero(numBits: TE->getVectorFactor());
    // All loads are removed from gathered? Need to delete the subtree.
    SmallDenseMap<const TreeEntry *, SmallVector<Value *>> ValuesToInsert;
    for (Value *V : TE->Scalars) {
      unsigned Pos = TE->findLaneForValue(V);
      for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
        if (DeletedNodes.contains(Ptr: BVE))
          continue;
        DemandedElts.setBit(Pos);
        ValuesToInsert.try_emplace(Key: BVE).first->second.push_back(Elt: V);
      }
    }
    if (!DemandedElts.isZero()) {
      // Some loads are still consumed by live buildvector nodes - compare the
      // cost of extracting them from the vector load against re-inserting the
      // scalars into those buildvectors.
      Type *ScalarTy = TE->Scalars.front()->getType();
      auto It = MinBWs.find(Val: TE);
      if (It != MinBWs.end())
        ScalarTy = IntegerType::get(C&: ScalarTy->getContext(), NumBits: It->second.first);
      auto *VecTy = getWidenedType(ScalarTy, VF: TE->getVectorFactor());
      InstructionCost ExtractsCost = ::getScalarizationOverhead(
          TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts,
          /*Insert=*/false, /*Extract=*/true, CostKind);
      InstructionCost BVCost = 0;
      for (const auto &[BVE, Values] : ValuesToInsert) {
        APInt BVDemandedElts = APInt::getZero(numBits: BVE->getVectorFactor());
        SmallVector<Value *> BVValues(BVE->getVectorFactor(),
                                      PoisonValue::get(T: ScalarTy));
        for (Value *V : Values) {
          unsigned Pos = BVE->findLaneForValue(V);
          BVValues[Pos] = V;
          BVDemandedElts.setBit(Pos);
        }
        auto *BVVecTy = getWidenedType(ScalarTy, VF: BVE->getVectorFactor());
        BVCost += ::getScalarizationOverhead(
            TTI: *TTI, ScalarTy, Ty: BVVecTy, DemandedElts: BVDemandedElts,
            /*Insert=*/true, /*Extract=*/false, CostKind,
            ForPoisonSrc: BVDemandedElts.isAllOnes(), VL: BVValues);
      }
      if (ExtractsCost < BVCost) {
        // Extracts are cheaper - keep the vectorized loads node alive.
        LoadsExtractsCost += ExtractsCost;
        GatheredLoadsToDelete.erase(Ptr: TE);
        continue;
      }
      LoadsExtractsCost += BVCost;
    }
    NodesCosts.erase(Val: TE);
  }

  // Deleted all subtrees rooted at gathered loads nodes.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (TE->UserTreeIndex &&
        GatheredLoadsToDelete.contains(Ptr: TE->UserTreeIndex.UserTE)) {
      DeletedNodes.insert(Ptr: TE.get());
      NodesCosts.erase(Val: TE.get());
      GatheredLoadsToDelete.insert(Ptr: TE.get());
    }
    if (GatheredLoadsToDelete.contains(Ptr: TE.get()))
      DeletedNodes.insert(Ptr: TE.get());
  }

  // Recompute costs for surviving nodes whose cached cost was dropped during
  // trimming (their context changed, e.g. children became gathers).
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (!TE->UserTreeIndex && TransformedToGatherNodes.contains(Val: TE.get())) {
      assert(TE->getOpcode() == Instruction::Load && "Expected load only.");
      continue;
    }
    if (DeletedNodes.contains(Ptr: TE.get()))
      continue;
    if (!NodesCosts.contains(Val: TE.get())) {
      InstructionCost C =
          getEntryCost(E: TE.get(), VectorizedVals, CheckedExtracts);
      NodesCosts.try_emplace(Key: TE.get(), Args&: C);
    }
  }

  LLVM_DEBUG(dbgs() << "SLP: Recalculate costs after tree trimming.\n");
  InstructionCost NewCost = 0;
  for (const auto &P : NodesCosts) {
    NewCost += P.second;
    // NOTE(review): the debug line prints the pre-trim total Cost, not the
    // accumulating NewCost - possibly intentional as a baseline; confirm.
    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << P.second << " for bundle "
                      << shortBundleName(P.first->Scalars, P.first->Idx)
                      << ".\n"
                      << "SLP: Current total cost = " << Cost << "\n");
  }
  if (NewCost + LoadsExtractsCost >= Cost) {
    // Trimming did not pay off - restore the full tree and its cost.
    DeletedNodes.clear();
    TransformedToGatherNodes.clear();
    NewCost = Cost;
  } else {
    // If the remaining tree is just a buildvector - exit, it will cause
    // endless attempts to vectorize.
    if (VectorizableTree.size()>= 2 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
        TransformedToGatherNodes.contains(Val: VectorizableTree[1].get()))
      return InstructionCost::getInvalid();
    if (VectorizableTree.size() >= 3 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::InsertElement &&
        VectorizableTree[1]->hasState() &&
        VectorizableTree[1]->State == TreeEntry::Vectorize &&
        (VectorizableTree[1]->getOpcode() == Instruction::ZExt ||
         VectorizableTree[1]->getOpcode() == Instruction::SExt ||
         VectorizableTree[1]->getOpcode() == Instruction::Trunc) &&
        TransformedToGatherNodes.contains(Val: VectorizableTree[2].get()))
      return InstructionCost::getInvalid();
  }
  return NewCost;
}
17344
namespace {
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  SmallVector<InsertElementInst *> InsertElements;
  /// The parent vectors and shuffle mask for the given list of inserts.
  /// Each mask entry maps a destination lane to the source lane in the
  /// corresponding parent vector (key T).
  MapVector<T, SmallVector<int>> ValueMasks;
};
} // namespace
17355
17356InstructionCost BoUpSLP::getTreeCost(InstructionCost TreeCost,
17357 ArrayRef<Value *> VectorizedVals,
17358 InstructionCost ReductionCost) {
17359 InstructionCost Cost = TreeCost + ReductionCost;
17360
17361 if (Cost >= -SLPCostThreshold &&
17362 none_of(Range&: ExternalUses, P: [](const ExternalUser &EU) {
17363 return isa_and_nonnull<InsertElementInst>(Val: EU.User);
17364 }))
17365 return Cost;
17366
17367 SmallPtrSet<Value *, 16> ExtractCostCalculated;
17368 InstructionCost ExtractCost = 0;
17369 SmallVector<ShuffledInsertData<const TreeEntry *>> ShuffledInserts;
17370 SmallVector<APInt> DemandedElts;
17371 SmallDenseSet<Value *, 4> UsedInserts;
17372 DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
17373 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
17374 DenseMap<const TreeEntry *, DenseSet<Value *>> ExtractsCount;
17375 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
17376 // Keep track {Scalar, Index, User} tuple.
17377 // On AArch64, this helps in fusing a mov instruction, associated with
17378 // extractelement, with fmul in the backend so that extractelement is free.
17379 SmallVector<std::tuple<Value *, User *, int>, 4> ScalarUserAndIdx;
17380 for (ExternalUser &EU : ExternalUses) {
17381 ScalarUserAndIdx.emplace_back(Args&: EU.Scalar, Args&: EU.User, Args&: EU.Lane);
17382 }
17383 SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
17384 for (ExternalUser &EU : ExternalUses) {
17385 LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
17386 << EU.E.Idx << " in lane " << EU.Lane << "\n");
17387 LLVM_DEBUG(if (EU.User) dbgs() << " User:" << *EU.User << "\n";
17388 else dbgs() << " User: nullptr\n");
17389 LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
17390
17391 // Uses by ephemeral values are free (because the ephemeral value will be
17392 // removed prior to code generation, and so the extraction will be
17393 // removed as well).
17394 if (EphValues.count(Ptr: EU.User))
17395 continue;
17396
17397 // Check if the scalar for the given user or all users is accounted already.
17398 if (!CheckedScalarUser.insert(V: std::make_pair(x&: EU.Scalar, y&: EU.User)).second ||
17399 (EU.User &&
17400 CheckedScalarUser.contains(V: std::make_pair(x&: EU.Scalar, y: nullptr))))
17401 continue;
17402
17403 // Used in unreachable blocks or in EH pads (rarely executed) or is
17404 // terminated with unreachable instruction.
17405 if (BasicBlock *UserParent =
17406 EU.User ? cast<Instruction>(Val: EU.User)->getParent() : nullptr;
17407 UserParent &&
17408 (!DT->isReachableFromEntry(A: UserParent) || UserParent->isEHPad() ||
17409 isa_and_present<UnreachableInst>(Val: UserParent->getTerminator())))
17410 continue;
17411
17412 // We only add extract cost once for the same scalar.
17413 if (!isa_and_nonnull<InsertElementInst>(Val: EU.User) &&
17414 !ExtractCostCalculated.insert(Ptr: EU.Scalar).second)
17415 continue;
17416
17417 // No extract cost for vector "scalar" if REVEC is disabled
17418 if (!SLPReVec && isa<FixedVectorType>(Val: EU.Scalar->getType()))
17419 continue;
17420
17421 // If found user is an insertelement, do not calculate extract cost but try
17422 // to detect it as a final shuffled/identity match.
17423 // TODO: what if a user is insertvalue when REVEC is enabled?
17424 if (auto *VU = dyn_cast_or_null<InsertElementInst>(Val: EU.User);
17425 VU && VU->getOperand(i_nocapture: 1) == EU.Scalar) {
17426 if (auto *FTy = dyn_cast<FixedVectorType>(Val: VU->getType())) {
17427 if (!UsedInserts.insert(V: VU).second)
17428 continue;
17429 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
17430 if (InsertIdx) {
17431 const TreeEntry *ScalarTE = &EU.E;
17432 auto *It = find_if(
17433 Range&: ShuffledInserts,
17434 P: [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
17435 // Checks if 2 insertelements are from the same buildvector.
17436 InsertElementInst *VecInsert = Data.InsertElements.front();
17437 return areTwoInsertFromSameBuildVector(
17438 VU, V: VecInsert, GetBaseOperand: [this](InsertElementInst *II) -> Value * {
17439 Value *Op0 = II->getOperand(i_nocapture: 0);
17440 if (isVectorized(V: II) && !isVectorized(V: Op0))
17441 return nullptr;
17442 return Op0;
17443 });
17444 });
17445 int VecId = -1;
17446 if (It == ShuffledInserts.end()) {
17447 auto &Data = ShuffledInserts.emplace_back();
17448 Data.InsertElements.emplace_back(Args&: VU);
17449 DemandedElts.push_back(Elt: APInt::getZero(numBits: FTy->getNumElements()));
17450 VecId = ShuffledInserts.size() - 1;
17451 auto It = MinBWs.find(Val: ScalarTE);
17452 if (It != MinBWs.end() &&
17453 VectorCasts
17454 .insert(V: std::make_pair(x&: ScalarTE, y: FTy->getElementType()))
17455 .second) {
17456 unsigned BWSz = It->second.first;
17457 unsigned DstBWSz = DL->getTypeSizeInBits(Ty: FTy->getElementType());
17458 unsigned VecOpcode;
17459 if (DstBWSz < BWSz)
17460 VecOpcode = Instruction::Trunc;
17461 else
17462 VecOpcode =
17463 It->second.second ? Instruction::SExt : Instruction::ZExt;
17464 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17465 InstructionCost C = TTI->getCastInstrCost(
17466 Opcode: VecOpcode, Dst: FTy,
17467 Src: getWidenedType(ScalarTy: IntegerType::get(C&: FTy->getContext(), NumBits: BWSz),
17468 VF: FTy->getNumElements()),
17469 CCH: TTI::CastContextHint::None, CostKind);
17470 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17471 << " for extending externally used vector with "
17472 "non-equal minimum bitwidth.\n");
17473 Cost += C;
17474 }
17475 } else {
17476 if (isFirstInsertElement(IE1: VU, IE2: It->InsertElements.front()))
17477 It->InsertElements.front() = VU;
17478 VecId = std::distance(first: ShuffledInserts.begin(), last: It);
17479 }
17480 int InIdx = *InsertIdx;
17481 SmallVectorImpl<int> &Mask =
17482 ShuffledInserts[VecId].ValueMasks[ScalarTE];
17483 if (Mask.empty())
17484 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
17485 Mask[InIdx] = EU.Lane;
17486 DemandedElts[VecId].setBit(InIdx);
17487 continue;
17488 }
17489 }
17490 }
17491
17492 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17493 // If we plan to rewrite the tree in a smaller type, we will need to sign
17494 // extend the extracted value back to the original type. Here, we account
17495 // for the extract and the added cost of the sign extend if needed.
17496 InstructionCost ExtraCost = TTI::TCC_Free;
17497 auto *ScalarTy = EU.Scalar->getType();
17498 const unsigned BundleWidth = EU.E.getVectorFactor();
17499 assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
17500 auto *VecTy = getWidenedType(ScalarTy, VF: BundleWidth);
17501 const TreeEntry *Entry = &EU.E;
17502 auto It = MinBWs.find(Val: Entry);
17503 if (It != MinBWs.end()) {
17504 Type *MinTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
17505 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy))
17506 MinTy = getWidenedType(ScalarTy: MinTy, VF: VecTy->getNumElements());
17507 unsigned Extend = isKnownNonNegative(V: EU.Scalar, SQ: SimplifyQuery(*DL))
17508 ? Instruction::ZExt
17509 : Instruction::SExt;
17510 VecTy = getWidenedType(ScalarTy: MinTy, VF: BundleWidth);
17511 ExtraCost =
17512 getExtractWithExtendCost(TTI: *TTI, Opcode: Extend, Dst: ScalarTy, VecTy, Index: EU.Lane);
17513 LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
17514 << ExtraCost << "\n");
17515 } else {
17516 ExtraCost =
17517 getVectorInstrCost(TTI: *TTI, ScalarTy, Opcode: Instruction::ExtractElement, Val: VecTy,
17518 CostKind, Index: EU.Lane, Scalar: EU.Scalar, ScalarUserAndIdx);
17519 LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
17520 << *VecTy << ": " << ExtraCost << "\n");
17521 }
17522 // Leave the scalar instructions as is if they are cheaper than extracts.
17523 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
17524 Entry->getOpcode() == Instruction::Load) {
17525 // Checks if the user of the external scalar is phi in loop body.
17526 auto IsPhiInLoop = [&](const ExternalUser &U) {
17527 if (auto *Phi = dyn_cast_if_present<PHINode>(Val: U.User)) {
17528 auto *I = cast<Instruction>(Val: U.Scalar);
17529 const Loop *L = LI->getLoopFor(BB: Phi->getParent());
17530 return L && (Phi->getParent() == I->getParent() ||
17531 L == LI->getLoopFor(BB: I->getParent()));
17532 }
17533 return false;
17534 };
17535 if (!ValueToExtUses) {
17536 ValueToExtUses.emplace();
17537 for (const auto &P : enumerate(First&: ExternalUses)) {
17538 // Ignore phis in loops.
17539 if (IsPhiInLoop(P.value()))
17540 continue;
17541
17542 ValueToExtUses->try_emplace(Key: P.value().Scalar, Args: P.index());
17543 }
17544 }
17545 // Can use original instruction, if no operands vectorized or they are
17546 // marked as externally used already.
17547 auto *Inst = cast<Instruction>(Val: EU.Scalar);
17548 InstructionCost ScalarCost = TTI->getInstructionCost(U: Inst, CostKind);
17549 auto OperandIsScalar = [&](Value *V) {
17550 if (!isVectorized(V)) {
17551 // Some extractelements might be not vectorized, but
17552 // transformed into shuffle and removed from the function,
17553 // consider it here.
17554 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V))
17555 return !EE->hasOneUse() || !MustGather.contains(Ptr: EE);
17556 return true;
17557 }
17558 return ValueToExtUses->contains(Val: V);
17559 };
17560 bool CanBeUsedAsScalar = all_of(Range: Inst->operands(), P: OperandIsScalar);
17561 bool CanBeUsedAsScalarCast = false;
17562 if (auto *CI = dyn_cast<CastInst>(Val: Inst); CI && !CanBeUsedAsScalar) {
17563 if (auto *Op = dyn_cast<Instruction>(Val: CI->getOperand(i_nocapture: 0));
17564 Op && all_of(Range: Op->operands(), P: OperandIsScalar)) {
17565 InstructionCost OpCost =
17566 (isVectorized(V: Op) && !ValueToExtUses->contains(Val: Op))
17567 ? TTI->getInstructionCost(U: Op, CostKind)
17568 : 0;
17569 if (ScalarCost + OpCost <= ExtraCost) {
17570 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
17571 ScalarCost += OpCost;
17572 }
17573 }
17574 }
17575 if (CanBeUsedAsScalar) {
17576 bool KeepScalar = ScalarCost <= ExtraCost;
17577 // Try to keep original scalar if the user is the phi node from the same
17578 // block as the root phis, currently vectorized. It allows to keep
17579 // better ordering info of PHIs, being vectorized currently.
17580 bool IsProfitablePHIUser =
17581 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
17582 VectorizableTree.front()->Scalars.size() > 2)) &&
17583 VectorizableTree.front()->hasState() &&
17584 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
17585 !Inst->hasNUsesOrMore(N: UsesLimit) &&
17586 none_of(Range: Inst->users(),
17587 P: [&](User *U) {
17588 auto *PHIUser = dyn_cast<PHINode>(Val: U);
17589 return (!PHIUser ||
17590 PHIUser->getParent() !=
17591 cast<Instruction>(
17592 Val: VectorizableTree.front()->getMainOp())
17593 ->getParent()) &&
17594 !isVectorized(V: U);
17595 }) &&
17596 count_if(Range: Entry->Scalars, P: [&](Value *V) {
17597 return ValueToExtUses->contains(Val: V);
17598 }) <= 2;
17599 if (IsProfitablePHIUser) {
17600 KeepScalar = true;
17601 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
17602 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
17603 (!GatheredLoadsEntriesFirst.has_value() ||
17604 Entry->Idx < *GatheredLoadsEntriesFirst)) {
17605 unsigned ScalarUsesCount = count_if(Range: Entry->Scalars, P: [&](Value *V) {
17606 return ValueToExtUses->contains(Val: V);
17607 });
17608 auto It = ExtractsCount.find(Val: Entry);
17609 if (It != ExtractsCount.end()) {
17610 assert(ScalarUsesCount >= It->getSecond().size() &&
17611 "Expected total number of external uses not less than "
17612 "number of scalar uses.");
17613 ScalarUsesCount -= It->getSecond().size();
17614 }
17615 // Keep original scalar if number of externally used instructions in
17616 // the same entry is not power of 2. It may help to do some extra
17617 // vectorization for now.
17618 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(Value: ScalarUsesCount);
17619 }
17620 if (KeepScalar) {
17621 ExternalUsesAsOriginalScalar.insert(Ptr: EU.Scalar);
17622 for (Value *V : Inst->operands()) {
17623 auto It = ValueToExtUses->find(Val: V);
17624 if (It != ValueToExtUses->end()) {
17625 // Replace all uses to avoid compiler crash.
17626 ExternalUses[It->second].User = nullptr;
17627 }
17628 }
17629 ExtraCost = ScalarCost;
17630 if (!IsPhiInLoop(EU))
17631 ExtractsCount[Entry].insert(V: Inst);
17632 if (CanBeUsedAsScalarCast) {
17633 ScalarOpsFromCasts.insert(Ptr: Inst->getOperand(i: 0));
17634 // Update the users of the operands of the cast operand to avoid
17635 // compiler crash.
17636 if (auto *IOp = dyn_cast<Instruction>(Val: Inst->getOperand(i: 0))) {
17637 for (Value *V : IOp->operands()) {
17638 auto It = ValueToExtUses->find(Val: V);
17639 if (It != ValueToExtUses->end()) {
17640 // Replace all uses to avoid compiler crash.
17641 ExternalUses[It->second].User = nullptr;
17642 }
17643 }
17644 }
17645 }
17646 }
17647 }
17648 }
17649
17650 ExtractCost += ExtraCost;
17651 }
17652 // Insert externals for extract of operands of casts to be emitted as scalars
17653 // instead of extractelement.
17654 for (Value *V : ScalarOpsFromCasts) {
17655 ExternalUsesAsOriginalScalar.insert(Ptr: V);
17656 if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V); !TEs.empty()) {
17657 const auto *It = find_if_not(Range&: TEs, P: [&](TreeEntry *TE) {
17658 return TransformedToGatherNodes.contains(Val: TE) ||
17659 DeletedNodes.contains(Ptr: TE);
17660 });
17661 if (It != TEs.end()) {
17662 const TreeEntry *UserTE = *It;
17663 ExternalUses.emplace_back(Args&: V, Args: nullptr, Args: *UserTE,
17664 Args: UserTE->findLaneForValue(V));
17665 }
17666 }
17667 }
17668 // Add reduced value cost, if resized.
17669 if (!VectorizedVals.empty()) {
17670 const TreeEntry &Root = *VectorizableTree.front();
17671 auto BWIt = MinBWs.find(Val: &Root);
17672 if (BWIt != MinBWs.end()) {
17673 Type *DstTy = Root.Scalars.front()->getType();
17674 unsigned OriginalSz = DL->getTypeSizeInBits(Ty: DstTy->getScalarType());
17675 unsigned SrcSz =
17676 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
17677 if (OriginalSz != SrcSz) {
17678 unsigned Opcode = Instruction::Trunc;
17679 if (OriginalSz > SrcSz)
17680 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
17681 Type *SrcTy = IntegerType::get(C&: DstTy->getContext(), NumBits: SrcSz);
17682 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DstTy)) {
17683 assert(SLPReVec && "Only supported by REVEC.");
17684 SrcTy = getWidenedType(ScalarTy: SrcTy, VF: VecTy->getNumElements());
17685 }
17686 Cost += TTI->getCastInstrCost(Opcode, Dst: DstTy, Src: SrcTy,
17687 CCH: TTI::CastContextHint::None,
17688 CostKind: TTI::TCK_RecipThroughput);
17689 }
17690 }
17691 }
17692
17693 // Buildvector with externally used scalars, which should remain as scalars,
17694 // should not be vectorized, the compiler may hang.
17695 if (SLPCostThreshold < 0 && VectorizableTree.size() > 1 &&
17696 isa<InsertElementInst>(Val: VectorizableTree[0]->Scalars[0]) &&
17697 VectorizableTree[1]->hasState() &&
17698 VectorizableTree[1]->State == TreeEntry::Vectorize &&
17699 all_of(Range&: VectorizableTree[1]->Scalars, P: [&](Value *V) {
17700 return ExternalUsesAsOriginalScalar.contains(Ptr: V);
17701 }))
17702 return InstructionCost::getInvalid();
17703
17704 Cost += ExtractCost;
17705 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
17706 bool ForSingleMask) {
17707 InstructionCost C = 0;
17708 unsigned VF = Mask.size();
17709 unsigned VecVF = TE->getVectorFactor();
17710 bool HasLargeIndex =
17711 any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); });
17712 if ((VF != VecVF && HasLargeIndex) ||
17713 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF)) {
17714
17715 if (HasLargeIndex) {
17716 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
17717 std::copy(first: Mask.begin(), last: std::next(x: Mask.begin(), n: std::min(a: VF, b: VecVF)),
17718 result: OrigMask.begin());
17719 C = ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17720 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF),
17721 Mask: OrigMask);
17722 LLVM_DEBUG(
17723 dbgs() << "SLP: Adding cost " << C
17724 << " for final shuffle of insertelement external users.\n";
17725 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17726 Cost += C;
17727 return std::make_pair(x&: TE, y: true);
17728 }
17729
17730 if (!ForSingleMask) {
17731 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
17732 for (unsigned I = 0; I < VF; ++I) {
17733 if (Mask[I] != PoisonMaskElem)
17734 ResizeMask[Mask[I]] = Mask[I];
17735 }
17736 if (!ShuffleVectorInst::isIdentityMask(Mask: ResizeMask, NumSrcElts: VF))
17737 C = ::getShuffleCost(
17738 TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
17739 Tp: getWidenedType(ScalarTy: TE->getMainOp()->getType(), VF: VecVF), Mask: ResizeMask);
17740 LLVM_DEBUG(
17741 dbgs() << "SLP: Adding cost " << C
17742 << " for final shuffle of insertelement external users.\n";
17743 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
17744
17745 Cost += C;
17746 }
17747 }
17748 return std::make_pair(x&: TE, y: false);
17749 };
17750 // Calculate the cost of the reshuffled vectors, if any.
17751 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
17752 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(i_nocapture: 0);
17753 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
17754 unsigned VF = 0;
17755 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
17756 ArrayRef<const TreeEntry *> TEs) {
17757 assert((TEs.size() == 1 || TEs.size() == 2) &&
17758 "Expected exactly 1 or 2 tree entries.");
17759 if (TEs.size() == 1) {
17760 if (VF == 0)
17761 VF = TEs.front()->getVectorFactor();
17762 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
17763 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF) &&
17764 !all_of(Range: enumerate(First&: Mask), P: [=](const auto &Data) {
17765 return Data.value() == PoisonMaskElem ||
17766 (Data.index() < VF &&
17767 static_cast<int>(Data.index()) == Data.value());
17768 })) {
17769 InstructionCost C =
17770 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc, Tp: FTy, Mask);
17771 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17772 << " for final shuffle of insertelement "
17773 "external users.\n";
17774 TEs.front()->dump();
17775 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17776 Cost += C;
17777 }
17778 } else {
17779 if (VF == 0) {
17780 if (TEs.front() &&
17781 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
17782 VF = TEs.front()->getVectorFactor();
17783 else
17784 VF = Mask.size();
17785 }
17786 auto *FTy = getWidenedType(ScalarTy: TEs.back()->Scalars.front()->getType(), VF);
17787 InstructionCost C =
17788 ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: FTy, Mask);
17789 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
17790 << " for final shuffle of vector node and external "
17791 "insertelement users.\n";
17792 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
17793 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17794 Cost += C;
17795 }
17796 VF = Mask.size();
17797 return TEs.back();
17798 };
17799 (void)performExtractsShuffleAction<const TreeEntry>(
17800 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()), Base,
17801 GetVF: [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeAction: ResizeToVF,
17802 Action: EstimateShufflesCost);
17803 InstructionCost InsertCost = TTI->getScalarizationOverhead(
17804 Ty: cast<FixedVectorType>(
17805 Val: ShuffledInserts[I].InsertElements.front()->getType()),
17806 DemandedElts: DemandedElts[I],
17807 /*Insert*/ true, /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
17808 Cost -= InsertCost;
17809 }
17810
17811 // Add the cost for reduced value resize (if required).
17812 if (ReductionBitWidth != 0) {
17813 assert(UserIgnoreList && "Expected reduction tree.");
17814 const TreeEntry &E = *VectorizableTree.front();
17815 auto It = MinBWs.find(Val: &E);
17816 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
17817 unsigned SrcSize = It->second.first;
17818 unsigned DstSize = ReductionBitWidth;
17819 unsigned Opcode = Instruction::Trunc;
17820 if (SrcSize < DstSize) {
17821 bool IsArithmeticExtendedReduction =
17822 all_of(Range: *UserIgnoreList, P: [](Value *V) {
17823 auto *I = cast<Instruction>(Val: V);
17824 return is_contained(Set: {Instruction::Add, Instruction::FAdd,
17825 Instruction::Mul, Instruction::FMul,
17826 Instruction::And, Instruction::Or,
17827 Instruction::Xor},
17828 Element: I->getOpcode());
17829 });
17830 if (IsArithmeticExtendedReduction)
17831 Opcode =
17832 Instruction::BitCast; // Handle it by getExtendedReductionCost
17833 else
17834 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
17835 }
17836 if (Opcode != Instruction::BitCast) {
17837 auto *SrcVecTy =
17838 getWidenedType(ScalarTy: Builder.getIntNTy(N: SrcSize), VF: E.getVectorFactor());
17839 auto *DstVecTy =
17840 getWidenedType(ScalarTy: Builder.getIntNTy(N: DstSize), VF: E.getVectorFactor());
17841 TTI::CastContextHint CCH = getCastContextHint(TE: E);
17842 InstructionCost CastCost;
17843 switch (E.getOpcode()) {
17844 case Instruction::SExt:
17845 case Instruction::ZExt:
17846 case Instruction::Trunc: {
17847 const TreeEntry *OpTE = getOperandEntry(E: &E, Idx: 0);
17848 CCH = getCastContextHint(TE: *OpTE);
17849 break;
17850 }
17851 default:
17852 break;
17853 }
17854 CastCost += TTI->getCastInstrCost(Opcode, Dst: DstVecTy, Src: SrcVecTy, CCH,
17855 CostKind: TTI::TCK_RecipThroughput);
17856 Cost += CastCost;
17857 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
17858 << " for final resize for reduction from " << SrcVecTy
17859 << " to " << DstVecTy << "\n";
17860 dbgs() << "SLP: Current total cost = " << Cost << "\n");
17861 }
17862 }
17863 }
17864
17865 std::optional<InstructionCost> SpillCost;
17866 if (Cost < -SLPCostThreshold) {
17867 SpillCost = getSpillCost();
17868 Cost += *SpillCost;
17869 }
17870#ifndef NDEBUG
17871 SmallString<256> Str;
17872 {
17873 raw_svector_ostream OS(Str);
17874 OS << "SLP: Spill Cost = ";
17875 if (SpillCost)
17876 OS << *SpillCost;
17877 else
17878 OS << "<skipped>";
17879 OS << ".\nSLP: Extract Cost = " << ExtractCost << ".\n"
17880 << "SLP: Total Cost = " << Cost << ".\n";
17881 }
17882 LLVM_DEBUG(dbgs() << Str);
17883 if (ViewSLPTree)
17884 ViewGraph(this, "SLP" + F->getName(), false, Str);
17885#endif
17886
17887 return Cost;
17888}
17889
17890/// Tries to find extractelement instructions with constant indices from fixed
17891/// vector type and gather such instructions into a bunch, which highly likely
17892/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17893/// successful, the matched scalars are replaced by poison values in \p VL for
17894/// future analysis.
17895std::optional<TTI::ShuffleKind>
17896BoUpSLP::tryToGatherSingleRegisterExtractElements(
17897 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
17898 // Scan list of gathered scalars for extractelements that can be represented
17899 // as shuffles.
17900 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
17901 SmallVector<int> UndefVectorExtracts;
17902 for (int I = 0, E = VL.size(); I < E; ++I) {
17903 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
17904 if (!EI) {
17905 if (isa<UndefValue>(Val: VL[I]))
17906 UndefVectorExtracts.push_back(Elt: I);
17907 continue;
17908 }
17909 auto *VecTy = dyn_cast<FixedVectorType>(Val: EI->getVectorOperandType());
17910 if (!VecTy || !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()))
17911 continue;
17912 std::optional<unsigned> Idx = getExtractIndex(E: EI);
17913 // Undefined index.
17914 if (!Idx) {
17915 UndefVectorExtracts.push_back(Elt: I);
17916 continue;
17917 }
17918 if (Idx >= VecTy->getNumElements()) {
17919 UndefVectorExtracts.push_back(Elt: I);
17920 continue;
17921 }
17922 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
17923 ExtractMask.reset(Idx: *Idx);
17924 if (isUndefVector(V: EI->getVectorOperand(), UseMask: ExtractMask).all()) {
17925 UndefVectorExtracts.push_back(Elt: I);
17926 continue;
17927 }
17928 VectorOpToIdx[EI->getVectorOperand()].push_back(Elt: I);
17929 }
17930 // Sort the vector operands by the maximum number of uses in extractelements.
17931 SmallVector<std::pair<Value *, SmallVector<int>>> Vectors =
17932 VectorOpToIdx.takeVector();
17933 stable_sort(Range&: Vectors, C: [](const auto &P1, const auto &P2) {
17934 return P1.second.size() > P2.second.size();
17935 });
17936 // Find the best pair of the vectors or a single vector.
17937 const int UndefSz = UndefVectorExtracts.size();
17938 unsigned SingleMax = 0;
17939 unsigned PairMax = 0;
17940 if (!Vectors.empty()) {
17941 SingleMax = Vectors.front().second.size() + UndefSz;
17942 if (Vectors.size() > 1) {
17943 auto *ItNext = std::next(x: Vectors.begin());
17944 PairMax = SingleMax + ItNext->second.size();
17945 }
17946 }
17947 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
17948 return std::nullopt;
17949 // Check if better to perform a shuffle of 2 vectors or just of a single
17950 // vector.
17951 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
17952 SmallVector<Value *> GatheredExtracts(
17953 VL.size(), PoisonValue::get(T: VL.front()->getType()));
17954 if (SingleMax >= PairMax && SingleMax) {
17955 for (int Idx : Vectors.front().second)
17956 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17957 } else if (!Vectors.empty()) {
17958 for (unsigned Idx : {0, 1})
17959 for (int Idx : Vectors[Idx].second)
17960 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17961 }
17962 // Add extracts from undefs too.
17963 for (int Idx : UndefVectorExtracts)
17964 std::swap(a&: GatheredExtracts[Idx], b&: VL[Idx]);
17965 // Check that gather of extractelements can be represented as just a
17966 // shuffle of a single/two vectors the scalars are extracted from.
17967 std::optional<TTI::ShuffleKind> Res =
17968 isFixedVectorShuffle(VL: GatheredExtracts, Mask, AC);
17969 if (!Res || all_of(Range&: Mask, P: equal_to(Arg: PoisonMaskElem))) {
17970 // TODO: try to check other subsets if possible.
17971 // Restore the original VL if attempt was not successful.
17972 copy(Range&: SavedVL, Out: VL.begin());
17973 return std::nullopt;
17974 }
17975 // Restore unused scalars from mask, if some of the extractelements were not
17976 // selected for shuffle.
17977 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
17978 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(Val: GatheredExtracts[I]) &&
17979 isa<UndefValue>(Val: GatheredExtracts[I])) {
17980 std::swap(a&: VL[I], b&: GatheredExtracts[I]);
17981 continue;
17982 }
17983 auto *EI = dyn_cast<ExtractElementInst>(Val: VL[I]);
17984 if (!EI || !isa<FixedVectorType>(Val: EI->getVectorOperandType()) ||
17985 !isa<ConstantInt, UndefValue>(Val: EI->getIndexOperand()) ||
17986 is_contained(Range&: UndefVectorExtracts, Element: I))
17987 continue;
17988 }
17989 return Res;
17990}
17991
17992/// Tries to find extractelement instructions with constant indices from fixed
17993/// vector type and gather such instructions into a bunch, which highly likely
17994/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
17995/// successful, the matched scalars are replaced by poison values in \p VL for
17996/// future analysis.
17997SmallVector<std::optional<TTI::ShuffleKind>>
17998BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
17999 SmallVectorImpl<int> &Mask,
18000 unsigned NumParts) const {
18001 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
18002 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
18003 Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
18004 unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
18005 for (unsigned Part : seq<unsigned>(Size: NumParts)) {
18006 // Scan list of gathered scalars for extractelements that can be represented
18007 // as shuffles.
18008 const unsigned PartOffset = Part * SliceSize;
18009 const unsigned PartSize = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
18010 // It may happen in case of revec, need to check no access out of bounds.
18011 if (PartOffset + PartSize > VL.size())
18012 break;
18013 MutableArrayRef<Value *> SubVL =
18014 MutableArrayRef(VL).slice(N: PartOffset, M: PartSize);
18015 SmallVector<int> SubMask;
18016 std::optional<TTI::ShuffleKind> Res =
18017 tryToGatherSingleRegisterExtractElements(VL: SubVL, Mask&: SubMask);
18018 ShufflesRes[Part] = Res;
18019 copy(Range&: SubMask, Out: std::next(x: Mask.begin(), n: Part * SliceSize));
18020 }
18021 if (none_of(Range&: ShufflesRes, P: [](const std::optional<TTI::ShuffleKind> &Res) {
18022 return Res.has_value();
18023 }))
18024 ShufflesRes.clear();
18025 return ShufflesRes;
18026}
18027
18028std::optional<TargetTransformInfo::ShuffleKind>
18029BoUpSLP::isGatherShuffledSingleRegisterEntry(
18030 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
18031 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
18032 Entries.clear();
18033 if (TE->Idx == 0)
18034 return std::nullopt;
18035 // TODO: currently checking only for Scalars in the tree entry, need to count
18036 // reused elements too for better cost estimation.
18037 auto GetUserEntry = [&](const TreeEntry *TE) {
18038 while (TE->UserTreeIndex && TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18039 TE = TE->UserTreeIndex.UserTE;
18040 if (TE == VectorizableTree.front().get())
18041 return EdgeInfo(const_cast<TreeEntry *>(TE), 0);
18042 return TE->UserTreeIndex;
18043 };
18044 auto HasGatherUser = [&](const TreeEntry *TE) {
18045 while (TE->Idx != 0 && TE->UserTreeIndex) {
18046 if (TE->UserTreeIndex.EdgeIdx == UINT_MAX)
18047 return true;
18048 TE = TE->UserTreeIndex.UserTE;
18049 }
18050 return false;
18051 };
18052 const EdgeInfo TEUseEI = GetUserEntry(TE);
18053 if (!TEUseEI || (TEUseEI.UserTE->Idx == 0 && TEUseEI.UserTE->isGather() &&
18054 !TEUseEI.UserTE->hasState()))
18055 return std::nullopt;
18056 const Instruction *TEInsertPt = &getLastInstructionInBundle(E: TEUseEI.UserTE);
18057 const BasicBlock *TEInsertBlock = nullptr;
18058 // Main node of PHI entries keeps the correct order of operands/incoming
18059 // blocks.
18060 if (auto *PHI = dyn_cast_or_null<PHINode>(
18061 Val: TEUseEI.UserTE->hasState() ? TEUseEI.UserTE->getMainOp() : nullptr);
18062 PHI && TEUseEI.UserTE->State != TreeEntry::SplitVectorize) {
18063 TEInsertBlock = PHI->getIncomingBlock(i: TEUseEI.EdgeIdx);
18064 TEInsertPt = TEInsertBlock->getTerminator();
18065 } else {
18066 TEInsertBlock = TEInsertPt->getParent();
18067 }
18068 if (!DT->isReachableFromEntry(A: TEInsertBlock))
18069 return std::nullopt;
18070 auto *NodeUI = DT->getNode(BB: TEInsertBlock);
18071 assert(NodeUI && "Should only process reachable instructions");
18072 SmallPtrSet<Value *, 4> GatheredScalars(llvm::from_range, VL);
18073 auto CheckOrdering = [&](const Instruction *InsertPt) {
18074 // Argument InsertPt is an instruction where vector code for some other
18075 // tree entry (one that shares one or more scalars with TE) is going to be
18076 // generated. This lambda returns true if insertion point of vector code
18077 // for the TE dominates that point (otherwise dependency is the other way
18078 // around). The other node is not limited to be of a gather kind. Gather
18079 // nodes are not scheduled and their vector code is inserted before their
18080 // first user. If user is PHI, that is supposed to be at the end of a
18081 // predecessor block. Otherwise it is the last instruction among scalars of
18082 // the user node. So, instead of checking dependency between instructions
18083 // themselves, we check dependency between their insertion points for vector
18084 // code (since each scalar instruction ends up as a lane of a vector
18085 // instruction).
18086 const BasicBlock *InsertBlock = InsertPt->getParent();
18087 auto *NodeEUI = DT->getNode(BB: InsertBlock);
18088 if (!NodeEUI)
18089 return false;
18090 assert((NodeUI == NodeEUI) ==
18091 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
18092 "Different nodes should have different DFS numbers");
18093 // Check the order of the gather nodes users.
18094 if (TEInsertPt->getParent() != InsertBlock &&
18095 (DT->dominates(A: NodeUI, B: NodeEUI) || !DT->dominates(A: NodeEUI, B: NodeUI)))
18096 return false;
18097 if (TEInsertPt->getParent() == InsertBlock &&
18098 TEInsertPt->comesBefore(Other: InsertPt))
18099 return false;
18100 return true;
18101 };
18102 // Find all tree entries used by the gathered values. If no common entries
18103 // found - not a shuffle.
18104 // Here we build a set of tree nodes for each gathered value and trying to
18105 // find the intersection between these sets. If we have at least one common
18106 // tree node for each gathered value - we have just a permutation of the
18107 // single vector. If we have 2 different sets, we're in situation where we
18108 // have a permutation of 2 input vectors.
18109 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
18110 SmallDenseMap<Value *, int> UsedValuesEntry;
18111 SmallPtrSet<const Value *, 16> VisitedValue;
18112 bool IsReusedNodeFound = false;
18113 auto CheckAndUseSameNode = [&](const TreeEntry *TEPtr) {
18114 // The node is reused - exit.
18115 if (IsReusedNodeFound)
18116 return false;
18117 if ((TEPtr->getVectorFactor() != VL.size() &&
18118 TEPtr->Scalars.size() != VL.size()) ||
18119 (!TEPtr->isSame(VL) && !TEPtr->isSame(VL: TE->Scalars)))
18120 return false;
18121 IsReusedNodeFound =
18122 equal(LRange: TE->Scalars, RRange: TEPtr->Scalars) &&
18123 equal(LRange: TE->ReorderIndices, RRange: TEPtr->ReorderIndices) &&
18124 equal(LRange: TE->ReuseShuffleIndices, RRange: TEPtr->ReuseShuffleIndices);
18125 UsedTEs.clear();
18126 UsedTEs.emplace_back().insert(Ptr: TEPtr);
18127 for (Value *V : VL) {
18128 if (isConstant(V))
18129 continue;
18130 UsedValuesEntry.try_emplace(Key: V, Args: 0);
18131 }
18132 return true;
18133 };
18134 auto CheckParentNodes = [&](const TreeEntry *User1, const TreeEntry *User2,
18135 unsigned EdgeIdx) {
18136 const TreeEntry *Ptr1 = User1;
18137 const TreeEntry *Ptr2 = User2;
18138 SmallDenseMap<const TreeEntry *, unsigned> PtrToIdx;
18139 while (Ptr2) {
18140 PtrToIdx.try_emplace(Key: Ptr2, Args&: EdgeIdx);
18141 EdgeIdx = Ptr2->UserTreeIndex.EdgeIdx;
18142 Ptr2 = Ptr2->UserTreeIndex.UserTE;
18143 }
18144 while (Ptr1) {
18145 unsigned Idx = Ptr1->UserTreeIndex.EdgeIdx;
18146 Ptr1 = Ptr1->UserTreeIndex.UserTE;
18147 if (auto It = PtrToIdx.find(Val: Ptr1); It != PtrToIdx.end())
18148 return Idx < It->second;
18149 }
18150 return false;
18151 };
18152 auto CheckNonSchedulableOrdering = [&](const TreeEntry *E,
18153 Instruction *InsertPt) {
18154 return TEUseEI && TEUseEI.UserTE && TEUseEI.UserTE->hasCopyableElements() &&
18155 !TEUseEI.UserTE->isCopyableElement(
18156 V: const_cast<Instruction *>(TEInsertPt)) &&
18157 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
18158 InsertPt->getNextNode() == TEInsertPt &&
18159 (!E->hasCopyableElements() || !E->isCopyableElement(V: InsertPt) ||
18160 !isUsedOutsideBlock(V: InsertPt));
18161 };
18162 for (Value *V : VL) {
18163 if (isConstant(V) || !VisitedValue.insert(Ptr: V).second)
18164 continue;
18165 // Build a list of tree entries where V is used.
18166 SmallPtrSet<const TreeEntry *, 4> VToTEs;
18167 SmallVector<const TreeEntry *> GatherNodes(
18168 ValueToGatherNodes.lookup(Val: V).takeVector());
18169 if (TransformedToGatherNodes.contains(Val: TE)) {
18170 for (TreeEntry *E : getSplitTreeEntries(V)) {
18171 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
18172 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18173 continue;
18174 GatherNodes.push_back(Elt: E);
18175 }
18176 for (TreeEntry *E : getTreeEntries(V)) {
18177 if (TE == E || !TransformedToGatherNodes.contains(Val: E) ||
18178 !E->UserTreeIndex || E->UserTreeIndex.UserTE->isGather())
18179 continue;
18180 GatherNodes.push_back(Elt: E);
18181 }
18182 }
18183 for (const TreeEntry *TEPtr : GatherNodes) {
18184 if (TEPtr == TE || TEPtr->Idx == 0 || DeletedNodes.contains(Ptr: TEPtr))
18185 continue;
18186 assert(any_of(TEPtr->Scalars,
18187 [&](Value *V) { return GatheredScalars.contains(V); }) &&
18188 "Must contain at least single gathered value.");
18189 assert(TEPtr->UserTreeIndex &&
18190 "Expected only single user of a gather node.");
18191 const EdgeInfo &UseEI = TEPtr->UserTreeIndex;
18192
18193 PHINode *UserPHI = (UseEI.UserTE->State != TreeEntry::SplitVectorize &&
18194 UseEI.UserTE->hasState())
18195 ? dyn_cast<PHINode>(Val: UseEI.UserTE->getMainOp())
18196 : nullptr;
18197 Instruction *InsertPt =
18198 UserPHI ? UserPHI->getIncomingBlock(i: UseEI.EdgeIdx)->getTerminator()
18199 : &getLastInstructionInBundle(E: UseEI.UserTE);
18200 if (TEInsertPt == InsertPt) {
18201 // Check nodes, which might be emitted first.
18202 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18203 (TEUseEI.UserTE->getOpcode() != Instruction::PHI ||
18204 TEUseEI.UserTE->isAltShuffle()) &&
18205 all_of(Range&: TEUseEI.UserTE->Scalars, P: isUsedOutsideBlock)) {
18206 if (UseEI.UserTE->State != TreeEntry::Vectorize ||
18207 (UseEI.UserTE->hasState() &&
18208 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18209 !UseEI.UserTE->isAltShuffle()) ||
18210 !all_of(Range&: UseEI.UserTE->Scalars, P: isUsedOutsideBlock))
18211 continue;
18212 }
18213
18214 // If the schedulable insertion point is used in multiple entries - just
18215 // exit, no known ordering at this point, available only after real
18216 // scheduling.
18217 if (!doesNotNeedToBeScheduled(V: InsertPt) &&
18218 (TEUseEI.UserTE != UseEI.UserTE || TEUseEI.EdgeIdx < UseEI.EdgeIdx))
18219 continue;
18220 // If the users are the PHI nodes with the same incoming blocks - skip.
18221 if (TEUseEI.UserTE->State == TreeEntry::Vectorize &&
18222 TEUseEI.UserTE->getOpcode() == Instruction::PHI &&
18223 UseEI.UserTE->State == TreeEntry::Vectorize &&
18224 UseEI.UserTE->getOpcode() == Instruction::PHI &&
18225 TEUseEI.UserTE != UseEI.UserTE)
18226 continue;
18227 // If 2 gathers are operands of the same entry (regardless of whether
18228 // user is PHI or else), compare operands indices, use the earlier one
18229 // as the base.
18230 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
18231 continue;
18232 // If the user instruction is used for some reason in different
18233 // vectorized nodes - make it depend on index.
18234 if (TEUseEI.UserTE != UseEI.UserTE &&
18235 (TEUseEI.UserTE->Idx < UseEI.UserTE->Idx ||
18236 HasGatherUser(TEUseEI.UserTE)))
18237 continue;
18238 // If the user node is the operand of the other user node - skip.
18239 if (CheckParentNodes(TEUseEI.UserTE, UseEI.UserTE, UseEI.EdgeIdx))
18240 continue;
18241 }
18242
18243 if (!TEUseEI.UserTE->isGather() && !UserPHI &&
18244 TEUseEI.UserTE->doesNotNeedToSchedule() !=
18245 UseEI.UserTE->doesNotNeedToSchedule() &&
18246 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))
18247 continue;
18248 // Check if the user node of the TE comes after user node of TEPtr,
18249 // otherwise TEPtr depends on TE.
18250 if ((TEInsertBlock != InsertPt->getParent() ||
18251 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
18252 (!CheckOrdering(InsertPt) ||
18253 (UseEI.UserTE->hasCopyableElements() &&
18254 isUsedOutsideBlock(V: const_cast<Instruction *>(TEInsertPt)) &&
18255 is_contained(Range&: UseEI.UserTE->Scalars, Element: TEInsertPt))))
18256 continue;
18257 // The node is reused - exit.
18258 if (CheckAndUseSameNode(TEPtr))
18259 break;
18260 // The parent node is copyable with last inst used outside? And the last
18261 // inst is the next inst for the lastinst of TEPtr? Exit, if yes, to
18262 // preserve def-use chain.
18263 if (CheckNonSchedulableOrdering(UseEI.UserTE, InsertPt))
18264 continue;
18265 VToTEs.insert(Ptr: TEPtr);
18266 }
18267 if (ArrayRef<TreeEntry *> VTEs = getSplitTreeEntries(V); !VTEs.empty()) {
18268 const auto *It = find_if(Range&: VTEs, P: [&](const TreeEntry *MTE) {
18269 return MTE != TE && MTE != TEUseEI.UserTE &&
18270 !DeletedNodes.contains(Ptr: MTE) &&
18271 !TransformedToGatherNodes.contains(Val: MTE);
18272 });
18273 if (It != VTEs.end()) {
18274 const TreeEntry *VTE = *It;
18275 if (none_of(Range: TE->CombinedEntriesWithIndices,
18276 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
18277 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
18278 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
18279 continue;
18280 }
18281 // The node is reused - exit.
18282 if (CheckAndUseSameNode(VTE))
18283 break;
18284 VToTEs.insert(Ptr: VTE);
18285 }
18286 }
18287 if (ArrayRef<TreeEntry *> VTEs = getTreeEntries(V); !VTEs.empty()) {
18288 const auto *It = find_if(Range&: VTEs, P: [&, MainTE = TE](const TreeEntry *TE) {
18289 return TE != MainTE && !DeletedNodes.contains(Ptr: TE) &&
18290 !TransformedToGatherNodes.contains(Val: TE);
18291 });
18292 if (It != VTEs.end()) {
18293 const TreeEntry *VTE = *It;
18294 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(u: 0) &&
18295 VTEs.size() > 1 && VTE->State != TreeEntry::Vectorize) {
18296 VTEs = VTEs.drop_front();
18297 // Iterate through all vectorized nodes.
18298 const auto *MIt = find_if(Range&: VTEs, P: [](const TreeEntry *MTE) {
18299 return MTE->State == TreeEntry::Vectorize;
18300 });
18301 if (MIt == VTEs.end())
18302 continue;
18303 VTE = *MIt;
18304 }
18305 if (none_of(Range: TE->CombinedEntriesWithIndices,
18306 P: [&](const auto &P) { return P.first == VTE->Idx; })) {
18307 Instruction &LastBundleInst = getLastInstructionInBundle(E: VTE);
18308 if (&LastBundleInst == TEInsertPt ||
18309 !CheckOrdering(&LastBundleInst) ||
18310 CheckNonSchedulableOrdering(VTE, &LastBundleInst))
18311 continue;
18312 }
18313 // The node is reused - exit.
18314 if (CheckAndUseSameNode(VTE))
18315 break;
18316 VToTEs.insert(Ptr: VTE);
18317 }
18318 }
18319 if (IsReusedNodeFound)
18320 break;
18321 if (VToTEs.empty())
18322 continue;
18323 if (UsedTEs.empty()) {
18324 // The first iteration, just insert the list of nodes to vector.
18325 UsedTEs.push_back(Elt: VToTEs);
18326 UsedValuesEntry.try_emplace(Key: V, Args: 0);
18327 } else {
18328 // Need to check if there are any previously used tree nodes which use V.
18329 // If there are no such nodes, consider that we have another one input
18330 // vector.
18331 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
18332 unsigned Idx = 0;
18333 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
18334 // Do we have a non-empty intersection of previously listed tree entries
18335 // and tree entries using current V?
18336 set_intersect(S1&: VToTEs, S2: Set);
18337 if (!VToTEs.empty()) {
18338 // Yes, write the new subset and continue analysis for the next
18339 // scalar.
18340 Set.swap(RHS&: VToTEs);
18341 break;
18342 }
18343 VToTEs = SavedVToTEs;
18344 ++Idx;
18345 }
18346 // No non-empty intersection found - need to add a second set of possible
18347 // source vectors.
18348 if (Idx == UsedTEs.size()) {
18349 // If the number of input vectors is greater than 2 - not a permutation,
18350 // fallback to the regular gather.
18351 // TODO: support multiple reshuffled nodes.
18352 if (UsedTEs.size() == 2)
18353 continue;
18354 UsedTEs.push_back(Elt: SavedVToTEs);
18355 Idx = UsedTEs.size() - 1;
18356 }
18357 UsedValuesEntry.try_emplace(Key: V, Args&: Idx);
18358 }
18359 }
18360
18361 if (UsedTEs.empty()) {
18362 Entries.clear();
18363 return std::nullopt;
18364 }
18365
18366 unsigned VF = 0;
18367 if (UsedTEs.size() == 1) {
18368 // Keep the order to avoid non-determinism.
18369 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
18370 UsedTEs.front().end());
18371 sort(C&: FirstEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18372 return TE1->Idx < TE2->Idx;
18373 });
18374 // Try to find the perfect match in another gather node at first.
18375 auto *It = find_if(Range&: FirstEntries, P: [=](const TreeEntry *EntryPtr) {
18376 return EntryPtr->isSame(VL) || EntryPtr->isSame(VL: TE->Scalars);
18377 });
18378 if (It != FirstEntries.end() &&
18379 (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size() ||
18380 ((*It)->getVectorFactor() == TE->Scalars.size() &&
18381 TE->ReuseShuffleIndices.size() == VL.size() &&
18382 (*It)->isSame(VL: TE->Scalars)))) {
18383 Entries.push_back(Elt: *It);
18384 if (IsReusedNodeFound || (*It)->getVectorFactor() == VL.size()) {
18385 std::iota(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18386 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: 0);
18387 } else {
18388 SmallVector<int> CommonMask = TE->getCommonMask();
18389 copy(Range&: CommonMask, Out: Mask.begin());
18390 }
18391 // Clear undef scalars.
18392 for (unsigned I : seq<unsigned>(Size: VL.size()))
18393 if (isa<PoisonValue>(Val: VL[I]))
18394 Mask[Part * VL.size() + I] = PoisonMaskElem;
18395 return TargetTransformInfo::SK_PermuteSingleSrc;
18396 }
18397 // No perfect match, just shuffle, so choose the first tree node from the
18398 // tree.
18399 Entries.push_back(Elt: FirstEntries.front());
18400 // Update mapping between values and corresponding tree entries.
18401 for (auto &P : UsedValuesEntry)
18402 P.second = 0;
18403 VF = FirstEntries.front()->getVectorFactor();
18404 } else {
18405 // Try to find nodes with the same vector factor.
18406 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
18407 // Keep the order of tree nodes to avoid non-determinism.
18408 DenseMap<int, const TreeEntry *> VFToTE;
18409 for (const TreeEntry *TE : UsedTEs.front()) {
18410 unsigned VF = TE->getVectorFactor();
18411 auto It = VFToTE.find(Val: VF);
18412 if (It != VFToTE.end()) {
18413 if (It->second->Idx > TE->Idx)
18414 It->getSecond() = TE;
18415 continue;
18416 }
18417 VFToTE.try_emplace(Key: VF, Args&: TE);
18418 }
18419 // Same, keep the order to avoid non-determinism.
18420 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
18421 UsedTEs.back().end());
18422 sort(C&: SecondEntries, Comp: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18423 return TE1->Idx < TE2->Idx;
18424 });
18425 for (const TreeEntry *TE : SecondEntries) {
18426 auto It = VFToTE.find(Val: TE->getVectorFactor());
18427 if (It != VFToTE.end()) {
18428 VF = It->first;
18429 Entries.push_back(Elt: It->second);
18430 Entries.push_back(Elt: TE);
18431 break;
18432 }
18433 }
18434 // No 2 source vectors with the same vector factor - just choose 2 with max
18435 // index.
18436 if (Entries.empty()) {
18437 Entries.push_back(Elt: *llvm::max_element(
18438 Range&: UsedTEs.front(), C: [](const TreeEntry *TE1, const TreeEntry *TE2) {
18439 return TE1->Idx < TE2->Idx;
18440 }));
18441 Entries.push_back(Elt: SecondEntries.front());
18442 VF = std::max(a: Entries.front()->getVectorFactor(),
18443 b: Entries.back()->getVectorFactor());
18444 } else {
18445 VF = Entries.front()->getVectorFactor();
18446 }
18447 SmallVector<SmallPtrSet<Value *, 8>> ValuesToEntries;
18448 for (const TreeEntry *E : Entries)
18449 ValuesToEntries.emplace_back().insert(I: E->Scalars.begin(),
18450 E: E->Scalars.end());
18451 // Update mapping between values and corresponding tree entries.
18452 for (auto &P : UsedValuesEntry) {
18453 for (unsigned Idx : seq<unsigned>(Size: ValuesToEntries.size()))
18454 if (ValuesToEntries[Idx].contains(Ptr: P.first)) {
18455 P.second = Idx;
18456 break;
18457 }
18458 }
18459 }
18460
18461 bool IsSplatOrUndefs = isSplat(VL) || all_of(Range&: VL, P: IsaPred<UndefValue>);
18462 // Checks if the 2 PHIs are compatible in terms of high possibility to be
18463 // vectorized.
18464 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
18465 auto *PHI = cast<PHINode>(Val: V);
18466 auto *PHI1 = cast<PHINode>(Val: V1);
18467 // Check that all incoming values are compatible/from same parent (if they
18468 // are instructions).
18469 // The incoming values are compatible if they all are constants, or
18470 // instruction with the same/alternate opcodes from the same basic block.
18471 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
18472 Value *In = PHI->getIncomingValue(i: I);
18473 Value *In1 = PHI1->getIncomingValue(i: I);
18474 if (isConstant(V: In) && isConstant(V: In1))
18475 continue;
18476 if (!getSameOpcode(VL: {In, In1}, TLI: *TLI))
18477 return false;
18478 if (cast<Instruction>(Val: In)->getParent() !=
18479 cast<Instruction>(Val: In1)->getParent())
18480 return false;
18481 }
18482 return true;
18483 };
18484 // Check if the value can be ignored during analysis for shuffled gathers.
18485 // We suppose it is better to ignore instruction, which do not form splats,
18486 // are not vectorized/not extractelements (these instructions will be handled
18487 // by extractelements processing) or may form vector node in future.
18488 auto MightBeIgnored = [=](Value *V) {
18489 auto *I = dyn_cast<Instruction>(Val: V);
18490 return I && !IsSplatOrUndefs && !isVectorized(V: I) &&
18491 !isVectorLikeInstWithConstOps(V: I) &&
18492 !areAllUsersVectorized(I, VectorizedVals: UserIgnoreList) && isSimple(I);
18493 };
18494 // Check that the neighbor instruction may form a full vector node with the
18495 // current instruction V. It is possible, if they have same/alternate opcode
18496 // and same parent basic block.
18497 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
18498 Value *V1 = VL[Idx];
18499 bool UsedInSameVTE = false;
18500 auto It = UsedValuesEntry.find(Val: V1);
18501 if (It != UsedValuesEntry.end())
18502 UsedInSameVTE = It->second == UsedValuesEntry.find(Val: V)->second;
18503 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
18504 getSameOpcode(VL: {V, V1}, TLI: *TLI) &&
18505 cast<Instruction>(Val: V)->getParent() ==
18506 cast<Instruction>(Val: V1)->getParent() &&
18507 (!isa<PHINode>(Val: V1) || AreCompatiblePHIs(V, V1));
18508 };
18509 // Build a shuffle mask for better cost estimation and vector emission.
18510 SmallBitVector UsedIdxs(Entries.size());
18511 SmallVector<std::pair<unsigned, int>> EntryLanes;
18512 for (int I = 0, E = VL.size(); I < E; ++I) {
18513 Value *V = VL[I];
18514 auto It = UsedValuesEntry.find(Val: V);
18515 if (It == UsedValuesEntry.end())
18516 continue;
18517 // Do not try to shuffle scalars, if they are constants, or instructions
18518 // that can be vectorized as a result of the following vector build
18519 // vectorization.
18520 if (isConstant(V) || (MightBeIgnored(V) &&
18521 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
18522 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
18523 continue;
18524 unsigned Idx = It->second;
18525 EntryLanes.emplace_back(Args&: Idx, Args&: I);
18526 UsedIdxs.set(Idx);
18527 }
18528 // Iterate through all shuffled scalars and select entries, which can be used
18529 // for final shuffle.
18530 SmallVector<const TreeEntry *> TempEntries;
18531 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
18532 if (!UsedIdxs.test(Idx: I))
18533 continue;
18534 // Fix the entry number for the given scalar. If it is the first entry, set
18535 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
18536 // These indices are used when calculating final shuffle mask as the vector
18537 // offset.
18538 for (std::pair<unsigned, int> &Pair : EntryLanes)
18539 if (Pair.first == I)
18540 Pair.first = TempEntries.size();
18541 TempEntries.push_back(Elt: Entries[I]);
18542 }
18543 Entries.swap(RHS&: TempEntries);
18544 if (EntryLanes.size() == Entries.size() &&
18545 !VL.equals(RHS: ArrayRef(TE->Scalars)
18546 .slice(N: Part * VL.size(),
18547 M: std::min<int>(a: VL.size(), b: TE->Scalars.size())))) {
18548 // We may have here 1 or 2 entries only. If the number of scalars is equal
18549 // to the number of entries, no need to do the analysis, it is not very
18550 // profitable. Since VL is not the same as TE->Scalars, it means we already
18551 // have some shuffles before. Cut off not profitable case.
18552 Entries.clear();
18553 return std::nullopt;
18554 }
18555 // Build the final mask, check for the identity shuffle, if possible.
18556 bool IsIdentity = Entries.size() == 1;
18557 // Pair.first is the offset to the vector, while Pair.second is the index of
18558 // scalar in the list.
18559 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
18560 unsigned Idx = Part * VL.size() + Pair.second;
18561 Mask[Idx] =
18562 Pair.first * VF +
18563 (ForOrder ? std::distance(
18564 first: Entries[Pair.first]->Scalars.begin(),
18565 last: find(Range: Entries[Pair.first]->Scalars, Val: VL[Pair.second]))
18566 : Entries[Pair.first]->findLaneForValue(V: VL[Pair.second]));
18567 IsIdentity &= Mask[Idx] == Pair.second;
18568 }
18569 if (ForOrder || IsIdentity || Entries.empty()) {
18570 switch (Entries.size()) {
18571 case 1:
18572 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
18573 return TargetTransformInfo::SK_PermuteSingleSrc;
18574 break;
18575 case 2:
18576 if (EntryLanes.size() > 2 || VL.size() <= 2)
18577 return TargetTransformInfo::SK_PermuteTwoSrc;
18578 break;
18579 default:
18580 break;
18581 }
18582 } else if (!isa<VectorType>(Val: VL.front()->getType()) &&
18583 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
18584 // Do the cost estimation if shuffle beneficial than buildvector.
18585 SmallVector<int> SubMask(std::next(x: Mask.begin(), n: Part * VL.size()),
18586 std::next(x: Mask.begin(), n: (Part + 1) * VL.size()));
18587 int MinElement = SubMask.front(), MaxElement = SubMask.front();
18588 for (int Idx : SubMask) {
18589 if (Idx == PoisonMaskElem)
18590 continue;
18591 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
18592 MinElement = Idx;
18593 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
18594 MaxElement = Idx;
18595 }
18596 assert(MaxElement >= 0 && MinElement >= 0 &&
18597 MaxElement % VF >= MinElement % VF &&
18598 "Expected at least single element.");
18599 unsigned NewVF = std::max<unsigned>(
18600 a: VL.size(), b: getFullVectorNumberOfElements(TTI: *TTI, Ty: VL.front()->getType(),
18601 Sz: (MaxElement % VF) -
18602 (MinElement % VF) + 1));
18603 if (NewVF < VF) {
18604 for (int &Idx : SubMask) {
18605 if (Idx == PoisonMaskElem)
18606 continue;
18607 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
18608 (Idx >= static_cast<int>(VF) ? NewVF : 0);
18609 }
18610 } else {
18611 NewVF = VF;
18612 }
18613
18614 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18615 auto *VecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: NewVF);
18616 auto *MaskVecTy = getWidenedType(ScalarTy: VL.front()->getType(), VF: SubMask.size());
18617 auto GetShuffleCost = [&,
18618 &TTI = *TTI](ArrayRef<int> Mask,
18619 ArrayRef<const TreeEntry *> Entries,
18620 VectorType *VecTy) -> InstructionCost {
18621 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
18622 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
18623 Mask, Factor: Entries.front()->getInterleaveFactor()))
18624 return TTI::TCC_Free;
18625 return ::getShuffleCost(TTI,
18626 Kind: Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
18627 : TTI::SK_PermuteSingleSrc,
18628 Tp: VecTy, Mask, CostKind);
18629 };
18630 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
18631 InstructionCost FirstShuffleCost = 0;
18632 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
18633 if (Entries.size() == 1 || !Entries[0]->isGather()) {
18634 FirstShuffleCost = ShuffleCost;
18635 } else {
18636 // Transform mask to include only first entry.
18637 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18638 bool IsIdentity = true;
18639 for (auto [I, Idx] : enumerate(First&: FirstMask)) {
18640 if (Idx >= static_cast<int>(NewVF)) {
18641 Idx = PoisonMaskElem;
18642 } else {
18643 DemandedElts.clearBit(BitPosition: I);
18644 if (Idx != PoisonMaskElem)
18645 IsIdentity &= static_cast<int>(I) == Idx;
18646 }
18647 }
18648 if (!IsIdentity)
18649 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
18650 FirstShuffleCost += getScalarizationOverhead(
18651 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18652 /*Extract=*/false, CostKind);
18653 }
18654 InstructionCost SecondShuffleCost = 0;
18655 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
18656 if (Entries.size() == 1 || !Entries[1]->isGather()) {
18657 SecondShuffleCost = ShuffleCost;
18658 } else {
18659 // Transform mask to include only first entry.
18660 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18661 bool IsIdentity = true;
18662 for (auto [I, Idx] : enumerate(First&: SecondMask)) {
18663 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
18664 Idx = PoisonMaskElem;
18665 } else {
18666 DemandedElts.clearBit(BitPosition: I);
18667 if (Idx != PoisonMaskElem) {
18668 Idx -= NewVF;
18669 IsIdentity &= static_cast<int>(I) == Idx;
18670 }
18671 }
18672 }
18673 if (!IsIdentity)
18674 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
18675 SecondShuffleCost += getScalarizationOverhead(
18676 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18677 /*Extract=*/false, CostKind);
18678 }
18679 APInt DemandedElts = APInt::getAllOnes(numBits: SubMask.size());
18680 for (auto [I, Idx] : enumerate(First&: SubMask))
18681 if (Idx == PoisonMaskElem)
18682 DemandedElts.clearBit(BitPosition: I);
18683 InstructionCost BuildVectorCost = getScalarizationOverhead(
18684 TTI: *TTI, ScalarTy: VL.front()->getType(), Ty: MaskVecTy, DemandedElts, /*Insert=*/true,
18685 /*Extract=*/false, CostKind);
18686 const TreeEntry *BestEntry = nullptr;
18687 if (FirstShuffleCost < ShuffleCost) {
18688 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18689 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
18690 f: [&](int &Idx) {
18691 if (Idx >= static_cast<int>(VF))
18692 Idx = PoisonMaskElem;
18693 });
18694 BestEntry = Entries.front();
18695 ShuffleCost = FirstShuffleCost;
18696 }
18697 if (SecondShuffleCost < ShuffleCost) {
18698 std::for_each(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18699 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()),
18700 f: [&](int &Idx) {
18701 if (Idx < static_cast<int>(VF))
18702 Idx = PoisonMaskElem;
18703 else
18704 Idx -= VF;
18705 });
18706 BestEntry = Entries[1];
18707 ShuffleCost = SecondShuffleCost;
18708 }
18709 if (BuildVectorCost >= ShuffleCost) {
18710 if (BestEntry) {
18711 Entries.clear();
18712 Entries.push_back(Elt: BestEntry);
18713 }
18714 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
18715 : TargetTransformInfo::SK_PermuteSingleSrc;
18716 }
18717 }
18718 Entries.clear();
18719 // Clear the corresponding mask elements.
18720 std::fill(first: std::next(x: Mask.begin(), n: Part * VL.size()),
18721 last: std::next(x: Mask.begin(), n: (Part + 1) * VL.size()), value: PoisonMaskElem);
18722 return std::nullopt;
18723}
18724
/// Tries to represent the scalars \p VL of the gather node \p TE as a series
/// of per-register shuffles of previously built tree entries, instead of
/// emitting a plain gather sequence.
///
/// \p VL is split into \p NumParts equally sized slices and each slice is
/// analyzed independently by isGatherShuffledSingleRegisterEntry(). On
/// success, \p Mask receives the combined shuffle mask (PoisonMaskElem for
/// scalars that still must be gathered) and \p Entries receives, per part, the
/// source tree entries to shuffle from. \p ForOrder is forwarded to the
/// per-register helper when only an ordering is being computed.
///
/// \returns One ShuffleKind per part (std::nullopt for parts that could not
/// be matched), or an empty vector if no shuffling is possible at all.
SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
BoUpSLP::isGatherShuffledEntry(
    const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
    SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
    bool ForOrder) {
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  Entries.clear();
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       none_of(Range: ArrayRef(VectorizableTree).drop_front(),
               P: [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
               })))
    return {};
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  // implemented yet.
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI: *TTI))
    return {};
  Mask.assign(NumElts: VL.size(), Elt: PoisonMaskElem);
  assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  assert(VL.size() % NumParts == 0 &&
         "Number of scalars must be divisible by NumParts.");
  // Bail out for a gather node that directly feeds another gather node
  // (EdgeIdx == UINT_MAX) when it is the root, a node of extractelements, a
  // splat, or duplicates an existing entry — NOTE(review): presumably these
  // cases are handled by other gather-emission paths; confirm against the
  // builders of such nodes.
  if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() &&
      TE->UserTreeIndex.EdgeIdx == UINT_MAX &&
      (TE->Idx == 0 ||
       (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
       isSplat(VL: TE->Scalars) ||
       (TE->hasState() &&
        getSameValuesTreeEntry(V: TE->getMainOp(), VL: TE->Scalars))))
    return {};
  // Analyze each register-sized slice of VL separately.
  unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> Res;
  for (unsigned Part : seq<unsigned>(Size: NumParts)) {
    ArrayRef<Value *> SubVL =
        VL.slice(N: Part * SliceSize, M: getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part));
    SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, VL: SubVL, Mask, Entries&: SubEntries, Part,
                                            ForOrder);
    if (!SubRes)
      SubEntries.clear();
    Res.push_back(Elt: SubRes);
    // If a single source entry perfectly matches the whole node (its vector
    // factor equals VL.size() and its scalars match), drop the per-part
    // results and return one identity shuffle of that entry for the entire
    // node.
    if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(VL: TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      SmallVector<const TreeEntry *> LocalSubEntries;
      LocalSubEntries.swap(RHS&: SubEntries);
      Entries.clear();
      Res.clear();
      std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(Val: VL[I]))
          Mask[I] = PoisonMaskElem;
      Entries.emplace_back(Args: 1, Args&: LocalSubEntries.front());
      Res.push_back(Elt: TargetTransformInfo::SK_PermuteSingleSrc);
      return Res;
    }
  }
  // If no part could be matched, report complete failure.
  if (all_of(Range&: Res,
             P: [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
    Entries.clear();
    return {};
  }
  return Res;
}
18795
18796InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
18797 Type *ScalarTy) const {
18798 const unsigned VF = VL.size();
18799 auto *VecTy = getWidenedType(ScalarTy, VF);
18800 // Find the cost of inserting/extracting values from the vector.
18801 // Check if the same elements are inserted several times and count them as
18802 // shuffle candidates.
18803 APInt DemandedElements = APInt::getZero(numBits: VF);
18804 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
18805 InstructionCost Cost;
18806 auto EstimateInsertCost = [&](unsigned I, Value *V) {
18807 DemandedElements.setBit(I);
18808 if (V->getType() != ScalarTy)
18809 Cost += TTI->getCastInstrCost(Opcode: Instruction::Trunc, Dst: ScalarTy, Src: V->getType(),
18810 CCH: TTI::CastContextHint::None, CostKind);
18811 };
18812 SmallVector<int> ConstantShuffleMask(VF, PoisonMaskElem);
18813 std::iota(first: ConstantShuffleMask.begin(), last: ConstantShuffleMask.end(), value: 0);
18814 for (auto [I, V] : enumerate(First&: VL)) {
18815 // No need to shuffle duplicates for constants.
18816 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(Val: V))
18817 continue;
18818
18819 if (isConstant(V)) {
18820 ConstantShuffleMask[I] = I + VF;
18821 continue;
18822 }
18823 EstimateInsertCost(I, V);
18824 }
18825 // FIXME: add a cost for constant vector materialization.
18826 bool IsAnyNonUndefConst =
18827 any_of(Range&: VL, P: [](Value *V) { return !isa<UndefValue>(Val: V) && isConstant(V); });
18828 // 1. Shuffle input source vector and constant vector.
18829 if (!ForPoisonSrc && IsAnyNonUndefConst) {
18830 Cost += ::getShuffleCost(TTI: *TTI, Kind: TargetTransformInfo::SK_PermuteTwoSrc, Tp: VecTy,
18831 Mask: ConstantShuffleMask);
18832 }
18833
18834 // 2. Insert unique non-constants.
18835 if (!DemandedElements.isZero())
18836 Cost += getScalarizationOverhead(TTI: *TTI, ScalarTy, Ty: VecTy, DemandedElts: DemandedElements,
18837 /*Insert=*/true,
18838 /*Extract=*/false, CostKind,
18839 ForPoisonSrc: ForPoisonSrc && !IsAnyNonUndefConst, VL);
18840 return Cost;
18841}
18842
18843Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
18844 auto It = EntryToLastInstruction.find(Val: E);
18845 if (It != EntryToLastInstruction.end())
18846 return *cast<Instruction>(Val&: It->second);
18847 Instruction *Res = nullptr;
18848 // Get the basic block this bundle is in. All instructions in the bundle
18849 // should be in this block (except for extractelement-like instructions with
18850 // constant indices or gathered loads or copyables).
18851 Instruction *Front;
18852 unsigned Opcode;
18853 if (E->hasState()) {
18854 Front = E->getMainOp();
18855 Opcode = E->getOpcode();
18856 } else {
18857 Front = cast<Instruction>(Val: *find_if(Range: E->Scalars, P: IsaPred<Instruction>));
18858 Opcode = Front->getOpcode();
18859 }
18860 auto *BB = Front->getParent();
18861 assert(
18862 ((GatheredLoadsEntriesFirst.has_value() && Opcode == Instruction::Load &&
18863 E->isGather() && E->Idx < *GatheredLoadsEntriesFirst) ||
18864 E->State == TreeEntry::SplitVectorize || E->hasCopyableElements() ||
18865 all_of(E->Scalars,
18866 [=](Value *V) -> bool {
18867 if (Opcode == Instruction::GetElementPtr &&
18868 !isa<GetElementPtrInst>(V))
18869 return true;
18870 auto *I = dyn_cast<Instruction>(V);
18871 return !I || !E->getMatchingMainOpOrAltOp(I) ||
18872 I->getParent() == BB || isVectorLikeInstWithConstOps(I);
18873 })) &&
18874 "Expected gathered loads or GEPs or instructions from same basic "
18875 "block.");
18876
18877 auto FindLastInst = [&]() {
18878 Instruction *LastInst = Front;
18879 for (Value *V : E->Scalars) {
18880 auto *I = dyn_cast<Instruction>(Val: V);
18881 if (!I)
18882 continue;
18883 if (E->isCopyableElement(V: I))
18884 continue;
18885 if (LastInst->getParent() == I->getParent()) {
18886 if (LastInst->comesBefore(Other: I))
18887 LastInst = I;
18888 continue;
18889 }
18890 assert(((Opcode == Instruction::GetElementPtr &&
18891 !isa<GetElementPtrInst>(I)) ||
18892 E->State == TreeEntry::SplitVectorize ||
18893 (isVectorLikeInstWithConstOps(LastInst) &&
18894 isVectorLikeInstWithConstOps(I)) ||
18895 (GatheredLoadsEntriesFirst.has_value() &&
18896 Opcode == Instruction::Load && E->isGather() &&
18897 E->Idx < *GatheredLoadsEntriesFirst)) &&
18898 "Expected vector-like or non-GEP in GEP node insts only.");
18899 if (!DT->isReachableFromEntry(A: LastInst->getParent())) {
18900 LastInst = I;
18901 continue;
18902 }
18903 if (!DT->isReachableFromEntry(A: I->getParent()))
18904 continue;
18905 auto *NodeA = DT->getNode(BB: LastInst->getParent());
18906 auto *NodeB = DT->getNode(BB: I->getParent());
18907 assert(NodeA && "Should only process reachable instructions");
18908 assert(NodeB && "Should only process reachable instructions");
18909 assert((NodeA == NodeB) ==
18910 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18911 "Different nodes should have different DFS numbers");
18912 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
18913 LastInst = I;
18914 }
18915 BB = LastInst->getParent();
18916 return LastInst;
18917 };
18918
18919 auto FindFirstInst = [&]() {
18920 Instruction *FirstInst = Front;
18921 for (Value *V : E->Scalars) {
18922 auto *I = dyn_cast<Instruction>(Val: V);
18923 if (!I)
18924 continue;
18925 if (E->isCopyableElement(V: I))
18926 continue;
18927 if (FirstInst->getParent() == I->getParent()) {
18928 if (I->comesBefore(Other: FirstInst))
18929 FirstInst = I;
18930 continue;
18931 }
18932 assert(((Opcode == Instruction::GetElementPtr &&
18933 !isa<GetElementPtrInst>(I)) ||
18934 (isVectorLikeInstWithConstOps(FirstInst) &&
18935 isVectorLikeInstWithConstOps(I))) &&
18936 "Expected vector-like or non-GEP in GEP node insts only.");
18937 if (!DT->isReachableFromEntry(A: FirstInst->getParent())) {
18938 FirstInst = I;
18939 continue;
18940 }
18941 if (!DT->isReachableFromEntry(A: I->getParent()))
18942 continue;
18943 auto *NodeA = DT->getNode(BB: FirstInst->getParent());
18944 auto *NodeB = DT->getNode(BB: I->getParent());
18945 assert(NodeA && "Should only process reachable instructions");
18946 assert(NodeB && "Should only process reachable instructions");
18947 assert((NodeA == NodeB) ==
18948 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
18949 "Different nodes should have different DFS numbers");
18950 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
18951 FirstInst = I;
18952 }
18953 return FirstInst;
18954 };
18955
18956 if (E->State == TreeEntry::SplitVectorize) {
18957 Res = FindLastInst();
18958 if (ArrayRef<TreeEntry *> Entries = getTreeEntries(V: Res); !Entries.empty()) {
18959 for (auto *E : Entries) {
18960 auto *I = dyn_cast_or_null<Instruction>(Val&: E->VectorizedValue);
18961 if (!I)
18962 I = &getLastInstructionInBundle(E);
18963 if (Res->getParent() == I->getParent() && Res->comesBefore(Other: I))
18964 Res = I;
18965 }
18966 }
18967 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18968 return *Res;
18969 }
18970
18971 // Set insertpoint for gathered loads to the very first load.
18972 if (GatheredLoadsEntriesFirst.has_value() &&
18973 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
18974 Opcode == Instruction::Load) {
18975 Res = FindFirstInst();
18976 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
18977 return *Res;
18978 }
18979
18980 // Set the insert point to the beginning of the basic block if the entry
18981 // should not be scheduled.
18982 auto FindScheduleBundle = [&](const TreeEntry *E) -> const ScheduleBundle * {
18983 if (E->isGather())
18984 return nullptr;
18985 // Found previously that the instruction do not need to be scheduled.
18986 const auto *It = BlocksSchedules.find(Key: BB);
18987 if (It == BlocksSchedules.end())
18988 return nullptr;
18989 for (Value *V : E->Scalars) {
18990 auto *I = dyn_cast<Instruction>(Val: V);
18991 if (!I || isa<PHINode>(Val: I) ||
18992 (!E->isCopyableElement(V: I) && doesNotNeedToBeScheduled(V: I)))
18993 continue;
18994 ArrayRef<ScheduleBundle *> Bundles = It->second->getScheduleBundles(V: I);
18995 if (Bundles.empty())
18996 continue;
18997 const auto *It = find_if(
18998 Range&: Bundles, P: [&](ScheduleBundle *B) { return B->getTreeEntry() == E; });
18999 if (It != Bundles.end())
19000 return *It;
19001 }
19002 return nullptr;
19003 };
19004 const ScheduleBundle *Bundle = FindScheduleBundle(E);
19005 if (!E->isGather() && !Bundle) {
19006 if ((Opcode == Instruction::GetElementPtr &&
19007 any_of(Range: E->Scalars,
19008 P: [](Value *V) {
19009 return !isa<GetElementPtrInst>(Val: V) && isa<Instruction>(Val: V);
19010 })) ||
19011 (all_of(Range: E->Scalars,
19012 P: [&](Value *V) {
19013 return isa<PoisonValue>(Val: V) ||
19014 (E->Idx == 0 && isa<InsertElementInst>(Val: V)) ||
19015 E->isCopyableElement(V) ||
19016 (!isVectorLikeInstWithConstOps(V) &&
19017 isUsedOutsideBlock(V));
19018 }) &&
19019 (!E->doesNotNeedToSchedule() ||
19020 any_of(Range: E->Scalars,
19021 P: [&](Value *V) {
19022 if (!isa<Instruction>(Val: V) ||
19023 (E->hasCopyableElements() && E->isCopyableElement(V)))
19024 return false;
19025 return !areAllOperandsNonInsts(V);
19026 }) ||
19027 none_of(Range: E->Scalars, P: [&](Value *V) {
19028 if (!isa<Instruction>(Val: V) ||
19029 (E->hasCopyableElements() && E->isCopyableElement(V)))
19030 return false;
19031 return MustGather.contains(Ptr: V);
19032 }))))
19033 Res = FindLastInst();
19034 else
19035 Res = FindFirstInst();
19036 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19037 return *Res;
19038 }
19039
19040 // Find the last instruction. The common case should be that BB has been
19041 // scheduled, and the last instruction is VL.back(). So we start with
19042 // VL.back() and iterate over schedule data until we reach the end of the
19043 // bundle. The end of the bundle is marked by null ScheduleData.
19044 if (Bundle) {
19045 assert(!E->isGather() && "Gathered instructions should not be scheduled");
19046 Res = Bundle->getBundle().back()->getInst();
19047 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19048 return *Res;
19049 }
19050
19051 // LastInst can still be null at this point if there's either not an entry
19052 // for BB in BlocksSchedules or there's no ScheduleData available for
19053 // VL.back(). This can be the case if buildTreeRec aborts for various
19054 // reasons (e.g., the maximum recursion depth is reached, the maximum region
19055 // size is reached, etc.). ScheduleData is initialized in the scheduling
19056 // "dry-run".
19057 //
19058 // If this happens, we can still find the last instruction by brute force. We
19059 // iterate forwards from Front (inclusive) until we either see all
19060 // instructions in the bundle or reach the end of the block. If Front is the
19061 // last instruction in program order, LastInst will be set to Front, and we
19062 // will visit all the remaining instructions in the block.
19063 //
19064 // One of the reasons we exit early from buildTreeRec is to place an upper
19065 // bound on compile-time. Thus, taking an additional compile-time hit here is
19066 // not ideal. However, this should be exceedingly rare since it requires that
19067 // we both exit early from buildTreeRec and that the bundle be out-of-order
19068 // (causing us to iterate all the way to the end of the block).
19069 if (!Res)
19070 Res = FindLastInst();
19071 assert(Res && "Failed to find last instruction in bundle");
19072 EntryToLastInstruction.try_emplace(Key: E, Args&: Res);
19073 return *Res;
19074}
19075
/// Sets the IRBuilder insertion point to the position where vectorized code
/// for the bundle of tree entry \p E must be emitted, and sets the current
/// debug location to that of the entry's main operation.
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(Val: LastInst);
  if (IsPHI) {
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    // Nothing may be inserted before a landingpad instruction; step past it.
    if (LastInstIt != LastInst->getParent()->end() &&
        LastInstIt->getParent()->isLandingPad())
      LastInstIt = std::next(x: LastInstIt);
  }
  // Insert *at* the computed iterator for PHIs, for non-gather entries that do
  // not need scheduling (or whose non-copyable last instruction is used
  // outside the block), and for gathered-load entries; otherwise insert right
  // *after* the last instruction of the bundle.
  if (IsPHI ||
      (!E->isGather() && E->State != TreeEntry::SplitVectorize &&
       (E->doesNotNeedToSchedule() ||
        (E->hasCopyableElements() && !E->isCopyableElement(V: LastInst) &&
         isUsedOutsideBlock(V: LastInst)))) ||
      (GatheredLoadsEntriesFirst.has_value() &&
       E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
       E->getOpcode() == Instruction::Load)) {
    Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set the
    // debug location to Front.
    Builder.SetInsertPoint(
        TheBB: LastInst->getParent(),
        IP: LastInst->getNextNode()->getIterator());
    // Cache a stable position marker per LastInst: emit a throw-away load of a
    // poison pointer right after it, remember that instruction, and erase it.
    // NOTE(review): this relies on eraseInstruction deferring the actual
    // removal so the erased placeholder remains a valid insertion anchor —
    // confirm against eraseInstruction's implementation.
    if (Instruction *Res = LastInstructionToPos.lookup(Val: LastInst)) {
      Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
    } else {
      Res = Builder.CreateAlignedLoad(Ty: Builder.getPtrTy(),
                                      Ptr: PoisonValue::get(T: Builder.getPtrTy()),
                                      Align: MaybeAlign());
      Builder.SetInsertPoint(TheBB: LastInst->getParent(), IP: Res->getIterator());
      eraseInstruction(I: Res);
      LastInstructionToPos.try_emplace(Key: LastInst, Args&: Res);
    }
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
19117
/// Emits a gather sequence for the scalars \p VL: builds a vector of
/// VL.size() x \p ScalarTy by inserting the scalars one-by-one, optionally
/// blending the constant part into \p Root via \p CreateShuffle. Constants are
/// inserted first; instructions from the insertion block/loop are postponed to
/// the end to improve later hoisting. Returns the resulting vector value.
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(BB: Builder.GetInsertBlock());
  // Returns true if InsertBB can be reached from InstBB by walking up a chain
  // of single predecessors, i.e. the insertion point is "below" InstBB.
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(Ptr: InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
  // Collect the lanes whose insertion must be postponed to the end.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(Val: VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           isVectorized(V: Inst) ||
           (L && (!Root || L->isLoopInvariant(V: Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(V: I).second)
        PostponedInsts.emplace_back(Args&: Inst, Args&: I);
  }

  // Inserts scalar V into Vec at lane Pos, casting integers to Ty if needed,
  // and records an external use when V belongs to a live vectorized entry.
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
                                      Type *Ty) {
    Value *Scalar = V;
    if (Scalar->getType() != Ty) {
      assert(Scalar->getType()->isIntOrIntVectorTy() &&
             Ty->isIntOrIntVectorTy() && "Expected integer types only.");
      Value *V = Scalar;
      // Look through a sext/zext whose source is neither deleted nor
      // vectorized and cast the original (narrower) operand directly.
      if (auto *CI = dyn_cast<CastInst>(Val: Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(Val: CI)) {
        Value *Op = CI->getOperand(i_nocapture: 0);
        if (auto *IOp = dyn_cast<Instruction>(Val: Op);
            !IOp || !(isDeleted(I: IOp) || isVectorized(V: IOp)))
          V = Op;
      }
      Scalar = Builder.CreateIntCast(
          V, DestTy: Ty, isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
    }

    Instruction *InsElt;
    if (auto *VecTy = dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
      // Revectorization: the "scalar" is itself a vector, insert it as a
      // subvector at the corresponding offset.
      assert(SLPReVec && "FixedVectorType is not expected.");
      Vec =
          createInsertVector(Builder, Vec, V: Scalar, Index: Pos * getNumElements(Ty: VecTy));
      auto *II = dyn_cast<Instruction>(Val: Vec);
      if (!II)
        return Vec;
      InsElt = II;
    } else {
      Vec = Builder.CreateInsertElement(Vec, NewElt: Scalar, Idx: Builder.getInt32(C: Pos));
      InsElt = dyn_cast<InsertElementInst>(Val: Vec);
      if (!InsElt)
        return Vec;
    }
    GatherShuffleExtractSeq.insert(X: InsElt);
    CSEBlocks.insert(V: InsElt->getParent());
    // Add to our 'need-to-extract' list.
    if (isa<Instruction>(Val: V)) {
      // Only consider entries that are still alive (not deleted and not
      // turned back into gather nodes).
      ArrayRef<TreeEntry *> Entries = getTreeEntries(V);
      const auto *It = find_if(Range&: Entries, P: [&](const TreeEntry *E) {
        return !TransformedToGatherNodes.contains(Val: E) &&
               !DeletedNodes.contains(Ptr: E);
      });
      if (It != Entries.end()) {
        // Find which lane we need to extract.
        User *UserOp = nullptr;
        if (Scalar != V) {
          // The scalar was casted; the cast instruction is the user.
          if (auto *SI = dyn_cast<Instruction>(Val: Scalar))
            UserOp = SI;
        } else {
          if (V->getType()->isVectorTy()) {
            if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: InsElt);
                SV && SV->getOperand(i_nocapture: 0) != V && SV->getOperand(i_nocapture: 1) != V) {
              // Find shufflevector, caused by resize.
              auto FindOperand = [](Value *Vec, Value *V) -> Instruction * {
                if (auto *SV = dyn_cast<ShuffleVectorInst>(Val: Vec)) {
                  if (SV->getOperand(i_nocapture: 0) == V)
                    return SV;
                  if (SV->getOperand(i_nocapture: 1) == V)
                    return SV;
                }
                return nullptr;
              };
              InsElt = nullptr;
              if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 0), V))
                InsElt = User;
              else if (Instruction *User = FindOperand(SV->getOperand(i_nocapture: 1), V))
                InsElt = User;
              assert(InsElt &&
                     "Failed to find shufflevector, caused by resize.");
            }
          }
          UserOp = InsElt;
        }
        if (UserOp) {
          unsigned FoundLane = (*It)->findLaneForValue(V);
          ExternalUses.emplace_back(Args&: V, Args&: UserOp, Args&: **It, Args&: FoundLane);
        }
      }
    }
    return Vec;
  };
  auto *VecTy = getWidenedType(ScalarTy, VF: VL.size());
  Value *Vec = PoisonValue::get(T: VecTy);
  SmallVector<int> NonConsts;
  SmallVector<int> Mask(VL.size());
  std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
  Value *OriginalRoot = Root;
  // Look through a single-source shuffle of the root: shuffle its source
  // directly using the root's mask instead.
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Val: Root);
      SV && isa<PoisonValue>(Val: SV->getOperand(i_nocapture: 1)) &&
      SV->getOperand(i_nocapture: 0)->getType() == VecTy) {
    Root = SV->getOperand(i_nocapture: 0);
    Mask.assign(in_start: SV->getShuffleMask().begin(), in_end: SV->getShuffleMask().end());
  }
  // Insert constant values at first.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (PostponedIndices.contains(V: I))
      continue;
    if (!isConstant(V: VL[I])) {
      NonConsts.push_back(Elt: I);
      continue;
    }
    if (isa<PoisonValue>(Val: VL[I]))
      continue;
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
    Mask[I] = I + E;
  }
  // Blend the constant vector into the root; erase the original root shuffle
  // if it became dead and is not used as a vectorized value.
  if (Root) {
    if (isa<PoisonValue>(Val: Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(Val: OriginalRoot);
          OI && OI->use_empty() &&
          none_of(Range&: VectorizableTree, P: [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
        eraseInstruction(I: OI);
    }
  }
  // Insert non-constant values.
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  // Append instructions, which are/may be part of the loop, in the end to make
  // it possible to hoist non-loop-based instructions.
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);

  return Vec;
}
19272
19273/// Merges shuffle masks and emits final shuffle instruction, if required. It
19274/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
19275/// when the actual shuffle instruction is generated only if this is actually
19276/// required. Otherwise, the shuffle instruction emission is delayed till the
19277/// end of the process, to reduce the number of emitted instructions and further
19278/// analysis/transformations.
19279/// The class also will look through the previously emitted shuffle instructions
19280/// and properly mark indices in mask as undef.
19281/// For example, given the code
19282/// \code
19283/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
19284/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
19285/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
19287/// look through %s1 and %s2 and emit
19288/// \code
19289/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
19290/// \endcode
19291/// instead.
19292/// If 2 operands are of different size, the smallest one will be resized and
19293/// the mask recalculated properly.
19294/// For example, given the code
19295/// \code
19296/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
19297/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
19298/// \endcode
/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
19300/// look through %s1 and %s2 and emit
19301/// \code
19302/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
19303/// \endcode
19304/// instead.
19305class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
19306 bool IsFinalized = false;
19307 /// Combined mask for all applied operands and masks. It is built during
19308 /// analysis and actual emission of shuffle vector instructions.
19309 SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
19311 /// operands, if the 3rd is going to be added, the first 2 are combined into
19312 /// shuffle with \p CommonMask mask, the first operand sets to be the
19313 /// resulting shuffle and the second operand sets to be the newly added
19314 /// operand. The \p CommonMask is transformed in the proper way after that.
19315 SmallVector<Value *, 2> InVectors;
19316 IRBuilderBase &Builder;
19317 BoUpSLP &R;
19318
19319 class ShuffleIRBuilder {
19320 IRBuilderBase &Builder;
19321 /// Holds all of the instructions that we gathered.
19322 SetVector<Instruction *> &GatherShuffleExtractSeq;
19323 /// A list of blocks that we are going to CSE.
19324 DenseSet<BasicBlock *> &CSEBlocks;
19325 /// Data layout.
19326 const DataLayout &DL;
19327
19328 public:
19329 ShuffleIRBuilder(IRBuilderBase &Builder,
19330 SetVector<Instruction *> &GatherShuffleExtractSeq,
19331 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
19332 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
19333 CSEBlocks(CSEBlocks), DL(DL) {}
19334 ~ShuffleIRBuilder() = default;
19335 /// Creates shufflevector for the 2 operands with the given mask.
19336 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
19337 if (V1->getType() != V2->getType()) {
19338 assert(V1->getType()->isIntOrIntVectorTy() &&
19339 V1->getType()->isIntOrIntVectorTy() &&
19340 "Expected integer vector types only.");
19341 if (V1->getType() != V2->getType()) {
19342 if (cast<VectorType>(Val: V2->getType())
19343 ->getElementType()
19344 ->getIntegerBitWidth() < cast<VectorType>(Val: V1->getType())
19345 ->getElementType()
19346 ->getIntegerBitWidth())
19347 V2 = Builder.CreateIntCast(
19348 V: V2, DestTy: V1->getType(), isSigned: !isKnownNonNegative(V: V2, SQ: SimplifyQuery(DL)));
19349 else
19350 V1 = Builder.CreateIntCast(
19351 V: V1, DestTy: V2->getType(), isSigned: !isKnownNonNegative(V: V1, SQ: SimplifyQuery(DL)));
19352 }
19353 }
19354 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
19355 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
19356 GatherShuffleExtractSeq.insert(X: I);
19357 CSEBlocks.insert(V: I->getParent());
19358 }
19359 return Vec;
19360 }
19361 /// Creates permutation of the single vector operand with the given mask, if
19362 /// it is not identity mask.
19363 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
19364 if (Mask.empty())
19365 return V1;
19366 unsigned VF = Mask.size();
19367 unsigned LocalVF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
19368 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: VF))
19369 return V1;
19370 Value *Vec = Builder.CreateShuffleVector(V: V1, Mask);
19371 if (auto *I = dyn_cast<Instruction>(Val: Vec)) {
19372 GatherShuffleExtractSeq.insert(X: I);
19373 CSEBlocks.insert(V: I->getParent());
19374 }
19375 return Vec;
19376 }
19377 Value *createIdentity(Value *V) { return V; }
19378 Value *createPoison(Type *Ty, unsigned VF) {
19379 return PoisonValue::get(T: getWidenedType(ScalarTy: Ty, VF));
19380 }
19381 /// Resizes 2 input vector to match the sizes, if the they are not equal
19382 /// yet. The smallest vector is resized to the size of the larger vector.
19383 void resizeToMatch(Value *&V1, Value *&V2) {
19384 if (V1->getType() == V2->getType())
19385 return;
19386 int V1VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
19387 int V2VF = cast<FixedVectorType>(Val: V2->getType())->getNumElements();
19388 int VF = std::max(a: V1VF, b: V2VF);
19389 int MinVF = std::min(a: V1VF, b: V2VF);
19390 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
19391 std::iota(first: IdentityMask.begin(), last: std::next(x: IdentityMask.begin(), n: MinVF),
19392 value: 0);
19393 Value *&Op = MinVF == V1VF ? V1 : V2;
19394 Op = Builder.CreateShuffleVector(V: Op, Mask: IdentityMask);
19395 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
19396 GatherShuffleExtractSeq.insert(X: I);
19397 CSEBlocks.insert(V: I->getParent());
19398 }
19399 if (MinVF == V1VF)
19400 V1 = Op;
19401 else
19402 V2 = Op;
19403 }
19404 };
19405
  /// Smart shuffle instruction emission, walks through shuffles trees and
  /// tries to find the best matching vector for the actual shuffle
  /// instruction. Delegates to BaseShuffleAnalysis::createShuffle with a
  /// ShuffleIRBuilder, which registers all emitted instructions for CSE.
  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(
        V1, V2, Mask, Builder&: ShuffleBuilder, ScalarTy);
  }
19416
  /// Cast value \p V to the vector type with the same number of elements, but
  /// the base type \p ScalarTy. Returns \p V unchanged when the element types
  /// already match. If \p IsSigned is not provided, signedness of the cast is
  /// derived from known-non-negativity of \p V.
  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(Val: V->getType());
    assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
    if (VecTy->getElementType() == ScalarTy->getScalarType())
      return V;
    return Builder.CreateIntCast(
        V, DestTy: VectorType::get(ElementType: ScalarTy->getScalarType(), EC: VecTy->getElementCount()),
        isSigned: IsSigned.value_or(u: !isKnownNonNegative(V, SQ: SimplifyQuery(*R.DL))));
  }
19429
  /// Returns the vectorized value of entry \p E, cast to this builder's
  /// ScalarTy element type if it is an integer vector. The cast is treated as
  /// signed if any non-poison scalar of \p E may be negative.
  Value *getVectorizedValue(const TreeEntry &E) {
    Value *Vec = E.VectorizedValue;
    if (!Vec->getType()->isIntOrIntVectorTy())
      return Vec;
    return castToScalarTyElem(V: Vec, IsSigned: any_of(Range: E.Scalars, P: [&](Value *V) {
                                return !isa<PoisonValue>(Val: V) &&
                                       !isKnownNonNegative(
                                           V, SQ: SimplifyQuery(*R.DL));
                              }));
  }
19440
public:
  /// \p ScalarTy is the element type of the vectors being built; \p R gives
  /// access to the vectorizer state (tree entries, CSE sets, data layout).
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
19444
  /// Adjusts extractelements after reusing them: rewrites \p Mask in terms of
  /// the (vectorized) source vectors of the extracts, erases extracts whose
  /// only use was vectorized, and returns the combined base vector.
  /// \p UseVecBaseAsInput is set to true when a multi-register shuffle had to
  /// be built and its result must be used as the input vector.
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(Scalars&: VL, Mask: ReorderMask);
    }
    // Collect the unique source vectors of the used extracts and erase
    // extracts that became dead after vectorization.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      int Idx = Mask[I];
      if (Idx == PoisonMaskElem)
        continue;
      auto *EI = cast<ExtractElementInst>(Val: VL[I]);
      VecBase = EI->getVectorOperand();
      if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecBase); !TEs.empty())
        VecBase = TEs.front()->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(Ptr: VecBase);
      // If the only one use is vectorized - can delete the extractelement
      // itself.
      // Keep the extract alive if it has other uses, is used externally as an
      // original scalar, occurs a different number of times in this node than
      // in its user node, spans multiple parts, or any of its users is not
      // (fully) vectorized.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(Ptr: EI) ||
          (E->UserTreeIndex && E->UserTreeIndex.EdgeIdx == UINT_MAX &&
           !R.isVectorized(V: EI) &&
           count_if(Range: E->Scalars, P: [&](Value *V) { return V == EI; }) !=
               count_if(Range&: E->UserTreeIndex.UserTE->Scalars,
                        P: [&](Value *V) { return V == EI; })) ||
          (NumParts != 1 && count(Range&: VL, Element: EI) > 1) ||
          any_of(Range: EI->users(), P: [&](User *U) {
            ArrayRef<TreeEntry *> UTEs = R.getTreeEntries(V: U);
            return UTEs.empty() || UTEs.size() > 1 ||
                   any_of(Range&: UTEs,
                          P: [&](const TreeEntry *TE) {
                            return R.DeletedNodes.contains(Ptr: TE) ||
                                   R.TransformedToGatherNodes.contains(Val: TE);
                          }) ||
                   (isa<GetElementPtrInst>(Val: U) &&
                    !R.areAllUsersVectorized(I: cast<Instruction>(Val: U))) ||
                   (!UTEs.empty() &&
                    count_if(Range&: R.VectorizableTree,
                             P: [&](const std::unique_ptr<TreeEntry> &TE) {
                               return TE->UserTreeIndex.UserTE ==
                                          UTEs.front() &&
                                      is_contained(Range&: VL, Element: EI);
                             }) != 1);
          }))
        continue;
      R.eraseInstruction(I: EI);
    }
    // Single part or single source vector: no cross-register joining needed,
    // just return the base cast to the right element type.
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(V: VecBase);
    }
    UseVecBaseAsInput = true;
    auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
      for (auto [I, Idx] : enumerate(First&: Mask))
        if (Idx != PoisonMaskElem)
          Idx = I;
    };
    // Perform multi-register vector shuffle, joining them into a single virtual
    // long vector.
    // Need to shuffle each part independently and then insert all this parts
    // into a long virtual vector register, forming the original vector.
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    unsigned SliceSize = getPartNumElems(Size: VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(Size: NumParts)) {
      unsigned Limit = getNumElems(Size: VL.size(), PartNumElems: SliceSize, Part);
      ArrayRef<Value *> SubVL = ArrayRef(VL).slice(N: Part * SliceSize, M: Limit);
      MutableArrayRef<int> SubMask = Mask.slice(N: Part * SliceSize, M: Limit);
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(t&: SubVL, u&: SubMask);
      // The part's VF is the widest of its (vectorized) source vectors.
      const unsigned VF = std::accumulate(
          first: VLMask.begin(), last: VLMask.end(), init: 0U, binary_op: [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp);
                !TEs.empty())
              VecOp = TEs.front()->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(Val: VecOp->getType())->getNumElements();
            return std::max(a: S, b: Size);
          });
      // Assign each used source vector to one of the (at most 2) base slots.
      for (const auto [V, I] : VLMask) {
        if (I == PoisonMaskElem)
          continue;
        Value *VecOp = cast<ExtractElementInst>(Val: V)->getVectorOperand();
        if (ArrayRef<TreeEntry *> TEs = R.getTreeEntries(V: VecOp); !TEs.empty())
          VecOp = TEs.front()->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(V: VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(V1: Bases.front(), V2: Bases.back(), Mask: SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert((Part == 0 || all_of(seq<unsigned>(0, Part),
                                    [&](unsigned P) {
                                      ArrayRef<int> SubMask =
                                          Mask.slice(P * SliceSize,
                                                     getNumElems(Mask.size(),
                                                                 SliceSize, P));
                                      return all_of(SubMask, [](int Idx) {
                                        return Idx == PoisonMaskElem;
                                      });
                                    })) &&
               "Expected first part or all previous parts masked.");
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
        if (Vec->getType() != SubVec->getType()) {
          unsigned SubVecVF =
              cast<FixedVectorType>(Val: SubVec->getType())->getNumElements();
          NewVF = std::max(a: NewVF, b: SubVecVF);
        }
        // Adjust SubMask.
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: Part * SliceSize));
        Vec = createShuffle(V1: Vec, V2: SubVec, Mask: VecMask);
        TransformToIdentity(VecMask);
      }
    }
    copy(Range&: VecMask, Out: Mask.begin());
    return Vec;
  }
  /// Checks if the specified entry \p E needs to be delayed because of its
  /// dependency nodes. Returns std::nullopt when all dependencies are already
  /// vectorized; otherwise returns a placeholder value (a load of a poison
  /// pointer) to be replaced once the dependencies have been emitted.
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all deps are ready.
    if (all_of(Range&: Deps, P: [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              Range&: TEs, P: [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // Postpone gather emission, will be emitted after the end of the
    // process to keep correct order.
    auto *ResVecTy = getWidenedType(ScalarTy, VF: E->getVectorFactor());
    return Builder.CreateAlignedLoad(
        Ty: ResVecTy,
        Ptr: PoisonValue::get(T: PointerType::getUnqual(C&: ScalarTy->getContext())),
        Align: MaybeAlign());
  }
19608 /// Reset the builder to handle perfect diamond match.
19609 void resetForSameNode() {
19610 IsFinalized = false;
19611 CommonMask.clear();
19612 InVectors.clear();
19613 }
19614 /// Adds 2 input vectors (in form of tree entries) and the mask for their
19615 /// shuffling.
19616 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
19617 Value *V1 = getVectorizedValue(E: E1);
19618 Value *V2 = getVectorizedValue(E: E2);
19619 add(V1, V2, Mask);
19620 }
19621 /// Adds single input vector (in form of tree entry) and the mask for its
19622 /// shuffling.
19623 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
19624 Value *V1 = getVectorizedValue(E: E1);
19625 add(V1, Mask);
19626 }
  /// Adds 2 input vectors and the mask for their shuffling. If operands are
  /// already queued, the queued pair is folded into a single vector first and
  /// the common mask is redirected to address the new operand's lanes.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V: V1);
    V2 = castToScalarTyElem(V: V2);
    if (InVectors.empty()) {
      InVectors.push_back(Elt: V1);
      InVectors.push_back(Elt: V2);
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      return;
    }
    // Operands already queued: fold them into a single vector so the new pair
    // can occupy the second slot.
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    } else if (cast<FixedVectorType>(Val: Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    // Lanes taken from the new operand are addressed past the first operand's
    // lanes in the combined mask.
    unsigned VF = std::max(a: getVF(V: V1), b: getVF(V: Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(Elt: V1);
  }
  /// Adds another one input vector and the mask for the shuffling. The unnamed
  /// bool parameter is unused in this overload (kept for interface
  /// compatibility).
  void add(Value *V1, ArrayRef<int> Mask, bool = false) {
    assert(isa<FixedVectorType>(V1->getType()) &&
           "castToScalarTyElem expects V1 to be FixedVectorType");
    V1 = castToScalarTyElem(V: V1);
    if (InVectors.empty()) {
      InVectors.push_back(Elt: V1);
      CommonMask.assign(in_start: Mask.begin(), in_end: Mask.end());
      return;
    }
    const auto *It = find(Range&: InVectors, Val: V1);
    if (It == InVectors.end()) {
      // V1 is a new vector. If both slots are taken or types mismatch, fold
      // the queued operands into a single vector first.
      if (InVectors.size() == 2 ||
          InVectors.front()->getType() != V1->getType()) {
        Value *V = InVectors.front();
        if (InVectors.size() == 2) {
          V = createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
          transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
        } else if (cast<FixedVectorType>(Val: V->getType())->getNumElements() !=
                   CommonMask.size()) {
          V = createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
          transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
        }
        unsigned VF = std::max(a: CommonMask.size(), b: Mask.size());
        for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
          if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
            CommonMask[Idx] = V->getType() != V1->getType()
                                  ? Idx + VF
                                  : Mask[Idx] + getVF(V: V1);
        if (V->getType() != V1->getType())
          V1 = createShuffle(V1, V2: nullptr, Mask);
        InVectors.front() = V;
        if (InVectors.size() == 2)
          InVectors.back() = V1;
        else
          InVectors.push_back(Elt: V1);
        return;
      }
      // Check if second vector is required if the used elements are already
      // used from the first one.
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
          InVectors.push_back(Elt: V1);
          break;
        }
    }
    // Merge the new mask into the common mask, offsetting indices unless V1 is
    // (or matches) the first queued operand.
    unsigned VF = 0;
    for (Value *V : InVectors)
      VF = std::max(a: VF, b: getVF(V));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
  }
19714 /// Adds another one input vector and the mask for the shuffling.
19715 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
19716 SmallVector<int> NewMask;
19717 inversePermutation(Indices: Order, Mask&: NewMask);
19718 add(V1, Mask: NewMask);
19719 }
  /// Gathers the scalars \p VL into a vector (optionally blending into
  /// \p Root), delegating to BoUpSLP::gather with this builder's shuffle
  /// emission. Note: \p MaskVF is not used in this implementation.
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    CreateShuffle: [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
  /// Emits a freeze instruction for \p V (stops poison/undef propagation).
  Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
  /// Finalize emission of the shuffles.
  /// Flushes the accumulated input vectors and CommonMask into actual shuffle
  /// instructions, inserts the \p SubVectors at their positions, and applies
  /// the external \p ExtMask (e.g. reuse-shuffle indices) last.
  /// \param Action the action (if any) to be performed before final applying of
  /// the \p ExtMask mask.
  /// \param VF required vector length of the value handed to \p Action; must
  /// be non-zero when \p Action is set.
  Value *finalize(
      ArrayRef<int> ExtMask,
      ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
      ArrayRef<int> SubVectorsMask, unsigned VF = 0,
      function_ref<void(Value *&, SmallVectorImpl<int> &,
                        function_ref<Value *(Value *, Value *, ArrayRef<int>)>)>
          Action = {}) {
    IsFinalized = true;
    if (Action) {
      // Materialize the pending shuffle(s) into a single vector first, so the
      // callback sees one value plus an identity-transformed CommonMask.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
      if (VecVF < VF) {
        // Widen the vector to VF lanes with poison tail before the action.
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: ResizeMask);
      }
      Action(Vec, CommonMask, [this](Value *V1, Value *V2, ArrayRef<int> Mask) {
        return createShuffle(V1, V2, Mask);
      });
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      // Flush pending shuffles, then insert each subvector entry at its
      // element offset via insertvector sequences.
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(V1: Vec, V2: InVectors.back(), Mask: CommonMask);
        InVectors.pop_back();
      } else {
        Vec = createShuffle(V1: Vec, V2: nullptr, Mask: CommonMask);
      }
      transformMaskAfterShuffle(CommonMask, Mask: CommonMask);
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = getVectorizedValue(E: *E);
          unsigned InsertionIndex = Idx * getNumElements(Ty: ScalarTy);
          // Use scalar version of the ScalarTy to correctly handle shuffles
          // for revectorization. The revectorization mode operates by the
          // vectors, but here we need to operate on the scalars, because the
          // masks were already transformed for the vector elements and we don't
          // need doing this transformation again.
          Type *OrigScalarTy = ScalarTy;
          ScalarTy = ScalarTy->getScalarType();
          Vec = createInsertVector(
              Builder, Vec, V, Index: InsertionIndex,
              Generator: std::bind(f: &ShuffleInstructionBuilder::createShuffle, args: this, args: _1, args: _2,
                        args: _3));
          ScalarTy = OrigScalarTy;
          if (!CommonMask.empty()) {
            // The inserted lanes now come straight from the combined vector:
            // make the mask identity over the inserted range.
            std::iota(first: std::next(x: CommonMask.begin(), n: Idx),
                      last: std::next(x: CommonMask.begin(), n: Idx + E->getVectorFactor()),
                      value: Idx);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        // Insert subvectors into a poison vector and blend with the original
        // via SVMask; CommonMask lanes are redirected to the second source.
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(Range&: SubVectorsMask, Out: SVMask.begin());
        for (auto [I1, I2] : zip(t&: SVMask, u&: CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Value *InsertVec =
            CreateSubVectors(PoisonValue::get(T: Vec->getType()), CommonMask);
        Vec = createShuffle(V1: InsertVec, V2: Vec, Mask: SVMask);
        transformMaskAfterShuffle(CommonMask, Mask: SVMask);
      }
      InVectors.front() = Vec;
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(in_start: ExtMask.begin(), in_end: ExtMask.end());
      } else {
        // Compose ExtMask on top of CommonMask.
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(RHS&: NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(V1: InVectors.front(), V2: InVectors.back(), Mask: CommonMask);
    return createShuffle(V1: InVectors.front(), V2: nullptr, Mask: CommonMask);
  }
19835
  ~ShuffleInstructionBuilder() {
    // Catch missed finalize() calls: a non-empty CommonMask means accumulated
    // shuffles were never emitted.
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
19840};
19841
/// Vectorizes the operand number \p NodeIdx of the node \p E by emitting code
/// for its corresponding operand tree entry.
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx) {
  return vectorizeTree(E: getOperandEntry(E, Idx: NodeIdx));
}
19845
/// Emits a gather/buildvector-like sequence (or, depending on \p BVTy, its
/// cost-model counterpart) for the scalars of the gather node \p E. Tries, in
/// order: reusing extractelement source vectors, reusing already vectorized
/// tree entries via shuffles, and finally building the vector directly from
/// (constant and non-constant) scalars, possibly with a broadcast + freeze.
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert((E->isGather() || TransformedToGatherNodes.contains(E)) &&
         "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Do not process split vectorize node, marked to be gathers/buildvectors.
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  if (E->State == TreeEntry::SplitVectorize &&
      TransformedToGatherNodes.contains(Val: E)) {
    SubVectors.clear();
  } else {
    // Clear values, to be replaced by insertvector instructions.
    for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
      for_each(MutableArrayRef(GatheredScalars)
                   .slice(N: Idx, M: VectorizableTree[EIdx]->getVectorFactor()),
               [&](Value *&V) { V = PoisonValue::get(T: V->getType()); });
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
  }
  // Build a mask out of the reorder indices and reorder scalars per this
  // mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(Scalars&: GatheredScalars, Mask: ReorderMask);
  SmallVector<int> SubVectorsMask;
  inversePermutation(Indices: E->ReorderIndices, Mask&: SubVectorsMask);
  // Transform non-clustered elements in the mask to poison (-1).
  // "Clustered" operations will be reordered using this mask later.
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(Size: GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  // Keep the original (pre-poisoning) scalars for extractelement analysis.
  SmallVector<Value *> StoredGS(GatheredScalars);
  // Checks whether the gather node is a splat whose single input vector can be
  // reused directly; if so, rewrites the given mask slice into a splat mask
  // and reports whether the value is used in the expression directly.
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(VL: E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndex.UserTE;
    unsigned EdgeIdx = E->UserTreeIndex.EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      // A possibly-poisonous splat is only safe when the sibling operand of
      // the same user proves the undef lanes are covered.
      auto *It = find_if(ArrayRef(VectorizableTree).drop_front(N: UserTE->Idx + 1),
                         [=](const std::unique_ptr<TreeEntry> &TE) {
                           return TE->UserTreeIndex.UserTE == UserTE &&
                                  TE->UserTreeIndex.EdgeIdx != EdgeIdx;
                         });
      if (It == VectorizableTree.end())
        return false;
      SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
      if (!(*It)->ReorderIndices.empty()) {
        inversePermutation((*It)->ReorderIndices, ReorderMask);
        reorderScalars(Scalars&: GS, Mask: ReorderMask);
      }
      if (!all_of(zip(t&: GatheredScalars, u&: GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(Val: V0) || isa<PoisonValue>(Val: V0) ||
                   (isa<UndefValue>(Val: V0) && !isa<PoisonValue>(Val: V0) &&
                    is_contained(Range: E->Scalars, Element: V1));
          }))
        return false;
    }
    int Idx;
    if ((Mask.size() < InputVF &&
         ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts: InputVF, Index&: Idx) &&
         Idx == 0) ||
        (Mask.size() == InputVF &&
         ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))) {
      // Identity/leading-subvector use: make the slice an identity mask.
      std::iota(
          first: std::next(x: Mask.begin(), n: I * SliceSize),
          last: std::next(x: Mask.begin(),
                    n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
          value: 0);
    } else {
      // Otherwise broadcast the single used lane across the slice.
      unsigned IVal =
          *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
      std::fill(
          first: std::next(x: Mask.begin(), n: I * SliceSize),
          last: std::next(x: Mask.begin(),
                    n: I * SliceSize + getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I)),
          value: IVal);
    }
    return true;
  };
  // Builder performing the actual emission (or cost estimation, depending on
  // BVTy).
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TargetTransformInfo::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, VF: GatheredScalars.size());
  unsigned NumParts = ::getNumberOfParts(TTI: *TTI, VecTy, Limit: GatheredScalars.size());
  if (!all_of(Range&: GatheredScalars, P: IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    bool Resized = false;
    ExtractShuffles =
        tryToGatherExtractElements(VL&: GatheredScalars, Mask&: ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(First&: ExtractMask)) {
        if (I == PoisonMaskElem)
          continue;
        if (ArrayRef<TreeEntry *> TEs = getTreeEntries(
                V: cast<ExtractElementInst>(Val: StoredGS[Idx])->getVectorOperand());
            !TEs.empty())
          ExtractEntries.append(in_start: TEs.begin(), in_end: TEs.end());
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(X: E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(Val: VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            // Pad the scalars with poison to match the extract base width.
            Resized = true;
            GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
                                   Elt: PoisonValue::get(T: OrigScalarTy));
            NumParts =
                ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF), Limit: VF);
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(Range: E->Scalars, P: IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(Val: V) && isVectorized(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return isVectorized(V); }) ||
        isSplat(VL: E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(TE: E, VL: GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(X: E);
        // Postpone gather emission, will be emitted after the end of the
        // process to keep correct order.
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
          Entries.front().front()->isSame(VL: E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          << shortBundleName(E->Scalars, E->Idx) << ".\n");
        // Restore the mask for previous partially matched values.
        Mask.resize(N: E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
        } else {
          for (auto [I, V] : enumerate(First: E->Scalars)) {
            if (isa<PoisonValue>(Val: V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        // Reset the builder(s) to correctly handle perfect diamond matched
        // nodes.
        ShuffleBuilder.resetForSameNode();
        // Full matched entry found, no need to insert subvectors.
        if (equal(LRange: E->Scalars, RRange: FrontTE->Scalars) &&
            equal(LRange: E->ReorderIndices, RRange: FrontTE->ReorderIndices) &&
            equal(LRange: E->ReuseShuffleIndices, RRange: FrontTE->ReuseShuffleIndices)) {
          Mask.resize(N: FrontTE->getVectorFactor());
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
          ShuffleBuilder.add(*FrontTE, Mask);
          Res = ShuffleBuilder.finalize({}, {}, {});
        } else {
          ShuffleBuilder.add(*FrontTE, Mask);
          Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        }
        return Res;
      }
      if (!Resized) {
        // Match the widest reused entry's vector factor by padding with
        // poison.
        if (GatheredScalars.size() != VF &&
            any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
              return any_of(TEs, [&](const TreeEntry *TE) {
                return TE->getVectorFactor() == VF;
              });
            }))
          GatheredScalars.append(NumInputs: VF - GatheredScalars.size(),
                                 Elt: PoisonValue::get(T: OrigScalarTy));
      }
      // Remove shuffled elements from list of gathers.
      for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
        if (Mask[I] != PoisonMaskElem)
          GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
      }
    }
  }
  // Deduplicates \p Scalars in place and fills \p ReuseMask so repeated
  // values become shuffles of a single copy; detects splats and, for them,
  // may replace undef lanes by a known-safe broadcast or request a freeze.
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(VL: Scalars) &&
                   (Scalars.size() > 2 || Scalars.front() == Scalars.back());
    Scalars.append(NumInputs: VF - Scalars.size(), Elt: PoisonValue::get(T: OrigScalarTy));
    SmallVector<int> UndefPos;
    DenseMap<Value *, unsigned> UniquePositions;
    // Gather unique non-const values and all constant values.
    // For repeated values, just shuffle them.
    int NumNonConsts = 0;
    int SinglePos = 0;
    for (auto [I, V] : enumerate(First&: Scalars)) {
      if (isa<UndefValue>(Val: V)) {
        if (!isa<PoisonValue>(Val: V)) {
          ReuseMask[I] = I;
          UndefPos.push_back(Elt: I);
        }
        continue;
      }
      if (isConstant(V)) {
        ReuseMask[I] = I;
        continue;
      }
      ++NumNonConsts;
      SinglePos = I;
      Value *OrigV = V;
      Scalars[I] = PoisonValue::get(T: OrigScalarTy);
      if (IsSplat) {
        Scalars.front() = OrigV;
        ReuseMask[I] = 0;
      } else {
        const auto Res = UniquePositions.try_emplace(Key: OrigV, Args&: I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      if (IsSplat) {
        ReuseMask.assign(NumElts: VF, Elt: PoisonMaskElem);
        std::swap(a&: Scalars.front(), b&: Scalars[SinglePos]);
        if (!UndefPos.empty() && UndefPos.front() == 0)
          Scalars.front() = UndefValue::get(T: OrigScalarTy);
      }
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast.
      // We can do it if the broadcasted value is guaranteed to be
      // non-poisonous, or by freezing the incoming scalar value first.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(Val: V) &&
               (isVectorized(V) || isGuaranteedNotToBePoison(V, AC) ||
                (E->UserTreeIndex && any_of(V->uses(), [E](const Use &U) {
                   // Check if the value already used in the same operation in
                   // one of the nodes already.
                   return E->UserTreeIndex.EdgeIdx != U.getOperandNo() &&
                          is_contained(Range&: E->UserTreeIndex.UserTE->Scalars,
                                       Element: U.getUser());
                 })));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          // Set the undef position to the non-poisoned scalar.
          ReuseMask[I] = Pos;
          // Replace the undef by the poison, in the mask it is replaced by
          // non-poisoned scalar already.
          if (I != Pos)
            Scalars[I] = PoisonValue::get(T: OrigScalarTy);
        }
      } else {
        // Replace undefs by the poisons, emit broadcast and then emit
        // freeze.
        for (int I : UndefPos) {
          ReuseMask[I] = PoisonMaskElem;
          if (isa<UndefValue>(Val: Scalars[I]))
            Scalars[I] = PoisonValue::get(T: OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  // Combine the found extract vectors and/or reused entries with the
  // remaining constant/non-constant scalars.
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      // Find input vectors.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        if (!Mask.empty() && Mask[I] != PoisonMaskElem)
          ExtractMask[I] = PoisonMaskElem;
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          if (ExtractMask[I] == PoisonMaskElem)
            continue;
          if (isa<UndefValue>(Val: StoredGS[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(Val: StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (ArrayRef<TreeEntry *> TEs = getTreeEntries(V: VecOp);
              !TEs.empty() && TEs.front()->VectorizedValue)
            VecOp = TEs.front()->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        IsNonPoisoned &= isGuaranteedNotToBePoison(V: Vec1, AC) &&
                         isGuaranteedNotToBePoison(V: Vec2, AC);
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(V: Vec1, AC);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Val: Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        ShuffleBuilder.add(PoisonValue::get(T: VecTy), ExtractMask,
                           /*ForExtracts=*/true);
      }
    }
    if (!GatherShuffles.empty()) {
      // Add per-part shuffles of the matched tree entries.
      unsigned SliceSize =
          getPartNumElems(Size: E->Scalars.size(),
                          NumParts: ::getNumberOfParts(TTI: *TTI, VecTy, Limit: E->Scalars.size()));
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(First&: Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        unsigned Limit = getNumElems(Size: Mask.size(), PartNumElems: SliceSize, Part: I);
        auto SubMask = ArrayRef(Mask).slice(N: I * SliceSize, M: Limit);
        VecMask.assign(NumElts: VecMask.size(), Elt: PoisonMaskElem);
        copy(Range&: SubMask, Out: std::next(x: VecMask.begin(), n: I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            IsNonPoisoned &=
                isGuaranteedNotToBePoison(V: TEs.front()->VectorizedValue, AC) &&
                isGuaranteedNotToBePoison(V: TEs.back()->VectorizedValue, AC);
        }
      }
    }
    // Try to figure out best way to combine values: build a shuffle and insert
    // elements or just build several shuffles.
    // Insert non-constant scalars.
    SmallVector<Value *> NonConstants(GatheredScalars);
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build constant vector and shuffle with it only if currently we
    // have a single permutation and more than 1 scalar constants.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask: ExtractMask, NumSrcElts: EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(u: TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(Val: V) && !isa<PoisonValue>(Val: V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(Val: V) && !isa<UndefValue>(Val: V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(Val: V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(Val: V) && !isa<PoisonValue>(Val: V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constant to build final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(Val: GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(T: OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(T: OrigScalarTy);
    }
    // Generate constants for final shuffle and build a mask for them.
    if (!all_of(Range&: GatheredScalars, P: IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(Val: V) ||
                 (IsSingleShuffle && ((IsIdentityShuffle &&
                   IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(Val: V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask, auto CreateShuffle) {
            bool IsSplat = isSplat(VL: NonConstants);
            SmallVector<int> BVMask(Mask.size(), PoisonMaskElem);
            TryPackScalars(NonConstants, BVMask, /*IsRootPoison=*/false);
            auto CheckIfSplatIsProfitable = [&]() {
              // Estimate the cost of splatting + shuffle and compare with
              // insert + shuffle.
              constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
              Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
              if (isa<ExtractElementInst>(Val: V) || isVectorized(V))
                return false;
              InstructionCost SplatCost = TTI->getVectorInstrCost(
                  Opcode: Instruction::InsertElement, Val: VecTy, CostKind, /*Index=*/0,
                  Op0: PoisonValue::get(T: VecTy), Op1: V);
              SmallVector<int> NewMask(Mask.begin(), Mask.end());
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  NewMask[Idx] = Mask.size();
              SplatCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteTwoSrc, Tp: VecTy,
                                            Mask: NewMask, CostKind);
              InstructionCost BVCost = TTI->getVectorInstrCost(
                  Opcode: Instruction::InsertElement, Val: VecTy, CostKind,
                  Index: *find_if(Range&: Mask, P: not_equal_to(Arg: PoisonMaskElem)), Op0: Vec, Op1: V);
              // Shuffle required?
              if (count(Range&: BVMask, Element: PoisonMaskElem) <
                  static_cast<int>(BVMask.size() - 1)) {
                SmallVector<int> NewMask(Mask.begin(), Mask.end());
                for (auto [Idx, I] : enumerate(First&: BVMask))
                  if (I != PoisonMaskElem)
                    NewMask[Idx] = I;
                BVCost += ::getShuffleCost(TTI: *TTI, Kind: TTI::SK_PermuteSingleSrc,
                                           Tp: VecTy, Mask: NewMask, CostKind);
              }
              return SplatCost <= BVCost;
            };
            if (!IsSplat || Mask.size() <= 2 || !CheckIfSplatIsProfitable()) {
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = I;
              Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
            } else {
              // Splat is cheaper: broadcast the single non-undef value and
              // blend it into the vector via a two-source shuffle.
              Value *V = *find_if_not(Range&: NonConstants, P: IsaPred<UndefValue>);
              SmallVector<Value *> Values(NonConstants.size(),
                                          PoisonValue::get(T: ScalarTy));
              Values[0] = V;
              Value *BV = ShuffleBuilder.gather(Values, BVMask.size());
              SmallVector<int> SplatMask(BVMask.size(), PoisonMaskElem);
              transform(BVMask, SplatMask.begin(), [](int I) {
                return I == PoisonMaskElem ? PoisonMaskElem : 0;
              });
              if (!ShuffleVectorInst::isIdentityMask(Mask: SplatMask, NumSrcElts: VF))
                BV = CreateShuffle(BV, nullptr, SplatMask);
              for (auto [Idx, I] : enumerate(First&: BVMask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = BVMask.size() + Idx;
              Vec = CreateShuffle(Vec, BV, Mask);
              for (auto [Idx, I] : enumerate(First&: Mask))
                if (I != PoisonMaskElem)
                  Mask[Idx] = Idx;
            }
          });
  } else if (!allConstant(VL: GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(First&: GatheredScalars)) {
      if (!isa<PoisonValue>(Val: V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  // Freeze the result if undef lanes were replaced by a possibly-poisonous
  // broadcast value.
  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
20408
/// Emits the actual IR for the gather/buildvector node \p E via
/// processBuildVector instantiated with the IR-emitting builder.
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
  // Do not do this for split vectorize node, marked to be gathers/buildvectors.
  if (E->State != TreeEntry::SplitVectorize ||
      !TransformedToGatherNodes.contains(Val: E)) {
    // Vectorize the combined entries first so their values are available when
    // the subvectors are inserted.
    for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
      (void)vectorizeTree(E: VectorizableTree[EIdx].get());
  }
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Params&: Builder, Params&: *this);
}
20419
20420/// \returns \p I after propagating metadata from \p VL only for instructions in
20421/// \p VL.
20422static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
20423 SmallVector<Value *> Insts;
20424 for (Value *V : VL)
20425 if (isa<Instruction>(Val: V))
20426 Insts.push_back(Elt: V);
20427 return llvm::propagateMetadata(I: Inst, VL: Insts);
20428}
20429
20430static DebugLoc getDebugLocFromPHI(PHINode &PN) {
20431 if (DebugLoc DL = PN.getDebugLoc())
20432 return DL;
20433 return DebugLoc::getUnknown();
20434}
20435
20436Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
20437 IRBuilderBase::InsertPointGuard Guard(Builder);
20438
20439 Value *V = E->Scalars.front();
20440 Type *ScalarTy = V->getType();
20441 if (!isa<CmpInst>(Val: V))
20442 ScalarTy = getValueType(V);
20443 auto It = MinBWs.find(Val: E);
20444 if (It != MinBWs.end()) {
20445 auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy);
20446 ScalarTy = IntegerType::get(C&: F->getContext(), NumBits: It->second.first);
20447 if (VecTy)
20448 ScalarTy = getWidenedType(ScalarTy, VF: VecTy->getNumElements());
20449 }
20450 if (E->VectorizedValue)
20451 return E->VectorizedValue;
20452 auto *VecTy = getWidenedType(ScalarTy, VF: E->Scalars.size());
20453 if (E->isGather() || TransformedToGatherNodes.contains(Val: E)) {
20454 // Set insert point for non-reduction initial nodes.
20455 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
20456 setInsertPointAfterBundle(E);
20457 Value *Vec = createBuildVector(E, ScalarTy);
20458 E->VectorizedValue = Vec;
20459 return Vec;
20460 }
20461 if (E->State == TreeEntry::SplitVectorize) {
20462 assert(E->CombinedEntriesWithIndices.size() == 2 &&
20463 "Expected exactly 2 combined entries.");
20464 setInsertPointAfterBundle(E);
20465 TreeEntry &OpTE1 =
20466 *VectorizableTree[E->CombinedEntriesWithIndices.front().first];
20467 assert(OpTE1.isSame(
20468 ArrayRef(E->Scalars).take_front(OpTE1.getVectorFactor())) &&
20469 "Expected same first part of scalars.");
20470 Value *Op1 = vectorizeTree(E: &OpTE1);
20471 TreeEntry &OpTE2 =
20472 *VectorizableTree[E->CombinedEntriesWithIndices.back().first];
20473 assert(
20474 OpTE2.isSame(ArrayRef(E->Scalars).take_back(OpTE2.getVectorFactor())) &&
20475 "Expected same second part of scalars.");
20476 Value *Op2 = vectorizeTree(E: &OpTE2);
20477 auto GetOperandSignedness = [&](const TreeEntry *OpE) {
20478 bool IsSigned = false;
20479 auto It = MinBWs.find(Val: OpE);
20480 if (It != MinBWs.end())
20481 IsSigned = It->second.second;
20482 else
20483 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
20484 if (isa<PoisonValue>(Val: V))
20485 return false;
20486 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20487 });
20488 return IsSigned;
20489 };
20490 if (cast<VectorType>(Val: Op1->getType())->getElementType() !=
20491 ScalarTy->getScalarType()) {
20492 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20493 Op1 = Builder.CreateIntCast(
20494 V: Op1,
20495 DestTy: getWidenedType(
20496 ScalarTy,
20497 VF: cast<FixedVectorType>(Val: Op1->getType())->getNumElements()),
20498 isSigned: GetOperandSignedness(&OpTE1));
20499 }
20500 if (cast<VectorType>(Val: Op2->getType())->getElementType() !=
20501 ScalarTy->getScalarType()) {
20502 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20503 Op2 = Builder.CreateIntCast(
20504 V: Op2,
20505 DestTy: getWidenedType(
20506 ScalarTy,
20507 VF: cast<FixedVectorType>(Val: Op2->getType())->getNumElements()),
20508 isSigned: GetOperandSignedness(&OpTE2));
20509 }
20510 if (E->ReorderIndices.empty()) {
20511 SmallVector<int> Mask(E->getVectorFactor(), PoisonMaskElem);
20512 std::iota(
20513 first: Mask.begin(),
20514 last: std::next(x: Mask.begin(), n: E->CombinedEntriesWithIndices.back().second),
20515 value: 0);
20516 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
20517 if (ScalarTyNumElements != 1) {
20518 assert(SLPReVec && "Only supported by REVEC.");
20519 transformScalarShuffleIndiciesToVector(VecTyNumElements: ScalarTyNumElements, Mask);
20520 }
20521 Value *Vec = Builder.CreateShuffleVector(V: Op1, Mask);
20522 Vec = createInsertVector(Builder, Vec, V: Op2,
20523 Index: E->CombinedEntriesWithIndices.back().second *
20524 ScalarTyNumElements);
20525 E->VectorizedValue = Vec;
20526 return Vec;
20527 }
20528 unsigned CommonVF =
20529 std::max(a: OpTE1.getVectorFactor(), b: OpTE2.getVectorFactor());
20530 const unsigned Scale = getNumElements(Ty: ScalarTy);
20531 CommonVF *= Scale;
20532 if (getNumElements(Ty: Op1->getType()) != CommonVF) {
20533 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20534 copy(Range: createReplicatedMask(ReplicationFactor: Scale, VF: OpTE1.getVectorFactor() * Scale),
20535 Out: Mask.begin());
20536 Op1 = Builder.CreateShuffleVector(V: Op1, Mask);
20537 }
20538 if (getNumElements(Ty: Op2->getType()) != CommonVF) {
20539 SmallVector<int> Mask(CommonVF, PoisonMaskElem);
20540 copy(Range: createReplicatedMask(ReplicationFactor: Scale, VF: OpTE2.getVectorFactor() * Scale),
20541 Out: Mask.begin());
20542 Op2 = Builder.CreateShuffleVector(V: Op2, Mask);
20543 }
20544 Value *Vec = Builder.CreateShuffleVector(V1: Op1, V2: Op2, Mask: E->getSplitMask());
20545 E->VectorizedValue = Vec;
20546 return Vec;
20547 }
20548
20549 bool IsReverseOrder =
20550 !E->ReorderIndices.empty() && isReverseOrder(Order: E->ReorderIndices);
20551 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
20552 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
20553 if (E->getOpcode() == Instruction::Store &&
20554 E->State == TreeEntry::Vectorize) {
20555 ArrayRef<int> Mask =
20556 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
20557 E->ReorderIndices.size());
20558 ShuffleBuilder.add(V1: V, Mask);
20559 } else if ((E->State == TreeEntry::StridedVectorize && IsReverseOrder) ||
20560 E->State == TreeEntry::CompressVectorize) {
20561 ShuffleBuilder.addOrdered(V1: V, Order: {});
20562 } else {
20563 ShuffleBuilder.addOrdered(V1: V, Order: E->ReorderIndices);
20564 }
20565 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
20566 E->CombinedEntriesWithIndices.size());
20567 transform(
20568 Range: E->CombinedEntriesWithIndices, d_first: SubVectors.begin(), F: [&](const auto &P) {
20569 return std::make_pair(VectorizableTree[P.first].get(), P.second);
20570 });
20571 assert(
20572 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
20573 "Expected either combined subnodes or reordering");
20574 return ShuffleBuilder.finalize(ExtMask: E->ReuseShuffleIndices, SubVectors, SubVectorsMask: {});
20575 };
20576
20577 assert(!E->isGather() && "Unhandled state");
20578 unsigned ShuffleOrOp =
20579 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
20580 if (!E->isAltShuffle()) {
20581 switch (E->CombinedOp) {
20582 case TreeEntry::ReducedBitcast:
20583 case TreeEntry::ReducedBitcastBSwap:
20584 case TreeEntry::ReducedBitcastLoads:
20585 case TreeEntry::ReducedBitcastBSwapLoads:
20586 case TreeEntry::ReducedCmpBitcast:
20587 ShuffleOrOp = E->CombinedOp;
20588 break;
20589 default:
20590 break;
20591 }
20592 }
20593 Instruction *VL0 = E->getMainOp();
20594 auto GetOperandSignedness = [&](unsigned Idx) {
20595 const TreeEntry *OpE = getOperandEntry(E, Idx);
20596 bool IsSigned = false;
20597 auto It = MinBWs.find(Val: OpE);
20598 if (It != MinBWs.end())
20599 IsSigned = It->second.second;
20600 else
20601 IsSigned = any_of(Range: OpE->Scalars, P: [&](Value *R) {
20602 if (isa<PoisonValue>(Val: V))
20603 return false;
20604 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
20605 });
20606 return IsSigned;
20607 };
20608 switch (ShuffleOrOp) {
20609 case Instruction::PHI: {
20610 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
20611 E != VectorizableTree.front().get() || E->UserTreeIndex) &&
20612 "PHI reordering is free.");
20613 auto *PH = cast<PHINode>(Val: VL0);
20614 Builder.SetInsertPoint(TheBB: PH->getParent(),
20615 IP: PH->getParent()->getFirstNonPHIIt());
20616 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20617 PHINode *NewPhi = Builder.CreatePHI(Ty: VecTy, NumReservedValues: PH->getNumIncomingValues());
20618 Value *V = NewPhi;
20619
20620 // Adjust insertion point once all PHI's have been generated.
20621 Builder.SetInsertPoint(TheBB: PH->getParent(),
20622 IP: PH->getParent()->getFirstInsertionPt());
20623 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20624
20625 V = FinalShuffle(V, E);
20626
20627 E->VectorizedValue = V;
20628 // If phi node is fully emitted - exit.
20629 if (NewPhi->getNumIncomingValues() != 0)
20630 return NewPhi;
20631
20632 // PHINodes may have multiple entries from the same block. We want to
20633 // visit every block once.
20634 SmallDenseMap<BasicBlock *, unsigned, 4> VisitedBBs;
20635 for (unsigned I : seq<unsigned>(Size: PH->getNumIncomingValues())) {
20636 BasicBlock *IBB = PH->getIncomingBlock(i: I);
20637
20638 // Stop emission if all incoming values are generated.
20639 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
20640 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
20641 return NewPhi;
20642 }
20643
20644 auto Res = VisitedBBs.try_emplace(Key: IBB, Args&: I);
20645 if (!Res.second) {
20646 TreeEntry *OpTE = getOperandEntry(E, Idx: I);
20647 if (OpTE->isGather() || DeletedNodes.contains(Ptr: OpTE) ||
20648 TransformedToGatherNodes.contains(Val: OpTE)) {
20649 Value *VecOp = NewPhi->getIncomingValue(i: Res.first->getSecond());
20650 NewPhi->addIncoming(V: VecOp, BB: IBB);
20651 assert(!OpTE->VectorizedValue && "Expected no vectorized value.");
20652 OpTE->VectorizedValue = VecOp;
20653 continue;
20654 }
20655 }
20656
20657 Builder.SetInsertPoint(IBB->getTerminator());
20658 Builder.SetCurrentDebugLocation(getDebugLocFromPHI(PN&: *PH));
20659 Value *Vec = vectorizeOperand(E, NodeIdx: I);
20660 if (VecTy != Vec->getType()) {
20661 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
20662 MinBWs.contains(getOperandEntry(E, I))) &&
20663 "Expected item in MinBWs.");
20664 Vec = Builder.CreateIntCast(V: Vec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
20665 }
20666 NewPhi->addIncoming(V: Vec, BB: IBB);
20667 }
20668
20669 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
20670 "Invalid number of incoming values");
20671 assert(E->VectorizedValue && "Expected vectorized value.");
20672 return E->VectorizedValue;
20673 }
20674
20675 case Instruction::ExtractElement: {
20676 Value *V = E->getSingleOperand(OpIdx: 0);
20677 setInsertPointAfterBundle(E);
20678 V = FinalShuffle(V, E);
20679 E->VectorizedValue = V;
20680 return V;
20681 }
20682 case Instruction::ExtractValue: {
20683 auto *LI = cast<LoadInst>(Val: E->getSingleOperand(OpIdx: 0));
20684 Builder.SetInsertPoint(LI);
20685 Value *Ptr = LI->getPointerOperand();
20686 LoadInst *V = Builder.CreateAlignedLoad(Ty: VecTy, Ptr, Align: LI->getAlign());
20687 Value *NewV = ::propagateMetadata(Inst: V, VL: E->Scalars);
20688 NewV = FinalShuffle(NewV, E);
20689 E->VectorizedValue = NewV;
20690 return NewV;
20691 }
20692 case Instruction::InsertElement: {
20693 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
20694 if (const TreeEntry *OpE = getOperandEntry(E, Idx: 1);
20695 OpE && !OpE->isGather() && OpE->hasState() &&
20696 !OpE->hasCopyableElements())
20697 Builder.SetInsertPoint(cast<Instruction>(Val: E->Scalars.back()));
20698 else
20699 setInsertPointAfterBundle(E);
20700 Value *V = vectorizeOperand(E, NodeIdx: 1);
20701 ArrayRef<Value *> Op = E->getOperand(OpIdx: 1);
20702 Type *ScalarTy = Op.front()->getType();
20703 if (cast<VectorType>(Val: V->getType())->getElementType() != ScalarTy) {
20704 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
20705 std::pair<unsigned, bool> Res = MinBWs.lookup(Val: getOperandEntry(E, Idx: 1));
20706 assert(Res.first > 0 && "Expected item in MinBWs.");
20707 V = Builder.CreateIntCast(
20708 V,
20709 DestTy: getWidenedType(
20710 ScalarTy,
20711 VF: cast<FixedVectorType>(Val: V->getType())->getNumElements()),
20712 isSigned: Res.second);
20713 }
20714
20715 // Create InsertVector shuffle if necessary
20716 auto *FirstInsert = cast<Instruction>(Val: *find_if(Range&: E->Scalars, P: [E](Value *V) {
20717 return !is_contained(Range&: E->Scalars, Element: cast<Instruction>(Val: V)->getOperand(i: 0));
20718 }));
20719 const unsigned NumElts =
20720 cast<FixedVectorType>(Val: FirstInsert->getType())->getNumElements();
20721 const unsigned NumScalars = E->Scalars.size();
20722
20723 unsigned Offset = *getElementIndex(Inst: VL0);
20724 assert(Offset < NumElts && "Failed to find vector index offset");
20725
20726 // Create shuffle to resize vector
20727 SmallVector<int> Mask;
20728 if (!E->ReorderIndices.empty()) {
20729 inversePermutation(Indices: E->ReorderIndices, Mask);
20730 Mask.append(NumInputs: NumElts - NumScalars, Elt: PoisonMaskElem);
20731 } else {
20732 Mask.assign(NumElts, Elt: PoisonMaskElem);
20733 std::iota(first: Mask.begin(), last: std::next(x: Mask.begin(), n: NumScalars), value: 0);
20734 }
20735 // Create InsertVector shuffle if necessary
20736 bool IsIdentity = true;
20737 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
20738 Mask.swap(RHS&: PrevMask);
20739 for (unsigned I = 0; I < NumScalars; ++I) {
20740 Value *Scalar = E->Scalars[PrevMask[I]];
20741 unsigned InsertIdx = *getElementIndex(Inst: Scalar);
20742 IsIdentity &= InsertIdx - Offset == I;
20743 Mask[InsertIdx - Offset] = I;
20744 }
20745 if (!IsIdentity || NumElts != NumScalars) {
20746 Value *V2 = nullptr;
20747 bool IsVNonPoisonous =
20748 !isConstant(V) && isGuaranteedNotToBePoison(V, AC);
20749 SmallVector<int> InsertMask(Mask);
20750 if (NumElts != NumScalars && Offset == 0) {
20751 // Follow all insert element instructions from the current buildvector
20752 // sequence.
20753 InsertElementInst *Ins = cast<InsertElementInst>(Val: VL0);
20754 do {
20755 std::optional<unsigned> InsertIdx = getElementIndex(Inst: Ins);
20756 if (!InsertIdx)
20757 break;
20758 if (InsertMask[*InsertIdx] == PoisonMaskElem)
20759 InsertMask[*InsertIdx] = *InsertIdx;
20760 if (!Ins->hasOneUse())
20761 break;
20762 Ins = dyn_cast_or_null<InsertElementInst>(
20763 Val: Ins->getUniqueUndroppableUser());
20764 } while (Ins);
20765 SmallBitVector UseMask =
20766 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
20767 SmallBitVector IsFirstPoison =
20768 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20769 SmallBitVector IsFirstUndef =
20770 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
20771 if (!IsFirstPoison.all()) {
20772 unsigned Idx = 0;
20773 for (unsigned I = 0; I < NumElts; I++) {
20774 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I) &&
20775 IsFirstUndef.test(Idx: I)) {
20776 if (IsVNonPoisonous) {
20777 InsertMask[I] = I < NumScalars ? I : 0;
20778 continue;
20779 }
20780 if (!V2)
20781 V2 = UndefValue::get(T: V->getType());
20782 if (Idx >= NumScalars)
20783 Idx = NumScalars - 1;
20784 InsertMask[I] = NumScalars + Idx;
20785 ++Idx;
20786 } else if (InsertMask[I] != PoisonMaskElem &&
20787 Mask[I] == PoisonMaskElem) {
20788 InsertMask[I] = PoisonMaskElem;
20789 }
20790 }
20791 } else {
20792 InsertMask = Mask;
20793 }
20794 }
20795 if (!V2)
20796 V2 = PoisonValue::get(T: V->getType());
20797 V = Builder.CreateShuffleVector(V1: V, V2, Mask: InsertMask);
20798 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20799 GatherShuffleExtractSeq.insert(X: I);
20800 CSEBlocks.insert(V: I->getParent());
20801 }
20802 }
20803
20804 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
20805 for (unsigned I = 0; I < NumElts; I++) {
20806 if (Mask[I] != PoisonMaskElem)
20807 InsertMask[Offset + I] = I;
20808 }
20809 SmallBitVector UseMask =
20810 buildUseMask(VF: NumElts, Mask: InsertMask, MaskArg: UseMask::UndefsAsMask);
20811 SmallBitVector IsFirstUndef =
20812 isUndefVector(V: FirstInsert->getOperand(i: 0), UseMask);
20813 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
20814 NumElts != NumScalars) {
20815 if (IsFirstUndef.all()) {
20816 if (!ShuffleVectorInst::isIdentityMask(Mask: InsertMask, NumSrcElts: NumElts)) {
20817 SmallBitVector IsFirstPoison =
20818 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20819 if (!IsFirstPoison.all()) {
20820 for (unsigned I = 0; I < NumElts; I++) {
20821 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(Idx: I))
20822 InsertMask[I] = I + NumElts;
20823 }
20824 }
20825 V = Builder.CreateShuffleVector(
20826 V1: V,
20827 V2: IsFirstPoison.all() ? PoisonValue::get(T: V->getType())
20828 : FirstInsert->getOperand(i: 0),
20829 Mask: InsertMask, Name: cast<Instruction>(Val: E->Scalars.back())->getName());
20830 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20831 GatherShuffleExtractSeq.insert(X: I);
20832 CSEBlocks.insert(V: I->getParent());
20833 }
20834 }
20835 } else {
20836 SmallBitVector IsFirstPoison =
20837 isUndefVector<true>(V: FirstInsert->getOperand(i: 0), UseMask);
20838 for (unsigned I = 0; I < NumElts; I++) {
20839 if (InsertMask[I] == PoisonMaskElem)
20840 InsertMask[I] = IsFirstPoison.test(Idx: I) ? PoisonMaskElem : I;
20841 else
20842 InsertMask[I] += NumElts;
20843 }
20844 V = Builder.CreateShuffleVector(
20845 V1: FirstInsert->getOperand(i: 0), V2: V, Mask: InsertMask,
20846 Name: cast<Instruction>(Val: E->Scalars.back())->getName());
20847 if (auto *I = dyn_cast<Instruction>(Val: V)) {
20848 GatherShuffleExtractSeq.insert(X: I);
20849 CSEBlocks.insert(V: I->getParent());
20850 }
20851 }
20852 }
20853
20854 ++NumVectorInstructions;
20855 E->VectorizedValue = V;
20856 return V;
20857 }
20858 case Instruction::ZExt:
20859 case Instruction::SExt:
20860 case Instruction::FPToUI:
20861 case Instruction::FPToSI:
20862 case Instruction::FPExt:
20863 case Instruction::PtrToInt:
20864 case Instruction::IntToPtr:
20865 case Instruction::SIToFP:
20866 case Instruction::UIToFP:
20867 case Instruction::Trunc:
20868 case Instruction::FPTrunc:
20869 case Instruction::BitCast: {
20870 setInsertPointAfterBundle(E);
20871
20872 Value *InVec = vectorizeOperand(E, NodeIdx: 0);
20873
20874 auto *CI = cast<CastInst>(Val: VL0);
20875 Instruction::CastOps VecOpcode = CI->getOpcode();
20876 Type *SrcScalarTy = cast<VectorType>(Val: InVec->getType())->getElementType();
20877 auto SrcIt = MinBWs.find(Val: getOperandEntry(E, Idx: 0));
20878 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
20879 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
20880 SrcScalarTy != CI->getOperand(i_nocapture: 0)->getType()->getScalarType())) {
20881 // Check if the values are candidates to demote.
20882 unsigned SrcBWSz = DL->getTypeSizeInBits(Ty: SrcScalarTy);
20883 if (SrcIt != MinBWs.end())
20884 SrcBWSz = SrcIt->second.first;
20885 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy->getScalarType());
20886 if (BWSz == SrcBWSz) {
20887 VecOpcode = Instruction::BitCast;
20888 } else if (BWSz < SrcBWSz) {
20889 VecOpcode = Instruction::Trunc;
20890 } else if (It != MinBWs.end()) {
20891 assert(BWSz > SrcBWSz && "Invalid cast!");
20892 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
20893 } else if (SrcIt != MinBWs.end()) {
20894 assert(BWSz > SrcBWSz && "Invalid cast!");
20895 VecOpcode =
20896 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
20897 }
20898 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
20899 !SrcIt->second.second) {
20900 VecOpcode = Instruction::UIToFP;
20901 } else if (VecOpcode == Instruction::BitCast && SrcIt != MinBWs.end() &&
20902 ScalarTy->isFPOrFPVectorTy()) {
20903 Type *OrigSrcScalarTy = CI->getSrcTy();
20904 auto *OrigSrcVectorTy =
20905 getWidenedType(ScalarTy: OrigSrcScalarTy, VF: E->Scalars.size());
20906 InVec =
20907 Builder.CreateIntCast(V: InVec, DestTy: OrigSrcVectorTy, isSigned: SrcIt->second.second);
20908 }
20909 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
20910 ? InVec
20911 : Builder.CreateCast(Op: VecOpcode, V: InVec, DestTy: VecTy);
20912 V = FinalShuffle(V, E);
20913
20914 E->VectorizedValue = V;
20915 ++NumVectorInstructions;
20916 return V;
20917 }
20918 case Instruction::FCmp:
20919 case Instruction::ICmp: {
20920 setInsertPointAfterBundle(E);
20921
20922 Value *L = vectorizeOperand(E, NodeIdx: 0);
20923 Value *R = vectorizeOperand(E, NodeIdx: 1);
20924 if (L->getType() != R->getType()) {
20925 assert((getOperandEntry(E, 0)->isGather() ||
20926 getOperandEntry(E, 1)->isGather() ||
20927 MinBWs.contains(getOperandEntry(E, 0)) ||
20928 MinBWs.contains(getOperandEntry(E, 1))) &&
20929 "Expected item in MinBWs.");
20930 if (cast<VectorType>(Val: L->getType())
20931 ->getElementType()
20932 ->getIntegerBitWidth() < cast<VectorType>(Val: R->getType())
20933 ->getElementType()
20934 ->getIntegerBitWidth()) {
20935 Type *CastTy = R->getType();
20936 L = Builder.CreateIntCast(V: L, DestTy: CastTy, isSigned: GetOperandSignedness(0));
20937 } else {
20938 Type *CastTy = L->getType();
20939 R = Builder.CreateIntCast(V: R, DestTy: CastTy, isSigned: GetOperandSignedness(1));
20940 }
20941 }
20942
20943 CmpInst::Predicate P0 = cast<CmpInst>(Val: VL0)->getPredicate();
20944 Value *V = Builder.CreateCmp(Pred: P0, LHS: L, RHS: R);
20945 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
20946 if (auto *ICmp = dyn_cast<ICmpInst>(Val: V); ICmp && It == MinBWs.end())
20947 ICmp->setSameSign(/*B=*/false);
20948 // Do not cast for cmps.
20949 VecTy = cast<FixedVectorType>(Val: V->getType());
20950 V = FinalShuffle(V, E);
20951
20952 E->VectorizedValue = V;
20953 ++NumVectorInstructions;
20954 return V;
20955 }
20956 case Instruction::Select: {
20957 setInsertPointAfterBundle(E);
20958
20959 Value *Cond = vectorizeOperand(E, NodeIdx: 0);
20960 Value *True = vectorizeOperand(E, NodeIdx: 1);
20961 Value *False = vectorizeOperand(E, NodeIdx: 2);
20962 if (True->getType() != VecTy || False->getType() != VecTy) {
20963 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
20964 getOperandEntry(E, 2)->isGather() ||
20965 MinBWs.contains(getOperandEntry(E, 1)) ||
20966 MinBWs.contains(getOperandEntry(E, 2))) &&
20967 "Expected item in MinBWs.");
20968 if (True->getType() != VecTy)
20969 True = Builder.CreateIntCast(V: True, DestTy: VecTy, isSigned: GetOperandSignedness(1));
20970 if (False->getType() != VecTy)
20971 False = Builder.CreateIntCast(V: False, DestTy: VecTy, isSigned: GetOperandSignedness(2));
20972 }
20973
20974 unsigned CondNumElements = getNumElements(Ty: Cond->getType());
20975 unsigned TrueNumElements = getNumElements(Ty: True->getType());
20976 assert(TrueNumElements >= CondNumElements &&
20977 TrueNumElements % CondNumElements == 0 &&
20978 "Cannot vectorize Instruction::Select");
20979 assert(TrueNumElements == getNumElements(False->getType()) &&
20980 "Cannot vectorize Instruction::Select");
20981 if (CondNumElements != TrueNumElements) {
20982 // When the return type is i1 but the source is fixed vector type, we
20983 // need to duplicate the condition value.
20984 Cond = Builder.CreateShuffleVector(
20985 V: Cond, Mask: createReplicatedMask(ReplicationFactor: TrueNumElements / CondNumElements,
20986 VF: CondNumElements));
20987 }
20988 assert(getNumElements(Cond->getType()) == TrueNumElements &&
20989 "Cannot vectorize Instruction::Select");
20990 Value *V =
20991 Builder.CreateSelectWithUnknownProfile(C: Cond, True, False, DEBUG_TYPE);
20992 V = FinalShuffle(V, E);
20993
20994 E->VectorizedValue = V;
20995 ++NumVectorInstructions;
20996 return V;
20997 }
20998 case Instruction::FNeg: {
20999 setInsertPointAfterBundle(E);
21000
21001 Value *Op = vectorizeOperand(E, NodeIdx: 0);
21002
21003 Value *V = Builder.CreateUnOp(
21004 Opc: static_cast<Instruction::UnaryOps>(E->getOpcode()), V: Op);
21005 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21006 if (auto *I = dyn_cast<Instruction>(Val: V))
21007 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21008
21009 V = FinalShuffle(V, E);
21010
21011 E->VectorizedValue = V;
21012 ++NumVectorInstructions;
21013
21014 return V;
21015 }
21016 case Instruction::Freeze: {
21017 setInsertPointAfterBundle(E);
21018
21019 Value *Op = vectorizeOperand(E, NodeIdx: 0);
21020
21021 if (Op->getType() != VecTy) {
21022 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21023 MinBWs.contains(getOperandEntry(E, 0))) &&
21024 "Expected item in MinBWs.");
21025 Op = Builder.CreateIntCast(V: Op, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21026 }
21027 Value *V = Builder.CreateFreeze(V: Op);
21028 V = FinalShuffle(V, E);
21029
21030 E->VectorizedValue = V;
21031 ++NumVectorInstructions;
21032
21033 return V;
21034 }
21035 case Instruction::Add:
21036 case Instruction::FAdd:
21037 case Instruction::Sub:
21038 case Instruction::FSub:
21039 case Instruction::Mul:
21040 case Instruction::FMul:
21041 case Instruction::UDiv:
21042 case Instruction::SDiv:
21043 case Instruction::FDiv:
21044 case Instruction::URem:
21045 case Instruction::SRem:
21046 case Instruction::FRem:
21047 case Instruction::Shl:
21048 case Instruction::LShr:
21049 case Instruction::AShr:
21050 case Instruction::And:
21051 case Instruction::Or:
21052 case Instruction::Xor: {
21053 setInsertPointAfterBundle(E);
21054
21055 Value *LHS = vectorizeOperand(E, NodeIdx: 0);
21056 Value *RHS = vectorizeOperand(E, NodeIdx: 1);
21057 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
21058 for (unsigned I : seq<unsigned>(Begin: 0, End: E->getNumOperands())) {
21059 ArrayRef<Value *> Ops = E->getOperand(OpIdx: I);
21060 if (all_of(Range&: Ops, P: [&](Value *Op) {
21061 auto *CI = dyn_cast<ConstantInt>(Val: Op);
21062 return CI && CI->getValue().countr_one() >= It->second.first;
21063 })) {
21064 V = FinalShuffle(I == 0 ? RHS : LHS, E);
21065 E->VectorizedValue = V;
21066 ++NumVectorInstructions;
21067 return V;
21068 }
21069 }
21070 }
21071 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
21072 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
21073 getOperandEntry(E, 1)->isGather() ||
21074 MinBWs.contains(getOperandEntry(E, 0)) ||
21075 MinBWs.contains(getOperandEntry(E, 1))) &&
21076 "Expected item in MinBWs.");
21077 if (LHS->getType() != VecTy)
21078 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21079 if (RHS->getType() != VecTy)
21080 RHS = Builder.CreateIntCast(V: RHS, DestTy: VecTy, isSigned: GetOperandSignedness(1));
21081 }
21082
21083 Value *V = Builder.CreateBinOp(
21084 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
21085 RHS);
21086 propagateIRFlags(I: V, VL: E->Scalars, OpValue: nullptr, IncludeWrapFlags: It == MinBWs.end());
21087 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21088 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21089 // Drop nuw flags for abs(sub(commutative), true).
21090 if (!MinBWs.contains(Val: E) && ShuffleOrOp == Instruction::Sub &&
21091 any_of(Range&: E->Scalars, P: [E](Value *V) {
21092 return isa<PoisonValue>(Val: V) ||
21093 (E->hasCopyableElements() && E->isCopyableElement(V)) ||
21094 isCommutative(I: cast<Instruction>(Val: V));
21095 }))
21096 I->setHasNoUnsignedWrap(/*b=*/false);
21097 }
21098
21099 V = FinalShuffle(V, E);
21100
21101 E->VectorizedValue = V;
21102 ++NumVectorInstructions;
21103
21104 return V;
21105 }
21106 case Instruction::Load: {
21107 // Loads are inserted at the head of the tree because we don't want to
21108 // sink them all the way down past store instructions.
21109 setInsertPointAfterBundle(E);
21110
21111 LoadInst *LI = cast<LoadInst>(Val: VL0);
21112 Instruction *NewLI;
21113 FixedVectorType *StridedLoadTy = nullptr;
21114 Value *PO = LI->getPointerOperand();
21115 if (E->State == TreeEntry::Vectorize) {
21116 NewLI = Builder.CreateAlignedLoad(Ty: VecTy, Ptr: PO, Align: LI->getAlign());
21117 } else if (E->State == TreeEntry::CompressVectorize) {
21118 auto [CompressMask, LoadVecTy, InterleaveFactor, IsMasked] =
21119 CompressEntryToData.at(Val: E);
21120 Align CommonAlignment = LI->getAlign();
21121 if (IsMasked) {
21122 unsigned VF = getNumElements(Ty: LoadVecTy);
21123 SmallVector<Constant *> MaskValues(
21124 VF / getNumElements(Ty: LI->getType()),
21125 ConstantInt::getFalse(Context&: VecTy->getContext()));
21126 for (int I : CompressMask)
21127 MaskValues[I] = ConstantInt::getTrue(Context&: VecTy->getContext());
21128 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
21129 assert(SLPReVec && "Only supported by REVEC.");
21130 MaskValues = replicateMask(Val: MaskValues, VF: VecTy->getNumElements());
21131 }
21132 Constant *MaskValue = ConstantVector::get(V: MaskValues);
21133 NewLI = Builder.CreateMaskedLoad(Ty: LoadVecTy, Ptr: PO, Alignment: CommonAlignment,
21134 Mask: MaskValue);
21135 } else {
21136 NewLI = Builder.CreateAlignedLoad(Ty: LoadVecTy, Ptr: PO, Align: CommonAlignment);
21137 }
21138 NewLI = ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
21139 // TODO: include this cost into CommonCost.
21140 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: LI->getType())) {
21141 assert(SLPReVec && "FixedVectorType is not expected.");
21142 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(),
21143 Mask&: CompressMask);
21144 }
21145 NewLI =
21146 cast<Instruction>(Val: Builder.CreateShuffleVector(V: NewLI, Mask: CompressMask));
21147 } else if (E->State == TreeEntry::StridedVectorize) {
21148 Value *Ptr0 = cast<LoadInst>(Val: E->Scalars.front())->getPointerOperand();
21149 Value *PtrN = cast<LoadInst>(Val: E->Scalars.back())->getPointerOperand();
21150 PO = IsReverseOrder ? PtrN : Ptr0;
21151 Type *StrideTy = DL->getIndexType(PtrTy: PO->getType());
21152 Value *StrideVal;
21153 const StridedPtrInfo &SPtrInfo = TreeEntryToStridedPtrInfoMap.at(Val: E);
21154 StridedLoadTy = SPtrInfo.Ty;
21155 assert(StridedLoadTy && "Missing StridedPoinerInfo for tree entry.");
21156 unsigned StridedLoadEC =
21157 StridedLoadTy->getElementCount().getKnownMinValue();
21158
21159 Value *Stride = SPtrInfo.StrideVal;
21160 if (!Stride) {
21161 const SCEV *StrideSCEV = SPtrInfo.StrideSCEV;
21162 assert(StrideSCEV && "Neither StrideVal nor StrideSCEV were set.");
21163 SCEVExpander Expander(*SE, "strided-load-vec");
21164 Stride = Expander.expandCodeFor(SH: StrideSCEV, Ty: StrideSCEV->getType(),
21165 I: &*Builder.GetInsertPoint());
21166 }
21167 Value *NewStride =
21168 Builder.CreateIntCast(V: Stride, DestTy: StrideTy, /*isSigned=*/true);
21169 StrideVal = Builder.CreateMul(
21170 LHS: NewStride, RHS: ConstantInt::getSigned(
21171 Ty: StrideTy, V: (IsReverseOrder ? -1 : 1) *
21172 static_cast<int>(
21173 DL->getTypeAllocSize(Ty: ScalarTy))));
21174 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
21175 auto *Inst = Builder.CreateIntrinsic(
21176 ID: Intrinsic::experimental_vp_strided_load,
21177 Types: {StridedLoadTy, PO->getType(), StrideTy},
21178 Args: {PO, StrideVal,
21179 Builder.getAllOnesMask(NumElts: ElementCount::getFixed(MinVal: StridedLoadEC)),
21180 Builder.getInt32(C: StridedLoadEC)});
21181 Inst->addParamAttr(
21182 /*ArgNo=*/0,
21183 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
21184 NewLI = Inst;
21185 } else {
21186 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
21187 Value *VecPtr = vectorizeOperand(E, NodeIdx: 0);
21188 if (isa<FixedVectorType>(Val: ScalarTy)) {
21189 assert(SLPReVec && "FixedVectorType is not expected.");
21190 // CreateMaskedGather expects VecTy and VecPtr have same size. We need
21191 // to expand VecPtr if ScalarTy is a vector type.
21192 unsigned ScalarTyNumElements =
21193 cast<FixedVectorType>(Val: ScalarTy)->getNumElements();
21194 unsigned VecTyNumElements =
21195 cast<FixedVectorType>(Val: VecTy)->getNumElements();
21196 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
21197 "Cannot expand getelementptr.");
21198 unsigned VF = VecTyNumElements / ScalarTyNumElements;
21199 SmallVector<Constant *> Indices(VecTyNumElements);
21200 transform(Range: seq(Size: VecTyNumElements), d_first: Indices.begin(), F: [=](unsigned I) {
21201 return Builder.getInt64(C: I % ScalarTyNumElements);
21202 });
21203 VecPtr = Builder.CreateGEP(
21204 Ty: VecTy->getElementType(),
21205 Ptr: Builder.CreateShuffleVector(
21206 V: VecPtr, Mask: createReplicatedMask(ReplicationFactor: ScalarTyNumElements, VF)),
21207 IdxList: ConstantVector::get(V: Indices));
21208 }
21209 // Use the minimum alignment of the gathered loads.
21210 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL: E->Scalars);
21211 NewLI = Builder.CreateMaskedGather(Ty: VecTy, Ptrs: VecPtr, Alignment: CommonAlignment);
21212 }
21213 Value *V = E->State == TreeEntry::CompressVectorize
21214 ? NewLI
21215 : ::propagateMetadata(Inst: NewLI, VL: E->Scalars);
21216
21217 if (StridedLoadTy != VecTy)
21218 V = Builder.CreateBitOrPointerCast(V, DestTy: VecTy);
21219 V = FinalShuffle(V, E);
21220 E->VectorizedValue = V;
21221 ++NumVectorInstructions;
21222 return V;
21223 }
21224 case Instruction::Store: {
21225 auto *SI = cast<StoreInst>(Val: VL0);
21226
21227 setInsertPointAfterBundle(E);
21228
21229 Value *VecValue = vectorizeOperand(E, NodeIdx: 0);
21230 if (VecValue->getType() != VecTy)
21231 VecValue =
21232 Builder.CreateIntCast(V: VecValue, DestTy: VecTy, isSigned: GetOperandSignedness(0));
21233 VecValue = FinalShuffle(VecValue, E);
21234
21235 Value *Ptr = SI->getPointerOperand();
21236 Instruction *ST;
21237 if (E->State == TreeEntry::Vectorize) {
21238 ST = Builder.CreateAlignedStore(Val: VecValue, Ptr, Align: SI->getAlign());
21239 } else {
21240 assert(E->State == TreeEntry::StridedVectorize &&
21241 "Expected either strided or consecutive stores.");
21242 if (!E->ReorderIndices.empty()) {
21243 SI = cast<StoreInst>(Val: E->Scalars[E->ReorderIndices.front()]);
21244 Ptr = SI->getPointerOperand();
21245 }
21246 Align CommonAlignment = computeCommonAlignment<StoreInst>(VL: E->Scalars);
21247 Type *StrideTy = DL->getIndexType(PtrTy: SI->getPointerOperandType());
21248 auto *Inst = Builder.CreateIntrinsic(
21249 ID: Intrinsic::experimental_vp_strided_store,
21250 Types: {VecTy, Ptr->getType(), StrideTy},
21251 Args: {VecValue, Ptr,
21252 ConstantInt::getSigned(
21253 Ty: StrideTy, V: -static_cast<int>(DL->getTypeAllocSize(Ty: ScalarTy))),
21254 Builder.getAllOnesMask(NumElts: VecTy->getElementCount()),
21255 Builder.getInt32(C: E->Scalars.size())});
21256 Inst->addParamAttr(
21257 /*ArgNo=*/1,
21258 Attr: Attribute::getWithAlignment(Context&: Inst->getContext(), Alignment: CommonAlignment));
21259 ST = Inst;
21260 }
21261
21262 Value *V = ::propagateMetadata(Inst: ST, VL: E->Scalars);
21263
21264 E->VectorizedValue = V;
21265 ++NumVectorInstructions;
21266 return V;
21267 }
21268 case Instruction::GetElementPtr: {
21269 auto *GEP0 = cast<GetElementPtrInst>(Val: VL0);
21270 setInsertPointAfterBundle(E);
21271
21272 Value *Op0 = vectorizeOperand(E, NodeIdx: 0);
21273
21274 SmallVector<Value *> OpVecs;
21275 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
21276 Value *OpVec = vectorizeOperand(E, NodeIdx: J);
21277 OpVecs.push_back(Elt: OpVec);
21278 }
21279
21280 Value *V = Builder.CreateGEP(Ty: GEP0->getSourceElementType(), Ptr: Op0, IdxList: OpVecs);
21281 if (Instruction *I = dyn_cast<GetElementPtrInst>(Val: V)) {
21282 SmallVector<Value *> GEPs;
21283 for (Value *V : E->Scalars) {
21284 if (isa<GetElementPtrInst>(Val: V))
21285 GEPs.push_back(Elt: V);
21286 }
21287 V = ::propagateMetadata(Inst: I, VL: GEPs);
21288 }
21289
21290 V = FinalShuffle(V, E);
21291
21292 E->VectorizedValue = V;
21293 ++NumVectorInstructions;
21294
21295 return V;
21296 }
21297 case Instruction::Call: {
21298 CallInst *CI = cast<CallInst>(Val: VL0);
21299 setInsertPointAfterBundle(E);
21300
21301 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
21302
21303 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
21304 CI, ID, VF: VecTy->getNumElements(),
21305 MinBW: It != MinBWs.end() ? It->second.first : 0, TTI);
21306 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
21307 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
21308 VecCallCosts.first <= VecCallCosts.second;
21309
21310 Value *ScalarArg = nullptr;
21311 SmallVector<Value *> OpVecs;
21312 SmallVector<Type *, 2> TysForDecl;
21313 // Add return type if intrinsic is overloaded on it.
21314 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: -1, TTI))
21315 TysForDecl.push_back(Elt: VecTy);
21316 auto *CEI = cast<CallInst>(Val: VL0);
21317 for (unsigned I : seq<unsigned>(Begin: 0, End: CI->arg_size())) {
21318 // Some intrinsics have scalar arguments. This argument should not be
21319 // vectorized.
21320 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx: I, TTI)) {
21321 ScalarArg = CEI->getArgOperand(i: I);
        // If we decided to reduce the bitwidth of the abs intrinsic, its
        // second argument must be set to false (do not return poison if the
        // value is the signed minimum).
21324 if (ID == Intrinsic::abs && It != MinBWs.end() &&
21325 It->second.first < DL->getTypeSizeInBits(Ty: CEI->getType()))
21326 ScalarArg = Builder.getFalse();
21327 OpVecs.push_back(Elt: ScalarArg);
21328 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
21329 TysForDecl.push_back(Elt: ScalarArg->getType());
21330 continue;
21331 }
21332
21333 Value *OpVec = vectorizeOperand(E, NodeIdx: I);
21334 ScalarArg = CEI->getArgOperand(i: I);
21335 if (cast<VectorType>(Val: OpVec->getType())->getElementType() !=
21336 ScalarArg->getType()->getScalarType() &&
21337 It == MinBWs.end()) {
21338 auto *CastTy =
21339 getWidenedType(ScalarTy: ScalarArg->getType(), VF: VecTy->getNumElements());
21340 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: CastTy, isSigned: GetOperandSignedness(I));
21341 } else if (It != MinBWs.end()) {
21342 OpVec = Builder.CreateIntCast(V: OpVec, DestTy: VecTy, isSigned: GetOperandSignedness(I));
21343 }
21344 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
21345 OpVecs.push_back(Elt: OpVec);
21346 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, OpdIdx: I, TTI))
21347 TysForDecl.push_back(Elt: OpVec->getType());
21348 }
21349
21350 Function *CF;
21351 if (!UseIntrinsic) {
21352 VFShape Shape =
21353 VFShape::get(FTy: CI->getFunctionType(),
21354 EC: ElementCount::getFixed(MinVal: VecTy->getNumElements()),
21355 HasGlobalPred: false /*HasGlobalPred*/);
21356 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
21357 } else {
21358 CF = Intrinsic::getOrInsertDeclaration(M: F->getParent(), id: ID, Tys: TysForDecl);
21359 }
21360
21361 SmallVector<OperandBundleDef, 1> OpBundles;
21362 CI->getOperandBundlesAsDefs(Defs&: OpBundles);
21363 Value *V = Builder.CreateCall(Callee: CF, Args: OpVecs, OpBundles);
21364
21365 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21366 cast<CallInst>(Val: V)->setCallingConv(CF->getCallingConv());
21367 V = FinalShuffle(V, E);
21368
21369 E->VectorizedValue = V;
21370 ++NumVectorInstructions;
21371 return V;
21372 }
21373 case Instruction::ShuffleVector: {
21374 Value *V;
21375 if (SLPReVec && !E->isAltShuffle()) {
21376 setInsertPointAfterBundle(E);
21377 Value *Src = vectorizeOperand(E, NodeIdx: 0);
21378 SmallVector<int> ThisMask(calculateShufflevectorMask(VL: E->Scalars));
21379 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Val: Src)) {
21380 SmallVector<int> NewMask(ThisMask.size());
21381 transform(Range&: ThisMask, d_first: NewMask.begin(), F: [&SVSrc](int Mask) {
21382 return SVSrc->getShuffleMask()[Mask];
21383 });
21384 V = Builder.CreateShuffleVector(V1: SVSrc->getOperand(i_nocapture: 0),
21385 V2: SVSrc->getOperand(i_nocapture: 1), Mask: NewMask);
21386 } else {
21387 V = Builder.CreateShuffleVector(V: Src, Mask: ThisMask);
21388 }
21389 propagateIRFlags(I: V, VL: E->Scalars, OpValue: VL0);
21390 if (auto *I = dyn_cast<Instruction>(Val: V))
21391 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21392 V = FinalShuffle(V, E);
21393 } else {
21394 assert(E->isAltShuffle() &&
21395 ((Instruction::isBinaryOp(E->getOpcode()) &&
21396 Instruction::isBinaryOp(E->getAltOpcode())) ||
21397 (Instruction::isCast(E->getOpcode()) &&
21398 Instruction::isCast(E->getAltOpcode())) ||
21399 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
21400 "Invalid Shuffle Vector Operand");
21401
21402 Value *LHS = nullptr, *RHS = nullptr;
21403 if (Instruction::isBinaryOp(Opcode: E->getOpcode()) || isa<CmpInst>(Val: VL0)) {
21404 setInsertPointAfterBundle(E);
21405 LHS = vectorizeOperand(E, NodeIdx: 0);
21406 RHS = vectorizeOperand(E, NodeIdx: 1);
21407 } else {
21408 setInsertPointAfterBundle(E);
21409 LHS = vectorizeOperand(E, NodeIdx: 0);
21410 }
21411 if (LHS && RHS &&
21412 ((Instruction::isBinaryOp(Opcode: E->getOpcode()) &&
21413 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
21414 (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()))) {
21415 assert((It != MinBWs.end() ||
21416 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
21417 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
21418 MinBWs.contains(getOperandEntry(E, 0)) ||
21419 MinBWs.contains(getOperandEntry(E, 1))) &&
21420 "Expected item in MinBWs.");
21421 Type *CastTy = VecTy;
21422 if (isa<CmpInst>(Val: VL0) && LHS->getType() != RHS->getType()) {
21423 if (cast<VectorType>(Val: LHS->getType())
21424 ->getElementType()
21425 ->getIntegerBitWidth() < cast<VectorType>(Val: RHS->getType())
21426 ->getElementType()
21427 ->getIntegerBitWidth())
21428 CastTy = RHS->getType();
21429 else
21430 CastTy = LHS->getType();
21431 }
21432 if (LHS->getType() != CastTy)
21433 LHS = Builder.CreateIntCast(V: LHS, DestTy: CastTy, isSigned: GetOperandSignedness(0));
21434 if (RHS->getType() != CastTy)
21435 RHS = Builder.CreateIntCast(V: RHS, DestTy: CastTy, isSigned: GetOperandSignedness(1));
21436 }
21437
21438 Value *V0, *V1;
21439 if (Instruction::isBinaryOp(Opcode: E->getOpcode())) {
21440 V0 = Builder.CreateBinOp(
21441 Opc: static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
21442 V1 = Builder.CreateBinOp(
21443 Opc: static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
21444 } else if (auto *CI0 = dyn_cast<CmpInst>(Val: VL0)) {
21445 V0 = Builder.CreateCmp(Pred: CI0->getPredicate(), LHS, RHS);
21446 auto *AltCI = cast<CmpInst>(Val: E->getAltOp());
21447 CmpInst::Predicate AltPred = AltCI->getPredicate();
21448 V1 = Builder.CreateCmp(Pred: AltPred, LHS, RHS);
21449 } else {
21450 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
21451 unsigned SrcBWSz = DL->getTypeSizeInBits(
21452 Ty: cast<VectorType>(Val: LHS->getType())->getElementType());
21453 unsigned BWSz = DL->getTypeSizeInBits(Ty: ScalarTy);
21454 if (BWSz <= SrcBWSz) {
21455 if (BWSz < SrcBWSz)
21456 LHS = Builder.CreateIntCast(V: LHS, DestTy: VecTy, isSigned: It->second.first);
21457 assert(LHS->getType() == VecTy &&
21458 "Expected same type as operand.");
21459 if (auto *I = dyn_cast<Instruction>(Val: LHS))
21460 LHS = ::propagateMetadata(Inst: I, VL: E->Scalars);
21461 LHS = FinalShuffle(LHS, E);
21462 E->VectorizedValue = LHS;
21463 ++NumVectorInstructions;
21464 return LHS;
21465 }
21466 }
21467 V0 = Builder.CreateCast(
21468 Op: static_cast<Instruction::CastOps>(E->getOpcode()), V: LHS, DestTy: VecTy);
21469 V1 = Builder.CreateCast(
21470 Op: static_cast<Instruction::CastOps>(E->getAltOpcode()), V: LHS, DestTy: VecTy);
21471 }
21472 // Add V0 and V1 to later analysis to try to find and remove matching
21473 // instruction, if any.
21474 for (Value *V : {V0, V1}) {
21475 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21476 GatherShuffleExtractSeq.insert(X: I);
21477 CSEBlocks.insert(V: I->getParent());
21478 }
21479 }
21480
21481 // Create shuffle to take alternate operations from the vector.
21482 // Also, gather up main and alt scalar ops to propagate IR flags to
21483 // each vector operation.
21484 ValueList OpScalars, AltScalars;
21485 SmallVector<int> Mask;
21486 E->buildAltOpShuffleMask(
21487 IsAltOp: [E, this](Instruction *I) {
21488 assert(E->getMatchingMainOpOrAltOp(I) &&
21489 "Unexpected main/alternate opcode");
21490 return isAlternateInstruction(I, MainOp: E->getMainOp(), AltOp: E->getAltOp(),
21491 TLI: *TLI);
21492 },
21493 Mask, OpScalars: &OpScalars, AltScalars: &AltScalars);
21494
21495 propagateIRFlags(I: V0, VL: OpScalars, OpValue: E->getMainOp(), IncludeWrapFlags: It == MinBWs.end());
21496 propagateIRFlags(I: V1, VL: AltScalars, OpValue: E->getAltOp(), IncludeWrapFlags: It == MinBWs.end());
21497 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
21498 // Drop nuw flags for abs(sub(commutative), true).
21499 if (auto *I = dyn_cast<Instruction>(Val: Vec);
21500 I && Opcode == Instruction::Sub && !MinBWs.contains(Val: E) &&
21501 any_of(Range&: E->Scalars, P: [E](Value *V) {
21502 if (isa<PoisonValue>(Val: V))
21503 return false;
21504 if (E->hasCopyableElements() && E->isCopyableElement(V))
21505 return false;
21506 auto *IV = cast<Instruction>(Val: V);
21507 return IV->getOpcode() == Instruction::Sub && isCommutative(I: IV);
21508 }))
21509 I->setHasNoUnsignedWrap(/*b=*/false);
21510 };
21511 DropNuwFlag(V0, E->getOpcode());
21512 DropNuwFlag(V1, E->getAltOpcode());
21513
21514 if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
21515 assert(SLPReVec && "FixedVectorType is not expected.");
21516 transformScalarShuffleIndiciesToVector(VecTyNumElements: VecTy->getNumElements(), Mask);
21517 }
21518 V = Builder.CreateShuffleVector(V1: V0, V2: V1, Mask);
21519 if (auto *I = dyn_cast<Instruction>(Val: V)) {
21520 V = ::propagateMetadata(Inst: I, VL: E->Scalars);
21521 GatherShuffleExtractSeq.insert(X: I);
21522 CSEBlocks.insert(V: I->getParent());
21523 }
21524 }
21525
21526 E->VectorizedValue = V;
21527 ++NumVectorInstructions;
21528
21529 return V;
21530 }
21531 case TreeEntry::ReducedBitcast:
21532 case TreeEntry::ReducedBitcastBSwap: {
21533 assert(UserIgnoreList && "Expected reduction operations only.");
21534 setInsertPointAfterBundle(E);
21535 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
21536 ZExt->VectorizedValue = PoisonValue::get(T: getWidenedType(
21537 ScalarTy: ZExt->getMainOp()->getType(), VF: ZExt->getVectorFactor()));
21538 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
21539 Const->VectorizedValue = PoisonValue::get(T: getWidenedType(
21540 ScalarTy: Const->Scalars.front()->getType(), VF: Const->getVectorFactor()));
21541 Value *Op = vectorizeOperand(E: ZExt, NodeIdx: 0);
21542 auto *SrcType = IntegerType::get(
21543 C&: Op->getContext(),
21544 NumBits: DL->getTypeSizeInBits(Ty: cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy()) *
21545 E->getVectorFactor());
21546 auto *OrigScalarTy = ScalarTy;
21547 // Set the scalar type properly to avoid casting to the extending type.
21548 ScalarTy = cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy();
21549 Op = FinalShuffle(Op, E);
21550 auto *V = Builder.CreateBitCast(V: Op, DestTy: SrcType);
21551 ++NumVectorInstructions;
21552 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwap) {
21553 V = Builder.CreateUnaryIntrinsic(ID: Intrinsic::bswap, V);
21554 ++NumVectorInstructions;
21555 }
21556 if (SrcType != OrigScalarTy) {
21557 V = Builder.CreateIntCast(V, DestTy: OrigScalarTy, /*isSigned=*/false);
21558 ++NumVectorInstructions;
21559 }
21560 E->VectorizedValue = V;
21561 return V;
21562 }
21563 case TreeEntry::ReducedBitcastLoads:
21564 case TreeEntry::ReducedBitcastBSwapLoads: {
21565 assert(UserIgnoreList && "Expected reduction operations only.");
21566 TreeEntry *ZExt = getOperandEntry(E, /*Idx=*/0);
21567 TreeEntry *Load = getOperandEntry(E: ZExt, /*Idx=*/0);
21568 setInsertPointAfterBundle(Load);
21569 ZExt->VectorizedValue = PoisonValue::get(T: getWidenedType(
21570 ScalarTy: ZExt->getMainOp()->getType(), VF: ZExt->getVectorFactor()));
21571 TreeEntry *Const = getOperandEntry(E, /*Idx=*/1);
21572 Const->VectorizedValue = PoisonValue::get(T: getWidenedType(
21573 ScalarTy: Const->Scalars.front()->getType(), VF: Const->getVectorFactor()));
21574 Load->VectorizedValue = PoisonValue::get(T: getWidenedType(
21575 ScalarTy: Load->getMainOp()->getType(), VF: Load->getVectorFactor()));
21576 LoadInst *LI = cast<LoadInst>(Val: Load->getMainOp());
21577 Value *PO = LI->getPointerOperand();
21578 auto *SrcTy = IntegerType::get(
21579 C&: ScalarTy->getContext(),
21580 NumBits: DL->getTypeSizeInBits(Ty: cast<CastInst>(Val: ZExt->getMainOp())->getSrcTy()) *
21581 E->getVectorFactor());
21582 auto *OrigScalarTy = ScalarTy;
21583 ScalarTy = ZExt->getMainOp()->getType();
21584 Value *V = Builder.CreateAlignedLoad(Ty: SrcTy, Ptr: PO, Align: LI->getAlign());
21585 ++NumVectorInstructions;
21586 if (ShuffleOrOp == TreeEntry::ReducedBitcastBSwapLoads) {
21587 V = Builder.CreateUnaryIntrinsic(ID: Intrinsic::bswap, V);
21588 ++NumVectorInstructions;
21589 }
21590 if (SrcTy != OrigScalarTy) {
21591 V = Builder.CreateIntCast(V, DestTy: OrigScalarTy, /*isSigned=*/false);
21592 ++NumVectorInstructions;
21593 }
21594 E->VectorizedValue = V;
21595 return V;
21596 }
21597 case TreeEntry::ReducedCmpBitcast: {
21598 assert(UserIgnoreList && "Expected reduction operations only.");
21599 setInsertPointAfterBundle(E);
21600 TreeEntry *Op1TE = getOperandEntry(E, /*Idx=*/1);
21601 TreeEntry *Op2TE = getOperandEntry(E, /*Idx=*/2);
21602 Op1TE->VectorizedValue =
21603 PoisonValue::get(T: getWidenedType(ScalarTy, VF: Op1TE->getVectorFactor()));
21604 Op2TE->VectorizedValue =
21605 PoisonValue::get(T: getWidenedType(ScalarTy, VF: Op2TE->getVectorFactor()));
21606 Value *Cmp = vectorizeOperand(E, /*NodeIdx=*/0);
21607 // Set the scalar type properly to avoid casting to the extending type.
21608 auto *DstTy =
21609 IntegerType::getIntNTy(C&: ScalarTy->getContext(), N: E->getVectorFactor());
21610 auto *V = Builder.CreateBitCast(V: Cmp, DestTy: DstTy);
21611 ++NumVectorInstructions;
21612 if (DstTy != ScalarTy) {
21613 V = Builder.CreateIntCast(V, DestTy: ScalarTy, /*isSigned=*/false);
21614 ++NumVectorInstructions;
21615 }
21616 E->VectorizedValue = V;
21617 return V;
21618 }
21619 default:
21620 llvm_unreachable("unknown inst");
21621 }
21622 return nullptr;
21623}
21624
21625Value *BoUpSLP::vectorizeTree() {
21626 ExtraValueToDebugLocsMap ExternallyUsedValues;
21627 return vectorizeTree(ExternallyUsedValues);
21628}
21629
21630Value *BoUpSLP::vectorizeTree(
21631 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
21632 Instruction *ReductionRoot,
21633 ArrayRef<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
21634 VectorValuesAndScales) {
21635 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
21636 // need to rebuild it.
21637 EntryToLastInstruction.clear();
21638 // All blocks must be scheduled before any instructions are inserted.
21639 for (auto &BSIter : BlocksSchedules)
21640 scheduleBlock(R: *this, BS: BSIter.second.get());
21641 // Cache last instructions for the nodes to avoid side effects, which may
21642 // appear during vectorization, like extra uses, etc.
21643 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21644 // Need to generate insertion point for loads nodes of the bitcast/bswap
21645 // ops.
21646 if (TE->isGather() || DeletedNodes.contains(Ptr: TE.get()) ||
21647 (TE->State == TreeEntry::CombinedVectorize &&
21648 (TE->CombinedOp == TreeEntry::ReducedBitcast ||
21649 TE->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
21650 ((TE->CombinedOp == TreeEntry::ReducedBitcastLoads ||
21651 TE->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
21652 TE->CombinedOp == TreeEntry::ReducedCmpBitcast) &&
21653 (!TE->hasState() || TE->getOpcode() != Instruction::Load)))))
21654 continue;
21655 (void)getLastInstructionInBundle(E: TE.get());
21656 }
21657
21658 if (ReductionRoot)
21659 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
21660 IP: ReductionRoot->getIterator());
21661 else
21662 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
21663
21664 // Vectorize gather operands of the nodes with the external uses only.
21665 SmallVector<std::pair<TreeEntry *, Instruction *>> GatherEntries;
21666 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21667 if (DeletedNodes.contains(Ptr: TE.get()))
21668 continue;
21669 if (TE->isGather() && !TE->VectorizedValue && TE->UserTreeIndex.UserTE &&
21670 TE->UserTreeIndex.UserTE->hasState() &&
21671 TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize &&
21672 (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI ||
21673 TE->UserTreeIndex.UserTE->isAltShuffle()) &&
21674 !TE->UserTreeIndex.UserTE->hasCopyableElements() &&
21675 all_of(Range&: TE->UserTreeIndex.UserTE->Scalars,
21676 P: [](Value *V) { return isUsedOutsideBlock(V); })) {
21677 Instruction &LastInst =
21678 getLastInstructionInBundle(E: TE->UserTreeIndex.UserTE);
21679 GatherEntries.emplace_back(Args: TE.get(), Args: &LastInst);
21680 }
21681 }
21682 for (auto &Entry : GatherEntries) {
21683 IRBuilderBase::InsertPointGuard Guard(Builder);
21684 Builder.SetInsertPoint(Entry.second);
21685 Builder.SetCurrentDebugLocation(Entry.second->getDebugLoc());
21686 (void)vectorizeTree(E: Entry.first);
21687 }
21688 // Emit gathered loads first to emit better code for the users of those
21689 // gathered loads.
21690 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
21691 if (DeletedNodes.contains(Ptr: TE.get()))
21692 continue;
21693 if (GatheredLoadsEntriesFirst.has_value() &&
21694 TE->Idx >= *GatheredLoadsEntriesFirst && !TE->VectorizedValue &&
21695 (!TE->isGather() || TE->UserTreeIndex)) {
21696 assert((TE->UserTreeIndex ||
21697 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
21698 "Expected gathered load node.");
21699 (void)vectorizeTree(E: TE.get());
21700 }
21701 }
21702 (void)vectorizeTree(E: VectorizableTree[0].get());
21703 // Run through the list of postponed gathers and emit them, replacing the temp
21704 // emitted allocas with actual vector instructions.
21705 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
21706 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
21707 for (const TreeEntry *E : PostponedNodes) {
21708 auto *TE = const_cast<TreeEntry *>(E);
21709 auto *PrevVec = cast<Instruction>(Val&: TE->VectorizedValue);
21710 TE->VectorizedValue = nullptr;
21711 auto *UserI = cast<Instruction>(Val&: TE->UserTreeIndex.UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were some
    // unresolved dependencies at the moment the stub instruction was emitted.
    // If any of those dependencies turn out to be an operand of another PHI
    // coming from this same block, the position of the stub instruction
    // becomes invalid, because the source vector that is supposed to feed this
    // gather node was inserted at the end of the block [after the stub
    // instruction]. So we need to adjust the insertion point to the end of the
    // block again.
21720 if (isa<PHINode>(Val: UserI) ||
21721 (TE->UserTreeIndex.UserTE->hasState() &&
21722 TE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
21723 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI)) {
21724 // Insert before all users.
21725 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
21726 for (User *U : PrevVec->users()) {
21727 if (U == UserI)
21728 continue;
21729 auto *UI = dyn_cast<Instruction>(Val: U);
21730 if (!UI || isa<PHINode>(Val: UI) || UI->getParent() != InsertPt->getParent())
21731 continue;
21732 if (UI->comesBefore(Other: InsertPt))
21733 InsertPt = UI;
21734 }
21735 Builder.SetInsertPoint(InsertPt);
21736 } else {
21737 Builder.SetInsertPoint(PrevVec);
21738 }
21739 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
21740 Value *Vec = vectorizeTree(E: TE);
21741 if (auto *VecI = dyn_cast<Instruction>(Val: Vec);
21742 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
21743 Builder.GetInsertPoint()->comesBefore(Other: VecI))
21744 VecI->moveBeforePreserving(BB&: *Builder.GetInsertBlock(),
21745 I: Builder.GetInsertPoint());
21746 if (Vec->getType() != PrevVec->getType()) {
21747 assert(Vec->getType()->isIntOrIntVectorTy() &&
21748 PrevVec->getType()->isIntOrIntVectorTy() &&
21749 "Expected integer vector types only.");
21750 std::optional<bool> IsSigned;
21751 for (Value *V : TE->Scalars) {
21752 if (isVectorized(V)) {
21753 for (const TreeEntry *MNTE : getTreeEntries(V)) {
21754 auto It = MinBWs.find(Val: MNTE);
21755 if (It != MinBWs.end()) {
21756 IsSigned = IsSigned.value_or(u: false) || It->second.second;
21757 if (*IsSigned)
21758 break;
21759 }
21760 }
21761 if (IsSigned.value_or(u: false))
21762 break;
21763 // Scan through gather nodes.
21764 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(Val: V)) {
21765 auto It = MinBWs.find(Val: BVE);
21766 if (It != MinBWs.end()) {
21767 IsSigned = IsSigned.value_or(u: false) || It->second.second;
21768 if (*IsSigned)
21769 break;
21770 }
21771 }
21772 if (IsSigned.value_or(u: false))
21773 break;
21774 if (auto *EE = dyn_cast<ExtractElementInst>(Val: V)) {
21775 IsSigned =
21776 IsSigned.value_or(u: false) ||
21777 !isKnownNonNegative(V: EE->getVectorOperand(), SQ: SimplifyQuery(*DL));
21778 continue;
21779 }
21780 if (IsSigned.value_or(u: false))
21781 break;
21782 }
21783 }
21784 if (IsSigned.value_or(u: false)) {
21785 // Final attempt - check user node.
21786 auto It = MinBWs.find(Val: TE->UserTreeIndex.UserTE);
21787 if (It != MinBWs.end())
21788 IsSigned = It->second.second;
21789 }
21790 assert(IsSigned &&
21791 "Expected user node or perfect diamond match in MinBWs.");
21792 Vec = Builder.CreateIntCast(V: Vec, DestTy: PrevVec->getType(), isSigned: *IsSigned);
21793 }
21794 PrevVec->replaceAllUsesWith(V: Vec);
21795 PostponedValues.try_emplace(Key: Vec).first->second.push_back(Elt: TE);
21796 // Replace the stub vector node, if it was used before for one of the
21797 // buildvector nodes already.
21798 auto It = PostponedValues.find(Val: PrevVec);
21799 if (It != PostponedValues.end()) {
21800 for (TreeEntry *VTE : It->getSecond())
21801 VTE->VectorizedValue = Vec;
21802 }
21803 eraseInstruction(I: PrevVec);
21804 }
21805
21806 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
21807 << " values .\n");
21808
21809 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
21810 // Maps vector instruction to original insertelement instruction
21811 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
21812 // Maps extract Scalar to the corresponding extractelement instruction in the
21813 // basic block. Only one extractelement per block should be emitted.
21814 DenseMap<Value *, DenseMap<BasicBlock *, std::pair<Value *, Value *>>>
21815 ScalarToEEs;
21816 SmallDenseSet<Value *, 4> UsedInserts;
21817 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
21818 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
21819 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
21820 // Extract all of the elements with the external uses.
21821 for (const auto &ExternalUse : ExternalUses) {
21822 Value *Scalar = ExternalUse.Scalar;
21823 llvm::User *User = ExternalUse.User;
21824
21825 // Skip users that we already RAUW. This happens when one instruction
21826 // has multiple uses of the same value.
21827 if (User && !is_contained(Range: Scalar->users(), Element: User))
21828 continue;
21829 const TreeEntry *E = &ExternalUse.E;
21830 assert(E && "Invalid scalar");
21831 assert(!E->isGather() && "Extracting from a gather list");
21832 // Non-instruction pointers are not deleted, just skip them.
21833 if (E->getOpcode() == Instruction::GetElementPtr &&
21834 !isa<GetElementPtrInst>(Val: Scalar))
21835 continue;
21836
21837 Value *Vec = E->VectorizedValue;
21838 assert(Vec && "Can't find vectorizable value");
21839
21840 Value *Lane = Builder.getInt32(C: ExternalUse.Lane);
21841 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
21842 if (Scalar->getType() != Vec->getType()) {
21843 Value *Ex = nullptr;
21844 Value *ExV = nullptr;
21845 auto *Inst = dyn_cast<Instruction>(Val: Scalar);
21846 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Ptr: Inst);
21847 auto It = ScalarToEEs.find(Val: Scalar);
21848 if (It != ScalarToEEs.end()) {
21849 // No need to emit many extracts, just move the only one in the
21850 // current block.
21851 auto EEIt = It->second.find(Val: ReplaceInst ? Inst->getParent()
21852 : Builder.GetInsertBlock());
21853 if (EEIt != It->second.end()) {
21854 Value *PrevV = EEIt->second.first;
21855 if (auto *I = dyn_cast<Instruction>(Val: PrevV);
21856 I && !ReplaceInst &&
21857 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
21858 Builder.GetInsertPoint()->comesBefore(Other: I)) {
21859 I->moveBefore(BB&: *Builder.GetInsertPoint()->getParent(),
21860 I: Builder.GetInsertPoint());
21861 if (auto *CI = dyn_cast<Instruction>(Val: EEIt->second.second))
21862 CI->moveAfter(MovePos: I);
21863 }
21864 Ex = PrevV;
21865 ExV = EEIt->second.second ? EEIt->second.second : Ex;
21866 }
21867 }
21868 if (!Ex) {
21869 // "Reuse" the existing extract to improve final codegen.
21870 if (ReplaceInst) {
21871 // Leave the instruction as is, if it cheaper extracts and all
21872 // operands are scalar.
21873 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Inst)) {
21874 IgnoredExtracts.insert(V: EE);
21875 Ex = EE;
21876 } else {
21877 auto *CloneInst = Inst->clone();
21878 CloneInst->insertBefore(InsertPos: Inst->getIterator());
21879 if (Inst->hasName())
21880 CloneInst->takeName(V: Inst);
21881 Ex = CloneInst;
21882 }
21883 } else if (auto *ES = dyn_cast<ExtractElementInst>(Val: Scalar);
21884 ES && isa<Instruction>(Val: Vec)) {
21885 Value *V = ES->getVectorOperand();
21886 auto *IVec = cast<Instruction>(Val: Vec);
21887 if (ArrayRef<TreeEntry *> ETEs = getTreeEntries(V); !ETEs.empty())
21888 V = ETEs.front()->VectorizedValue;
21889 if (auto *IV = dyn_cast<Instruction>(Val: V);
21890 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
21891 IV->comesBefore(Other: IVec))
21892 Ex = Builder.CreateExtractElement(Vec: V, Idx: ES->getIndexOperand());
21893 else
21894 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
21895 } else if (auto *VecTy =
21896 dyn_cast<FixedVectorType>(Val: Scalar->getType())) {
21897 assert(SLPReVec && "FixedVectorType is not expected.");
21898 unsigned VecTyNumElements = VecTy->getNumElements();
21899 // When REVEC is enabled, we need to extract a vector.
21900 // Note: The element size of Scalar may be different from the
21901 // element size of Vec.
21902 Ex = createExtractVector(Builder, Vec, SubVecVF: VecTyNumElements,
21903 Index: ExternalUse.Lane * VecTyNumElements);
21904 } else {
21905 Ex = Builder.CreateExtractElement(Vec, Idx: Lane);
21906 }
21907 // If necessary, sign-extend or zero-extend ScalarRoot
21908 // to the larger type.
21909 ExV = Ex;
21910 if (Scalar->getType() != Ex->getType())
21911 ExV = Builder.CreateIntCast(
21912 V: Ex, DestTy: Scalar->getType(),
21913 isSigned: !isKnownNonNegative(V: Scalar, SQ: SimplifyQuery(*DL)));
21914 auto *I = dyn_cast<Instruction>(Val: Ex);
21915 ScalarToEEs[Scalar].try_emplace(Key: I ? I->getParent()
21916 : &F->getEntryBlock(),
21917 Args: std::make_pair(x&: Ex, y&: ExV));
21918 }
21919 // The then branch of the previous if may produce constants, since 0
21920 // operand might be a constant.
21921 if (auto *ExI = dyn_cast<Instruction>(Val: Ex);
21922 ExI && !isa<PHINode>(Val: ExI) && !mayHaveNonDefUseDependency(I: *ExI)) {
21923 GatherShuffleExtractSeq.insert(X: ExI);
21924 CSEBlocks.insert(V: ExI->getParent());
21925 }
21926 return ExV;
21927 }
21928 assert(isa<FixedVectorType>(Scalar->getType()) &&
21929 isa<InsertElementInst>(Scalar) &&
21930 "In-tree scalar of vector type is not insertelement?");
21931 auto *IE = cast<InsertElementInst>(Val: Scalar);
21932 VectorToInsertElement.try_emplace(Key: Vec, Args&: IE);
21933 return Vec;
21934 };
21935 // If User == nullptr, the Scalar remains as scalar in vectorized
21936 // instructions or is used as extra arg. Generate ExtractElement instruction
21937 // and update the record for this scalar in ExternallyUsedValues.
21938 if (!User) {
21939 if (!ScalarsWithNullptrUser.insert(V: Scalar).second)
21940 continue;
21941 assert(
21942 (ExternallyUsedValues.count(Scalar) ||
21943 ExternalUsesWithNonUsers.count(Scalar) ||
21944 ExternalUsesAsOriginalScalar.contains(Scalar) ||
21945 any_of(
21946 Scalar->users(),
21947 [&, TTI = TTI](llvm::User *U) {
21948 if (ExternalUsesAsOriginalScalar.contains(U))
21949 return true;
21950 ArrayRef<TreeEntry *> UseEntries = getTreeEntries(U);
21951 return !UseEntries.empty() &&
21952 (E->State == TreeEntry::Vectorize ||
21953 E->State == TreeEntry::StridedVectorize ||
21954 E->State == TreeEntry::CompressVectorize) &&
21955 any_of(UseEntries, [&, TTI = TTI](TreeEntry *UseEntry) {
21956 return (UseEntry->State == TreeEntry::Vectorize ||
21957 UseEntry->State ==
21958 TreeEntry::StridedVectorize ||
21959 UseEntry->State ==
21960 TreeEntry::CompressVectorize) &&
21961 doesInTreeUserNeedToExtract(
21962 Scalar, getRootEntryInstruction(*UseEntry),
21963 TLI, TTI);
21964 });
21965 })) &&
21966 "Scalar with nullptr User must be registered in "
21967 "ExternallyUsedValues map or remain as scalar in vectorized "
21968 "instructions");
21969 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
21970 if (auto *PHI = dyn_cast<PHINode>(Val: VecI)) {
21971 if (PHI->getParent()->isLandingPad())
21972 Builder.SetInsertPoint(
21973 TheBB: PHI->getParent(),
21974 IP: std::next(
21975 x: PHI->getParent()->getLandingPadInst()->getIterator()));
21976 else
21977 Builder.SetInsertPoint(TheBB: PHI->getParent(),
21978 IP: PHI->getParent()->getFirstNonPHIIt());
21979 } else {
21980 Builder.SetInsertPoint(TheBB: VecI->getParent(),
21981 IP: std::next(x: VecI->getIterator()));
21982 }
21983 } else {
21984 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
21985 }
21986 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
21987 // Required to update internally referenced instructions.
21988 if (Scalar != NewInst) {
21989 assert((!isa<ExtractElementInst>(Scalar) ||
21990 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
21991 "Extractelements should not be replaced.");
21992 Scalar->replaceAllUsesWith(V: NewInst);
21993 }
21994 continue;
21995 }
21996
21997 if (auto *VU = dyn_cast<InsertElementInst>(Val: User);
21998 VU && VU->getOperand(i_nocapture: 1) == Scalar) {
21999 // Skip if the scalar is another vector op or Vec is not an instruction.
22000 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Val: Vec)) {
22001 if (auto *FTy = dyn_cast<FixedVectorType>(Val: User->getType())) {
22002 if (!UsedInserts.insert(V: VU).second)
22003 continue;
22004 // Need to use original vector, if the root is truncated.
22005 auto BWIt = MinBWs.find(Val: E);
22006 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
22007 auto *ScalarTy = FTy->getElementType();
22008 auto Key = std::make_pair(x&: Vec, y&: ScalarTy);
22009 auto VecIt = VectorCasts.find(Val: Key);
22010 if (VecIt == VectorCasts.end()) {
22011 IRBuilderBase::InsertPointGuard Guard(Builder);
22012 if (auto *IVec = dyn_cast<PHINode>(Val: Vec)) {
22013 if (IVec->getParent()->isLandingPad())
22014 Builder.SetInsertPoint(TheBB: IVec->getParent(),
22015 IP: std::next(x: IVec->getParent()
22016 ->getLandingPadInst()
22017 ->getIterator()));
22018 else
22019 Builder.SetInsertPoint(
22020 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
22021 } else if (auto *IVec = dyn_cast<Instruction>(Val: Vec)) {
22022 Builder.SetInsertPoint(IVec->getNextNode());
22023 }
22024 Vec = Builder.CreateIntCast(
22025 V: Vec,
22026 DestTy: getWidenedType(
22027 ScalarTy,
22028 VF: cast<FixedVectorType>(Val: Vec->getType())->getNumElements()),
22029 isSigned: BWIt->second.second);
22030 VectorCasts.try_emplace(Key, Args&: Vec);
22031 } else {
22032 Vec = VecIt->second;
22033 }
22034 }
22035
22036 std::optional<unsigned> InsertIdx = getElementIndex(Inst: VU);
22037 if (InsertIdx) {
22038 auto *It = find_if(
22039 Range&: ShuffledInserts, P: [VU](const ShuffledInsertData<Value *> &Data) {
22040 // Checks if 2 insertelements are from the same buildvector.
22041 InsertElementInst *VecInsert = Data.InsertElements.front();
22042 return areTwoInsertFromSameBuildVector(
22043 VU, V: VecInsert,
22044 GetBaseOperand: [](InsertElementInst *II) { return II->getOperand(i_nocapture: 0); });
22045 });
22046 unsigned Idx = *InsertIdx;
22047 if (It == ShuffledInserts.end()) {
22048 (void)ShuffledInserts.emplace_back();
22049 It = std::next(x: ShuffledInserts.begin(),
22050 n: ShuffledInserts.size() - 1);
22051 }
22052 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
22053 if (Mask.empty())
22054 Mask.assign(NumElts: FTy->getNumElements(), Elt: PoisonMaskElem);
22055 Mask[Idx] = ExternalUse.Lane;
22056 It->InsertElements.push_back(Elt: cast<InsertElementInst>(Val: User));
22057 continue;
22058 }
22059 }
22060 }
22061 }
22062
22063 // Generate extracts for out-of-tree users.
22064 // Find the insertion point for the extractelement lane.
22065 if (auto *VecI = dyn_cast<Instruction>(Val: Vec)) {
22066 if (PHINode *PH = dyn_cast<PHINode>(Val: User)) {
22067 for (unsigned I : seq<unsigned>(Begin: 0, End: PH->getNumIncomingValues())) {
22068 if (PH->getIncomingValue(i: I) == Scalar) {
22069 Instruction *IncomingTerminator =
22070 PH->getIncomingBlock(i: I)->getTerminator();
22071 if (isa<CatchSwitchInst>(Val: IncomingTerminator)) {
22072 Builder.SetInsertPoint(TheBB: VecI->getParent(),
22073 IP: std::next(x: VecI->getIterator()));
22074 } else {
22075 Builder.SetInsertPoint(PH->getIncomingBlock(i: I)->getTerminator());
22076 }
22077 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22078 PH->setOperand(i_nocapture: I, Val_nocapture: NewInst);
22079 }
22080 }
22081 } else {
22082 Builder.SetInsertPoint(cast<Instruction>(Val: User));
22083 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22084 User->replaceUsesOfWith(From: Scalar, To: NewInst);
22085 }
22086 } else {
22087 Builder.SetInsertPoint(TheBB: &F->getEntryBlock(), IP: F->getEntryBlock().begin());
22088 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
22089 User->replaceUsesOfWith(From: Scalar, To: NewInst);
22090 }
22091
22092 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
22093 }
22094
22095 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
22096 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
22097 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
22098 int VF = cast<FixedVectorType>(Val: V1->getType())->getNumElements();
22099 for (int I = 0, E = Mask.size(); I < E; ++I) {
22100 if (Mask[I] < VF)
22101 CombinedMask1[I] = Mask[I];
22102 else
22103 CombinedMask2[I] = Mask[I] - VF;
22104 }
22105 ShuffleInstructionBuilder ShuffleBuilder(
22106 cast<VectorType>(Val: V1->getType())->getElementType(), Builder, *this);
22107 ShuffleBuilder.add(V1, Mask: CombinedMask1);
22108 if (V2)
22109 ShuffleBuilder.add(V1: V2, Mask: CombinedMask2);
22110 return ShuffleBuilder.finalize(ExtMask: {}, SubVectors: {}, SubVectorsMask: {});
22111 };
22112
22113 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
22114 bool ForSingleMask) {
22115 unsigned VF = Mask.size();
22116 unsigned VecVF = cast<FixedVectorType>(Val: Vec->getType())->getNumElements();
22117 if (VF != VecVF) {
22118 if (any_of(Range&: Mask, P: [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
22119 Vec = CreateShuffle(Vec, nullptr, Mask);
22120 return std::make_pair(x&: Vec, y: true);
22121 }
22122 if (!ForSingleMask) {
22123 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
22124 for (unsigned I = 0; I < VF; ++I) {
22125 if (Mask[I] != PoisonMaskElem)
22126 ResizeMask[Mask[I]] = Mask[I];
22127 }
22128 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
22129 }
22130 }
22131
22132 return std::make_pair(x&: Vec, y: false);
22133 };
22134 // Perform shuffling of the vectorize tree entries for better handling of
22135 // external extracts.
22136 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
22137 // Find the first and the last instruction in the list of insertelements.
22138 sort(C&: ShuffledInserts[I].InsertElements, Comp: isFirstInsertElement);
22139 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
22140 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
22141 Builder.SetInsertPoint(LastInsert);
22142 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
22143 Value *NewInst = performExtractsShuffleAction<Value>(
22144 ShuffleMask: MutableArrayRef(Vector.data(), Vector.size()),
22145 Base: FirstInsert->getOperand(i_nocapture: 0),
22146 GetVF: [](Value *Vec) {
22147 return cast<VectorType>(Val: Vec->getType())
22148 ->getElementCount()
22149 .getKnownMinValue();
22150 },
22151 ResizeAction: ResizeToVF,
22152 Action: [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
22153 ArrayRef<Value *> Vals) {
22154 assert((Vals.size() == 1 || Vals.size() == 2) &&
22155 "Expected exactly 1 or 2 input values.");
22156 if (Vals.size() == 1) {
22157 // Do not create shuffle if the mask is a simple identity
22158 // non-resizing mask.
22159 if (Mask.size() != cast<FixedVectorType>(Val: Vals.front()->getType())
22160 ->getNumElements() ||
22161 !ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts: Mask.size()))
22162 return CreateShuffle(Vals.front(), nullptr, Mask);
22163 return Vals.front();
22164 }
22165 return CreateShuffle(Vals.front() ? Vals.front()
22166 : FirstInsert->getOperand(i_nocapture: 0),
22167 Vals.back(), Mask);
22168 });
22169 auto It = ShuffledInserts[I].InsertElements.rbegin();
22170 // Rebuild buildvector chain.
22171 InsertElementInst *II = nullptr;
22172 if (It != ShuffledInserts[I].InsertElements.rend())
22173 II = *It;
22174 SmallVector<Instruction *> Inserts;
22175 while (It != ShuffledInserts[I].InsertElements.rend()) {
22176 assert(II && "Must be an insertelement instruction.");
22177 if (*It == II)
22178 ++It;
22179 else
22180 Inserts.push_back(Elt: cast<Instruction>(Val: II));
22181 II = dyn_cast<InsertElementInst>(Val: II->getOperand(i_nocapture: 0));
22182 }
22183 for (Instruction *II : reverse(C&: Inserts)) {
22184 II->replaceUsesOfWith(From: II->getOperand(i: 0), To: NewInst);
22185 if (auto *NewI = dyn_cast<Instruction>(Val: NewInst))
22186 if (II->getParent() == NewI->getParent() && II->comesBefore(Other: NewI))
22187 II->moveAfter(MovePos: NewI);
22188 NewInst = II;
22189 }
22190 LastInsert->replaceAllUsesWith(V: NewInst);
22191 for (InsertElementInst *IE : reverse(C&: ShuffledInserts[I].InsertElements)) {
22192 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 0),
22193 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 0)->getType()));
22194 IE->replaceUsesOfWith(From: IE->getOperand(i_nocapture: 1),
22195 To: PoisonValue::get(T: IE->getOperand(i_nocapture: 1)->getType()));
22196 eraseInstruction(I: IE);
22197 }
22198 CSEBlocks.insert(V: LastInsert->getParent());
22199 }
22200
22201 SmallVector<Instruction *> RemovedInsts;
22202 // For each vectorized value:
22203 for (auto &TEPtr : VectorizableTree) {
22204 TreeEntry *Entry = TEPtr.get();
22205
22206 // No need to handle users of gathered values.
22207 if (Entry->isGather() || Entry->State == TreeEntry::SplitVectorize ||
22208 DeletedNodes.contains(Ptr: Entry) ||
22209 TransformedToGatherNodes.contains(Val: Entry))
22210 continue;
22211
22212 if (Entry->CombinedOp == TreeEntry::ReducedBitcast ||
22213 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwap ||
22214 Entry->CombinedOp == TreeEntry::ReducedBitcastLoads ||
22215 Entry->CombinedOp == TreeEntry::ReducedBitcastBSwapLoads ||
22216 Entry->CombinedOp == TreeEntry::ReducedCmpBitcast) {
22217 // Skip constant node
22218 if (!Entry->hasState()) {
22219 assert(allConstant(Entry->Scalars) && "Expected constants only.");
22220 continue;
22221 }
22222 for (Value *Scalar : Entry->Scalars) {
22223 auto *I = dyn_cast<Instruction>(Val: Scalar);
22224
22225 if (!I || Entry->isCopyableElement(V: I))
22226 continue;
22227 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *I << ".\n");
22228 RemovedInsts.push_back(Elt: I);
22229 }
22230 continue;
22231 }
22232
22233 assert(Entry->VectorizedValue && "Can't find vectorizable value");
22234
22235 // For each lane:
22236 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
22237 Value *Scalar = Entry->Scalars[Lane];
22238
22239 if (Entry->getOpcode() == Instruction::GetElementPtr &&
22240 !isa<GetElementPtrInst>(Val: Scalar))
22241 continue;
22242 if (auto *EE = dyn_cast<ExtractElementInst>(Val: Scalar);
22243 EE && IgnoredExtracts.contains(V: EE))
22244 continue;
22245 if (!isa<Instruction>(Val: Scalar) || Entry->isCopyableElement(V: Scalar))
22246 continue;
22247#ifndef NDEBUG
22248 Type *Ty = Scalar->getType();
22249 if (!Ty->isVoidTy()) {
22250 for (User *U : Scalar->users()) {
22251 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
22252
22253 // It is legal to delete users in the ignorelist.
22254 assert((isVectorized(U) ||
22255 (UserIgnoreList && UserIgnoreList->contains(U)) ||
22256 (isa_and_nonnull<Instruction>(U) &&
22257 isDeleted(cast<Instruction>(U)))) &&
22258 "Deleting out-of-tree value");
22259 }
22260 }
22261#endif
22262 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
22263 auto *I = cast<Instruction>(Val: Scalar);
22264 RemovedInsts.push_back(Elt: I);
22265 }
22266 }
22267
22268 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
22269 // new vector instruction.
22270 if (auto *V = dyn_cast<Instruction>(Val&: VectorizableTree[0]->VectorizedValue))
22271 V->mergeDIAssignID(SourceInstructions: RemovedInsts);
22272
22273 // Clear up reduction references, if any.
22274 if (UserIgnoreList) {
22275 for (Instruction *I : RemovedInsts) {
22276 const TreeEntry *IE = getTreeEntries(V: I).front();
22277 if (ArrayRef<TreeEntry *> SplitEntries = getSplitTreeEntries(V: I);
22278 !SplitEntries.empty() && SplitEntries.front()->Idx < IE->Idx)
22279 IE = SplitEntries.front();
22280 if (IE->Idx != 0 &&
22281 !(VectorizableTree.front()->isGather() && IE->UserTreeIndex &&
22282 (ValueToGatherNodes.lookup(Val: I).contains(
22283 key: VectorizableTree.front().get()) ||
22284 (IE->UserTreeIndex.UserTE == VectorizableTree.front().get() &&
22285 IE->UserTreeIndex.EdgeIdx == UINT_MAX))) &&
22286 !(VectorizableTree.front()->State == TreeEntry::SplitVectorize &&
22287 IE->UserTreeIndex &&
22288 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
22289 !(GatheredLoadsEntriesFirst.has_value() &&
22290 IE->Idx >= *GatheredLoadsEntriesFirst &&
22291 VectorizableTree.front()->isGather() &&
22292 is_contained(Range&: VectorizableTree.front()->Scalars, Element: I)) &&
22293 !(!VectorizableTree.front()->isGather() &&
22294 VectorizableTree.front()->isCopyableElement(V: I)))
22295 continue;
22296 SmallVector<SelectInst *> LogicalOpSelects;
22297 I->replaceUsesWithIf(New: PoisonValue::get(T: I->getType()), ShouldReplace: [&](Use &U) {
22298 // Do not replace condition of the logical op in form select <cond>.
22299 bool IsPoisoningLogicalOp = isa<SelectInst>(Val: U.getUser()) &&
22300 (match(V: U.getUser(), P: m_LogicalAnd()) ||
22301 match(V: U.getUser(), P: m_LogicalOr())) &&
22302 U.getOperandNo() == 0;
22303 if (IsPoisoningLogicalOp) {
22304 LogicalOpSelects.push_back(Elt: cast<SelectInst>(Val: U.getUser()));
22305 return false;
22306 }
22307 return UserIgnoreList->contains(V: U.getUser());
22308 });
22309 // Replace conditions of the poisoning logical ops with the non-poison
22310 // constant value.
22311 for (SelectInst *SI : LogicalOpSelects)
22312 SI->setCondition(Constant::getNullValue(Ty: SI->getCondition()->getType()));
22313 }
22314 }
22315 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
22316 // cache correctness.
22317 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
22318 // - instructions are not deleted until later.
22319 removeInstructionsAndOperands(DeadVals: ArrayRef(RemovedInsts), VectorValuesAndScales);
22320
22321 Builder.ClearInsertionPoint();
22322 InstrElementSize.clear();
22323
22324 const TreeEntry &RootTE = *VectorizableTree.front();
22325 Value *Vec = RootTE.VectorizedValue;
22326 if (auto It = MinBWs.find(Val: &RootTE); ReductionBitWidth != 0 &&
22327 It != MinBWs.end() &&
22328 ReductionBitWidth != It->second.first) {
22329 IRBuilder<>::InsertPointGuard Guard(Builder);
22330 Builder.SetInsertPoint(TheBB: ReductionRoot->getParent(),
22331 IP: ReductionRoot->getIterator());
22332 if (isReducedBitcastRoot() || isReducedCmpBitcastRoot()) {
22333 Vec = Builder.CreateIntCast(V: Vec, DestTy: Builder.getIntNTy(N: ReductionBitWidth),
22334 isSigned: It->second.second);
22335
22336 } else {
22337 Vec = Builder.CreateIntCast(
22338 V: Vec,
22339 DestTy: VectorType::get(ElementType: Builder.getIntNTy(N: ReductionBitWidth),
22340 EC: cast<VectorType>(Val: Vec->getType())->getElementCount()),
22341 isSigned: It->second.second);
22342 }
22343 }
22344 return Vec;
22345}
22346
// Post-process the emitted gather/shuffle/extract sequences in two phases:
// (1) hoist loop-invariant gather instructions into loop preheaders (a
// lightweight LICM), then (2) CSE the remaining sequences across the blocks
// recorded in CSEBlocks, walking blocks in dominator order so an earlier
// (dominating) copy can absorb a later one. Shuffles that differ only by
// poison lanes in their masks are merged into the more-defined copy.
void BoUpSLP::optimizeGatherSequence() {
  LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
                    << " gather sequences instructions.\n");
  // LICM InsertElementInst sequences.
  for (Instruction *I : GatherShuffleExtractSeq) {
    if (isDeleted(I))
      continue;

    // Check if this block is inside a loop.
    Loop *L = LI->getLoopFor(BB: I->getParent());
    if (!L)
      continue;

    // Check if it has a preheader.
    BasicBlock *PreHeader = L->getLoopPreheader();
    if (!PreHeader)
      continue;

    // If the vector or the element that we insert into it are
    // instructions that are defined in this basic block then we can't
    // hoist this instruction.
    if (any_of(Range: I->operands(), P: [L](Value *V) {
          auto *OpI = dyn_cast<Instruction>(Val: V);
          return OpI && L->contains(Inst: OpI);
        }))
      continue;

    // We can hoist this instruction. Move it to the pre-header.
    I->moveBefore(InsertPos: PreHeader->getTerminator()->getIterator());
    // Record the preheader so phase (2) also scans it for CSE candidates.
    CSEBlocks.insert(V: PreHeader);
  }

  // Make a list of all reachable blocks in our CSE queue.
  SmallVector<const DomTreeNode *, 8> CSEWorkList;
  CSEWorkList.reserve(N: CSEBlocks.size());
  for (BasicBlock *BB : CSEBlocks)
    if (DomTreeNode *N = DT->getNode(BB)) {
      assert(DT->isReachableFromEntry(N));
      CSEWorkList.push_back(Elt: N);
    }

  // Sort blocks by domination. This ensures we visit a block after all blocks
  // dominating it are visited.
  llvm::sort(C&: CSEWorkList, Comp: [](const DomTreeNode *A, const DomTreeNode *B) {
    assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    return A->getDFSNumIn() < B->getDFSNumIn();
  });

  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector operands
  // and its mask indices are the same as in the first one or undefs. E.g.
  // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
  // poison, <0, 0, 0, 0>.
  // Returns true if I1 may be replaced by I2; for shuffles, NewMask is filled
  // with the combined (most-defined) mask that the surviving copy should use.
  auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
                                                Instruction *I2,
                                                SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(Val: I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(Val: I2);
    if (!SI1 || !SI2)
      // Non-shuffles only merge if they are exact duplicates.
      return I1->isIdenticalTo(I: I2);
    if (SI1->isIdenticalTo(I: SI2))
      return true;
    // Mask-based merging requires identical vector operands.
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(i_nocapture: I) != SI2->getOperand(i_nocapture: I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(in_start: SI2->getShuffleMask().begin(), in_end: SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
    for (int I = 0, E = NewMask.size(); I < E; ++I) {
      if (SM1[I] == PoisonMaskElem)
        ++LastUndefsCnt;
      else
        LastUndefsCnt = 0;
      // A lane defined in both masks must agree, otherwise no merge.
      if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
          NewMask[I] != SM1[I])
        return false;
      // Fill poison lanes of the surviving mask from the other mask.
      if (NewMask[I] == PoisonMaskElem)
        NewMask[I] = SM1[I];
    }
    // Check if the last undefs actually change the final number of used vector
    // registers.
    return SM1.size() - LastUndefsCnt > 1 &&
           ::getNumberOfParts(TTI: *TTI, VecTy: SI1->getType()) ==
               ::getNumberOfParts(
                   TTI: *TTI, VecTy: getWidenedType(ScalarTy: SI1->getType()->getElementType(),
                                         VF: SM1.size() - LastUndefsCnt));
  };
  // Perform O(N^2) search over the gather/shuffle sequences and merge identical
  // instructions. TODO: We can further optimize this scan if we split the
  // instructions into different buckets based on the insert lane.
  SmallVector<Instruction *, 16> Visited;
  for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
    assert(*I &&
           (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
           "Worklist not sorted properly!");
    BasicBlock *BB = (*I)->getBlock();
    // For all instructions in blocks containing gather sequences:
    for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
      if (isDeleted(I: &In))
        continue;
      if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(Val: &In) &&
          !GatherShuffleExtractSeq.contains(key: &In))
        continue;

      // Check if we can replace this instruction with any of the
      // visited instructions.
      bool Replaced = false;
      for (Instruction *&V : Visited) {
        SmallVector<int> NewMask;
        // Case 1: a previously visited copy V dominates In and is at least as
        // defined, so In can be replaced by V (after widening V's mask).
        if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
            DT->dominates(A: V->getParent(), B: In.getParent())) {
          In.replaceAllUsesWith(V);
          eraseInstruction(I: &In);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: V))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          Replaced = true;
          break;
        }
        // Case 2 (shuffles only): In dominates V and is more defined; keep In
        // (moved right after V to preserve operand dominance) and drop V,
        // updating the Visited slot in place so later scans see In.
        if (isa<ShuffleVectorInst>(Val: In) && isa<ShuffleVectorInst>(Val: V) &&
            GatherShuffleExtractSeq.contains(key: V) &&
            IsIdenticalOrLessDefined(V, &In, NewMask) &&
            DT->dominates(A: In.getParent(), B: V->getParent())) {
          In.moveAfter(MovePos: V);
          V->replaceAllUsesWith(V: &In);
          eraseInstruction(I: V);
          if (auto *SI = dyn_cast<ShuffleVectorInst>(Val: &In))
            if (!NewMask.empty())
              SI->setShuffleMask(NewMask);
          V = &In;
          Replaced = true;
          break;
        }
      }
      if (!Replaced) {
        assert(!is_contained(Visited, &In));
        Visited.push_back(Elt: &In);
      }
    }
  }
  // The bookkeeping sets are only valid for one vectorization round.
  CSEBlocks.clear();
  GatherShuffleExtractSeq.clear();
}
22496
22497BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle(
22498 ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) {
22499 auto &BundlePtr =
22500 ScheduledBundlesList.emplace_back(Args: std::make_unique<ScheduleBundle>());
22501 for (Value *V : VL) {
22502 if (S.isNonSchedulable(V))
22503 continue;
22504 auto *I = cast<Instruction>(Val: V);
22505 if (S.isCopyableElement(V)) {
22506 // Add a copyable element model.
22507 ScheduleCopyableData &SD =
22508 addScheduleCopyableData(EI, I, SchedulingRegionID, Bundle&: *BundlePtr);
22509 // Group the instructions to a bundle.
22510 BundlePtr->add(SD: &SD);
22511 continue;
22512 }
22513 ScheduleData *BundleMember = getScheduleData(V);
22514 assert(BundleMember && "no ScheduleData for bundle member "
22515 "(maybe not in same basic block)");
22516 // Group the instructions to a bundle.
22517 BundlePtr->add(SD: BundleMember);
22518 ScheduledBundles.try_emplace(Key: I).first->getSecond().push_back(
22519 Elt: BundlePtr.get());
22520 }
22521 assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle");
22522 return *BundlePtr;
22523}
22524
22525// Groups the instructions to a bundle (which is then a single scheduling entity)
22526// and schedules instructions until the bundle gets ready.
22527std::optional<BoUpSLP::ScheduleBundle *>
22528BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
22529 const InstructionsState &S,
22530 const EdgeInfo &EI) {
22531 // No need to schedule PHIs, insertelement, extractelement and extractvalue
22532 // instructions.
22533 if (isa<PHINode>(Val: S.getMainOp()) ||
22534 isVectorLikeInstWithConstOps(V: S.getMainOp()))
22535 return nullptr;
22536 // If the parent node is non-schedulable and the current node is copyable, and
22537 // any of parent instructions are used outside several basic blocks or in
22538 // bin-op node - cancel scheduling, it may cause wrong def-use deps in
22539 // analysis, leading to a crash.
22540 // Non-scheduled nodes may not have related ScheduleData model, which may lead
22541 // to a skipped dep analysis.
22542 bool HasCopyables = S.areInstructionsWithCopyableElements();
22543 bool DoesNotRequireScheduling =
22544 (!HasCopyables && doesNotNeedToSchedule(VL)) ||
22545 all_of(Range&: VL, P: [&](Value *V) { return S.isNonSchedulable(V); });
22546 if (!DoesNotRequireScheduling && S.areInstructionsWithCopyableElements() &&
22547 EI && EI.UserTE->hasState() && EI.UserTE->doesNotNeedToSchedule() &&
22548 EI.UserTE->getOpcode() != Instruction::PHI &&
22549 EI.UserTE->getOpcode() != Instruction::InsertElement &&
22550 any_of(Range&: EI.UserTE->Scalars, P: [](Value *V) {
22551 auto *I = dyn_cast<Instruction>(Val: V);
22552 if (!I)
22553 return false;
22554 for (User *U : I->users()) {
22555 auto *UI = cast<Instruction>(Val: U);
22556 if (isa<BinaryOperator>(Val: UI))
22557 return true;
22558 }
22559 return false;
22560 }))
22561 return std::nullopt;
22562 if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() &&
22563 EI.UserTE->hasCopyableElements() &&
22564 EI.UserTE->getMainOp()->getParent() == S.getMainOp()->getParent() &&
22565 all_of(Range&: VL, P: [&](Value *V) {
22566 if (S.isCopyableElement(V))
22567 return true;
22568 return isUsedOutsideBlock(V);
22569 }))
22570 return std::nullopt;
22571 // If any instruction is used outside block only and its operand is placed
22572 // immediately before it, do not schedule, it may cause wrong def-use chain.
22573 if (S.areInstructionsWithCopyableElements() && any_of(Range&: VL, P: [&](Value *V) {
22574 if (isa<PoisonValue>(Val: V) || S.isCopyableElement(V))
22575 return false;
22576 if (isUsedOutsideBlock(V)) {
22577 for (Value *Op : cast<Instruction>(Val: V)->operands()) {
22578 auto *I = dyn_cast<Instruction>(Val: Op);
22579 if (!I)
22580 continue;
22581 return SLP->isVectorized(V: I) && I->getNextNode() == V;
22582 }
22583 }
22584 return false;
22585 }))
22586 return std::nullopt;
22587 if (S.areInstructionsWithCopyableElements() && EI) {
22588 bool IsNonSchedulableWithParentPhiNode =
22589 EI.UserTE->doesNotNeedToSchedule() && EI.UserTE->UserTreeIndex &&
22590 EI.UserTE->UserTreeIndex.UserTE->hasState() &&
22591 EI.UserTE->UserTreeIndex.UserTE->State != TreeEntry::SplitVectorize &&
22592 EI.UserTE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22593 if (IsNonSchedulableWithParentPhiNode) {
22594 SmallSet<std::pair<Value *, Value *>, 4> Values;
22595 for (const auto [Idx, V] :
22596 enumerate(First&: EI.UserTE->UserTreeIndex.UserTE->Scalars)) {
22597 Value *Op = EI.UserTE->UserTreeIndex.UserTE->getOperand(
22598 OpIdx: EI.UserTE->UserTreeIndex.EdgeIdx)[Idx];
22599 auto *I = dyn_cast<Instruction>(Val: Op);
22600 if (!I || !isCommutative(I))
22601 continue;
22602 if (!Values.insert(V: std::make_pair(x&: V, y&: Op)).second)
22603 return std::nullopt;
22604 }
22605 } else {
22606 // If any of the parent requires scheduling - exit, complex dep between
22607 // schedulable/non-schedulable parents.
22608 if (any_of(Range&: EI.UserTE->Scalars, P: [&](Value *V) {
22609 if (EI.UserTE->hasCopyableElements() &&
22610 EI.UserTE->isCopyableElement(V))
22611 return false;
22612 ArrayRef<TreeEntry *> Entries = SLP->getTreeEntries(V);
22613 return any_of(Range&: Entries, P: [](const TreeEntry *TE) {
22614 return TE->doesNotNeedToSchedule() && TE->UserTreeIndex &&
22615 TE->UserTreeIndex.UserTE->hasState() &&
22616 TE->UserTreeIndex.UserTE->State !=
22617 TreeEntry::SplitVectorize &&
22618 TE->UserTreeIndex.UserTE->getOpcode() == Instruction::PHI;
22619 });
22620 }))
22621 return std::nullopt;
22622 }
22623 }
22624 if (DoesNotRequireScheduling) {
22625 // If all operands were replaced by copyables, the operands of this node
22626 // might be not, so need to recalculate dependencies for schedule data,
22627 // replaced by copyable schedule data.
22628 for (Value *V : VL) {
22629 auto *I = dyn_cast<Instruction>(Val: V);
22630 if (!I || (HasCopyables && S.isCopyableElement(V)))
22631 continue;
22632 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22633 for (const Use &U : I->operands()) {
22634 unsigned &NumOps =
22635 UserOpToNumOps.try_emplace(Key: std::make_pair(x&: I, y: U.get()), Args: 0)
22636 .first->getSecond();
22637 ++NumOps;
22638 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
22639 Op && areAllOperandsReplacedByCopyableData(User: I, Op, SLP&: *SLP, NumOps)) {
22640 if (ScheduleData *OpSD = getScheduleData(I: Op);
22641 OpSD && OpSD->hasValidDependencies())
22642 // TODO: investigate how to improve it instead of early exiting.
22643 return std::nullopt;
22644 }
22645 }
22646 }
22647 return nullptr;
22648 }
22649
22650 // Initialize the instruction bundle.
22651 Instruction *OldScheduleEnd = ScheduleEnd;
22652 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
22653
22654 auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) {
22655 // Clear deps or recalculate the region, if the memory instruction is a
22656 // copyable. It may have memory deps, which must be recalculated.
22657 SmallVector<ScheduleData *> ControlDependentMembers;
22658 auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) {
22659 SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps;
22660 for (ScheduleEntity *SE : Bundle.getBundle()) {
22661 if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
22662 if (ScheduleData *BundleMember = getScheduleData(I: SD->getInst());
22663 BundleMember && BundleMember->hasValidDependencies()) {
22664 BundleMember->clearDirectDependencies();
22665 if (RegionHasStackSave ||
22666 !isGuaranteedToTransferExecutionToSuccessor(
22667 I: BundleMember->getInst()))
22668 ControlDependentMembers.push_back(Elt: BundleMember);
22669 }
22670 continue;
22671 }
22672 auto *SD = cast<ScheduleData>(Val: SE);
22673 if (SD->hasValidDependencies() &&
22674 (!S.areInstructionsWithCopyableElements() ||
22675 !S.isCopyableElement(V: SD->getInst())) &&
22676 !getScheduleCopyableData(I: SD->getInst()).empty() && EI.UserTE &&
22677 EI.UserTE->hasState() &&
22678 (!EI.UserTE->hasCopyableElements() ||
22679 !EI.UserTE->isCopyableElement(V: SD->getInst())))
22680 SD->clearDirectDependencies();
22681 for (const Use &U : SD->getInst()->operands()) {
22682 unsigned &NumOps =
22683 UserOpToNumOps
22684 .try_emplace(Key: std::make_pair(x: SD->getInst(), y: U.get()), Args: 0)
22685 .first->getSecond();
22686 ++NumOps;
22687 if (auto *Op = dyn_cast<Instruction>(Val: U.get());
22688 Op && areAllOperandsReplacedByCopyableData(User: SD->getInst(), Op,
22689 SLP&: *SLP, NumOps)) {
22690 if (ScheduleData *OpSD = getScheduleData(I: Op);
22691 OpSD && OpSD->hasValidDependencies()) {
22692 OpSD->clearDirectDependencies();
22693 if (RegionHasStackSave ||
22694 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
22695 ControlDependentMembers.push_back(Elt: OpSD);
22696 }
22697 }
22698 }
22699 }
22700 };
22701 // The scheduling region got new instructions at the lower end (or it is a
22702 // new region for the first bundle). This makes it necessary to
22703 // recalculate all dependencies.
22704 // It is seldom that this needs to be done a second time after adding the
22705 // initial bundle to the region.
22706 if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) {
22707 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
22708 if (BB != P.first->getParent())
22709 return;
22710 ScheduleData *SD = P.second;
22711 if (isInSchedulingRegion(SD: *SD))
22712 SD->clearDependencies();
22713 });
22714 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
22715 for_each(P.second, [&](ScheduleCopyableData *SD) {
22716 if (isInSchedulingRegion(SD: *SD))
22717 SD->clearDependencies();
22718 });
22719 });
22720 ReSchedule = true;
22721 }
22722 // Check if the bundle data has deps for copyable elements already. In
22723 // this case need to reset deps and recalculate it.
22724 if (Bundle && !Bundle.getBundle().empty()) {
22725 if (S.areInstructionsWithCopyableElements() ||
22726 !ScheduleCopyableDataMap.empty())
22727 CheckIfNeedToClearDeps(Bundle);
22728 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block "
22729 << BB->getName() << "\n");
22730 calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP,
22731 ControlDeps: ControlDependentMembers);
22732 } else if (!ControlDependentMembers.empty()) {
22733 ScheduleBundle Invalid = ScheduleBundle::invalid();
22734 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/!ReSchedule, SLP,
22735 ControlDeps: ControlDependentMembers);
22736 }
22737
22738 if (ReSchedule) {
22739 resetSchedule();
22740 initialFillReadyList(ReadyList&: ReadyInsts);
22741 }
22742
22743 // Now try to schedule the new bundle or (if no bundle) just calculate
22744 // dependencies. As soon as the bundle is "ready" it means that there are no
22745 // cyclic dependencies and we can schedule it. Note that's important that we
22746 // don't "schedule" the bundle yet.
22747 while (((!Bundle && ReSchedule) || (Bundle && !Bundle.isReady())) &&
22748 !ReadyInsts.empty()) {
22749 ScheduleEntity *Picked = ReadyInsts.pop_back_val();
22750 assert(Picked->isReady() && "must be ready to schedule");
22751 schedule(R: *SLP, S, EI, Data: Picked, ReadyList&: ReadyInsts);
22752 if (Picked == &Bundle)
22753 break;
22754 }
22755 };
22756
22757 // Make sure that the scheduling region contains all
22758 // instructions of the bundle.
22759 for (Value *V : VL) {
22760 if (S.isNonSchedulable(V))
22761 continue;
22762 if (!extendSchedulingRegion(V, S)) {
22763 // If the scheduling region got new instructions at the lower end (or it
22764 // is a new region for the first bundle). This makes it necessary to
22765 // recalculate all dependencies.
22766 // Otherwise the compiler may crash trying to incorrectly calculate
22767 // dependencies and emit instruction in the wrong order at the actual
22768 // scheduling.
22769 ScheduleBundle Invalid = ScheduleBundle::invalid();
22770 TryScheduleBundleImpl(/*ReSchedule=*/false, Invalid);
22771 return std::nullopt;
22772 }
22773 }
22774
22775 bool ReSchedule = false;
22776 for (Value *V : VL) {
22777 if (S.isNonSchedulable(V))
22778 continue;
22779 SmallVector<ScheduleCopyableData *> CopyableData =
22780 getScheduleCopyableData(I: cast<Instruction>(Val: V));
22781 if (!CopyableData.empty()) {
22782 for (ScheduleCopyableData *SD : CopyableData)
22783 ReadyInsts.remove(X: SD);
22784 }
22785 ScheduleData *BundleMember = getScheduleData(V);
22786 assert((BundleMember || S.isCopyableElement(V)) &&
22787 "no ScheduleData for bundle member (maybe not in same basic block)");
22788 if (!BundleMember)
22789 continue;
22790
22791 // Make sure we don't leave the pieces of the bundle in the ready list when
22792 // whole bundle might not be ready.
22793 ReadyInsts.remove(X: BundleMember);
22794 if (ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V);
22795 !Bundles.empty()) {
22796 for (ScheduleBundle *B : Bundles)
22797 ReadyInsts.remove(X: B);
22798 }
22799
22800 if (!S.isCopyableElement(V) && !BundleMember->isScheduled())
22801 continue;
22802 // A bundle member was scheduled as single instruction before and now
22803 // needs to be scheduled as part of the bundle. We just get rid of the
22804 // existing schedule.
22805 // A bundle member has deps calculated before it was copyable element - need
22806 // to reschedule.
22807 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
22808 << " was already scheduled\n");
22809 ReSchedule = true;
22810 }
22811
22812 ScheduleBundle &Bundle = buildBundle(VL, S, EI);
22813 TryScheduleBundleImpl(ReSchedule, Bundle);
22814 if (!Bundle.isReady()) {
22815 for (ScheduleEntity *BD : Bundle.getBundle()) {
22816 // Copyable data scheduling is just removed.
22817 if (isa<ScheduleCopyableData>(Val: BD))
22818 continue;
22819 if (BD->isReady()) {
22820 ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(V: BD->getInst());
22821 if (Bundles.empty()) {
22822 ReadyInsts.insert(X: BD);
22823 continue;
22824 }
22825 for (ScheduleBundle *B : Bundles)
22826 if (B->isReady())
22827 ReadyInsts.insert(X: B);
22828 }
22829 }
22830 ScheduledBundlesList.pop_back();
22831 SmallVector<ScheduleData *> ControlDependentMembers;
22832 for (Value *V : VL) {
22833 if (S.isNonSchedulable(V))
22834 continue;
22835 auto *I = cast<Instruction>(Val: V);
22836 if (S.isCopyableElement(V: I)) {
22837 // Remove the copyable data from the scheduling region and restore
22838 // previous mappings.
22839 auto KV = std::make_pair(x: EI, y&: I);
22840 assert(ScheduleCopyableDataMap.contains(KV) &&
22841 "no ScheduleCopyableData for copyable element");
22842 ScheduleCopyableData *SD =
22843 ScheduleCopyableDataMapByInst.find(Val: I)->getSecond().pop_back_val();
22844 ScheduleCopyableDataMapByUsers[I].remove(X: SD);
22845 if (EI.UserTE) {
22846 ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
22847 const auto *It = find(Range&: Op, Val: I);
22848 assert(It != Op.end() && "Lane not set");
22849 SmallPtrSet<Instruction *, 4> Visited;
22850 do {
22851 int Lane = std::distance(first: Op.begin(), last: It);
22852 assert(Lane >= 0 && "Lane not set");
22853 if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
22854 !EI.UserTE->ReorderIndices.empty())
22855 Lane = EI.UserTE->ReorderIndices[Lane];
22856 assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
22857 "Couldn't find extract lane");
22858 auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
22859 if (!Visited.insert(Ptr: In).second) {
22860 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
22861 break;
22862 }
22863 ScheduleCopyableDataMapByInstUser
22864 [std::make_pair(x: std::make_pair(x&: In, y: EI.EdgeIdx), y&: I)]
22865 .pop_back();
22866 It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: I);
22867 } while (It != Op.end());
22868 EdgeInfo UserEI = EI.UserTE->UserTreeIndex;
22869 if (ScheduleCopyableData *UserCD = getScheduleCopyableData(EI: UserEI, V: I))
22870 ScheduleCopyableDataMapByUsers[I].insert(X: UserCD);
22871 }
22872 if (ScheduleCopyableDataMapByUsers[I].empty())
22873 ScheduleCopyableDataMapByUsers.erase(Val: I);
22874 ScheduleCopyableDataMap.erase(Val: KV);
22875 // Need to recalculate dependencies for the actual schedule data.
22876 if (ScheduleData *OpSD = getScheduleData(I);
22877 OpSD && OpSD->hasValidDependencies()) {
22878 OpSD->clearDirectDependencies();
22879 if (RegionHasStackSave ||
22880 !isGuaranteedToTransferExecutionToSuccessor(I: OpSD->getInst()))
22881 ControlDependentMembers.push_back(Elt: OpSD);
22882 }
22883 continue;
22884 }
22885 ScheduledBundles.find(Val: I)->getSecond().pop_back();
22886 }
22887 if (!ControlDependentMembers.empty()) {
22888 ScheduleBundle Invalid = ScheduleBundle::invalid();
22889 calculateDependencies(Bundle&: Invalid, /*InsertInReadyList=*/false, SLP,
22890 ControlDeps: ControlDependentMembers);
22891 }
22892 return std::nullopt;
22893 }
22894 return &Bundle;
22895}
22896
22897BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
22898 // Allocate a new ScheduleData for the instruction.
22899 if (ChunkPos >= ChunkSize) {
22900 ScheduleDataChunks.push_back(Elt: std::make_unique<ScheduleData[]>(num: ChunkSize));
22901 ChunkPos = 0;
22902 }
22903 return &(ScheduleDataChunks.back()[ChunkPos++]);
22904}
22905
// Grows the current scheduling region [ScheduleStart, ScheduleEnd) so that it
// covers the instruction behind \p V, walking the basic block simultaneously
// upwards and downwards from the current region boundaries.
// Returns false when growing would exceed ScheduleRegionSizeLimit (the caller
// then abandons the bundle); returns true otherwise.
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(Val: V);
  assert(I && "bundle member must be an instruction");
  // Already inside the region - nothing to extend.
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(FromI: I, ToI: I->getNextNode(), PrevLoadStore: nullptr, NextLoadStore: nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not counted
  // against the budget. Otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(Val: &I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  // Skip assume-like intrinsics before counting the first step in each
  // direction.
  UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
  DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
  // Advance both cursors in lock-step until one of them reaches I or falls off
  // the block; each paired step costs one unit of the region-size budget.
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(first: UpIter, last: UpperEnd, pred: IsAssumeLikeIntr);
    DownIter = std::find_if_not(first: DownIter, last: LowerEnd, pred: IsAssumeLikeIntr);
  }
  // I was found above the region (or the downward walk hit the end of the
  // block, which also means I must be above): extend the region start upwards.
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(FromI: I, ToI: ScheduleStart, PrevLoadStore: nullptr, NextLoadStore: FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  // Otherwise I lies below the region: extend the region end downwards past I.
  initScheduleData(FromI: ScheduleEnd, ToI: I->getNextNode(), PrevLoadStore: LastLoadStoreInRegion,
                   NextLoadStore: nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  return true;
}
22971
// Allocates/initializes ScheduleData for every instruction in the half-open
// range [FromI, ToI) and splices the range's memory-accessing instructions
// into the region-wide singly linked load/store list between PrevLoadStore
// and NextLoadStore. Also records whether the range contains stacksave /
// stackrestore calls (RegionHasStackSave), which later forces extra control
// dependencies.
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (isa<PHINode>(Val: I))
      continue;
    // Reuse a previously allocated ScheduleData (from an earlier scheduling
    // region) if one exists for this instruction.
    ScheduleData *SD = ScheduleDataMap.lookup(Val: I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(*SD) &&
           "new ScheduleData already in scheduling region");
    // (Re-)initialize for the current region; SchedulingRegionID is what makes
    // isInSchedulingRegion() true for this SD.
    SD->init(BlockSchedulingRegionID: SchedulingRegionID, I);

    auto CanIgnoreLoad = [](const Instruction *I) {
      const auto *LI = dyn_cast<LoadInst>(Val: I);
      // If there is a simple load marked as invariant, we can ignore it.
      // But, in the (unlikely) case of non-simple invariant load,
      // we should not ignore it.
      return LI && LI->isSimple() &&
             LI->getMetadata(KindID: LLVMContext::MD_invariant_load);
    };

    // Thread memory-accessing instructions onto the load/store chain, except
    // invariant simple loads and the sideeffect/pseudoprobe intrinsics, which
    // need no memory-dependency tracking.
    if (I->mayReadOrWriteMemory() &&
        // Simple InvariantLoad does not depend on other memory accesses.
        !CanIgnoreLoad(I) &&
        (!isa<IntrinsicInst>(Val: I) ||
         (cast<IntrinsicInst>(Val: I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(Val: I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->setNextLoadStore(SD);
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
        match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  // Reconnect the tail of the chain: either link into the existing suffix
  // (region was extended upwards) or record the new last load/store.
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->setNextLoadStore(NextLoadStore);
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
23026
// Computes (or refreshes) the dependency counts for \p Bundle and everything
// transitively reachable from it via a worklist: def-use dependencies,
// copyable-element pseudo-dependencies, control dependencies (non-speculatable
// instructions, stacksave/stackrestore interactions) and memory dependencies
// along the region's load/store chain. If \p InsertInReadyList is set, any
// entity/bundle that becomes ready during the walk is pushed into ReadyInsts.
// \p ControlDeps seeds the worklist with extra ScheduleData whose direct
// dependencies were invalidated by the caller.
void BoUpSLP::BlockScheduling::calculateDependencies(
    ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP,
    ArrayRef<ScheduleData *> ControlDeps) {
  SmallVector<ScheduleEntity *> WorkList;
  // Computes the dependencies of a single schedule entity and pushes any
  // user whose own deps are stale (or newly ready) onto the worklist.
  auto ProcessNode = [&](ScheduleEntity *SE) {
    if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SE)) {
      if (CD->hasValidDependencies())
        return;
      LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n");
      CD->initDependencies();
      CD->resetUnscheduledDeps();
      const EdgeInfo &EI = CD->getEdgeInfo();
      if (EI.UserTE) {
        // A copyable element's only "users" are the lanes of the user tree
        // entry that consume it; walk every lane that holds this instruction.
        ArrayRef<Value *> Op = EI.UserTE->getOperand(OpIdx: EI.EdgeIdx);
        const auto *It = find(Range&: Op, Val: CD->getInst());
        assert(It != Op.end() && "Lane not set");
        SmallPtrSet<Instruction *, 4> Visited;
        do {
          int Lane = std::distance(first: Op.begin(), last: It);
          assert(Lane >= 0 && "Lane not set");
          // Stores may have been reordered; map the operand lane back to the
          // scalar lane.
          if (isa<StoreInst>(Val: EI.UserTE->Scalars[Lane]) &&
              !EI.UserTE->ReorderIndices.empty())
            Lane = EI.UserTE->ReorderIndices[Lane];
          assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) &&
                 "Couldn't find extract lane");
          auto *In = cast<Instruction>(Val: EI.UserTE->Scalars[Lane]);
          if (EI.UserTE->isCopyableElement(V: In)) {
            // We may not have related copyable scheduling data, if the
            // instruction is non-schedulable.
            if (ScheduleCopyableData *UseSD =
                    getScheduleCopyableData(EI: EI.UserTE->UserTreeIndex, V: In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(Incr: 1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(Elt: UseSD);
            }
          } else if (Visited.insert(Ptr: In).second) {
            if (ScheduleData *UseSD = getScheduleData(I: In)) {
              CD->incDependencies();
              if (!UseSD->isScheduled())
                CD->incrementUnscheduledDeps(Incr: 1);
              if (!UseSD->hasValidDependencies() ||
                  (InsertInReadyList && UseSD->isReady()))
                WorkList.push_back(Elt: UseSD);
            }
          }
          It = find(Range: make_range(x: std::next(x: It), y: Op.end()), Val: CD->getInst());
        } while (It != Op.end());
        if (CD->isReady() && CD->getDependencies() == 0 &&
            (EI.UserTE->hasState() &&
             (EI.UserTE->getMainOp()->getParent() !=
                  CD->getInst()->getParent() ||
              (isa<PHINode>(Val: EI.UserTE->getMainOp()) &&
               (EI.UserTE->getMainOp()->hasNUsesOrMore(N: UsesLimit) ||
                any_of(Range: EI.UserTE->getMainOp()->users(), P: [&](User *U) {
                  auto *IU = dyn_cast<Instruction>(Val: U);
                  if (!IU)
                    return true;
                  return IU->getParent() == EI.UserTE->getMainOp()->getParent();
                })))))) {
          // If no uses in the block - mark as having pseudo-use, which cannot
          // be scheduled.
          // Prevents incorrect def-use tracking between external user and
          // actual instruction.
          CD->incDependencies();
          CD->incrementUnscheduledDeps(Incr: 1);
        }
      }
      return;
    }
    auto *BundleMember = cast<ScheduleData>(Val: SE);
    if (BundleMember->hasValidDependencies())
      return;
    LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
    BundleMember->initDependencies();
    BundleMember->resetUnscheduledDeps();
    // Handle def-use chain dependencies.
    SmallDenseMap<Value *, unsigned> UserToNumOps;
    for (User *U : BundleMember->getInst()->users()) {
      if (isa<PHINode>(Val: U))
        continue;
      if (ScheduleData *UseSD = getScheduleData(V: U)) {
        // The operand is a copyable element - skip.
        unsigned &NumOps = UserToNumOps.try_emplace(Key: U, Args: 0).first->getSecond();
        ++NumOps;
        if (areAllOperandsReplacedByCopyableData(
                User: cast<Instruction>(Val: U), Op: BundleMember->getInst(), SLP&: *SLP, NumOps))
          continue;
        BundleMember->incDependencies();
        if (!UseSD->isScheduled())
          BundleMember->incrementUnscheduledDeps(Incr: 1);
        if (!UseSD->hasValidDependencies() ||
            (InsertInReadyList && UseSD->isReady()))
          WorkList.push_back(Elt: UseSD);
      }
    }
    // Copyable pseudo-instructions built on top of this instruction also
    // count as users.
    for (ScheduleCopyableData *UseSD :
         getScheduleCopyableDataUsers(User: BundleMember->getInst())) {
      BundleMember->incDependencies();
      if (!UseSD->isScheduled())
        BundleMember->incrementUnscheduledDeps(Incr: 1);
      if (!UseSD->hasValidDependencies() ||
          (InsertInReadyList && UseSD->isReady()))
        WorkList.push_back(Elt: UseSD);
    }

    SmallPtrSet<const Instruction *, 4> Visited;
    // Records "I must not be reordered above BundleMember" as a control
    // dependency edge from I to BundleMember.
    auto MakeControlDependent = [&](Instruction *I) {
      // Do not mark control dependent twice.
      if (!Visited.insert(Ptr: I).second)
        return;
      auto *DepDest = getScheduleData(I);
      assert(DepDest && "must be in schedule window");
      DepDest->addControlDependency(Dep: BundleMember);
      BundleMember->incDependencies();
      if (!DepDest->isScheduled())
        BundleMember->incrementUnscheduledDeps(Incr: 1);
      if (!DepDest->hasValidDependencies() ||
          (InsertInReadyList && DepDest->isReady()))
        WorkList.push_back(Elt: DepDest);
    };

    // Any instruction which isn't safe to speculate at the beginning of the
    // block is control depend on any early exit or non-willreturn call
    // which proceeds it.
    if (!isGuaranteedToTransferExecutionToSuccessor(I: BundleMember->getInst())) {
      for (Instruction *I = BundleMember->getInst()->getNextNode();
           I != ScheduleEnd; I = I->getNextNode()) {
        if (isSafeToSpeculativelyExecute(I, CtxI: &*BB->begin(), AC: SLP->AC))
          continue;

        // Add the dependency
        MakeControlDependent(I);

        if (!isGuaranteedToTransferExecutionToSuccessor(I))
          // Everything past here must be control dependent on I.
          break;
      }
    }

    if (RegionHasStackSave) {
      // If we have an inalloc alloca instruction, it needs to be scheduled
      // after any preceeding stacksave. We also need to prevent any alloca
      // from reordering above a preceeding stackrestore.
      if (match(V: BundleMember->getInst(), P: m_Intrinsic<Intrinsic::stacksave>()) ||
          match(V: BundleMember->getInst(),
                P: m_Intrinsic<Intrinsic::stackrestore>())) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) ||
              match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            // Any allocas past here must be control dependent on I, and I
            // must be memory dependend on BundleMember->Inst.
            break;

          if (!isa<AllocaInst>(Val: I))
            continue;

          // Add the dependency
          MakeControlDependent(I);
        }
      }

      // In addition to the cases handle just above, we need to prevent
      // allocas and loads/stores from moving below a stacksave or a
      // stackrestore. Avoiding moving allocas below stackrestore is currently
      // thought to be conservatism. Moving loads/stores below a stackrestore
      // can lead to incorrect code.
      if (isa<AllocaInst>(Val: BundleMember->getInst()) ||
          BundleMember->getInst()->mayReadOrWriteMemory()) {
        for (Instruction *I = BundleMember->getInst()->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (!match(V: I, P: m_Intrinsic<Intrinsic::stacksave>()) &&
              !match(V: I, P: m_Intrinsic<Intrinsic::stackrestore>()))
            continue;

          // Add the dependency
          MakeControlDependent(I);
          break;
        }
      }
    }

    // Handle the memory dependencies (if any).
    ScheduleData *NextLoadStore = BundleMember->getNextLoadStore();
    if (!NextLoadStore)
      return;
    Instruction *SrcInst = BundleMember->getInst();
    assert(SrcInst->mayReadOrWriteMemory() &&
           "NextLoadStore list for non memory effecting bundle?");
    MemoryLocation SrcLoc = getLocation(I: SrcInst);
    bool SrcMayWrite = SrcInst->mayWriteToMemory();
    unsigned NumAliased = 0;
    unsigned DistToSrc = 1;
    bool IsNonSimpleSrc = !SrcLoc.Ptr || !isSimple(I: SrcInst);

    // Walk forward along the region's load/store chain and add a memory
    // dependency for every (possibly) aliasing write-involving pair.
    for (ScheduleData *DepDest = NextLoadStore; DepDest;
         DepDest = DepDest->getNextLoadStore()) {
      assert(isInSchedulingRegion(*DepDest) && "Expected to be in region");

      // We have two limits to reduce the complexity:
      // 1) AliasedCheckLimit: It's a small limit to reduce calls to
      //    SLP->isAliased (which is the expensive part in this loop).
      // 2) MaxMemDepDistance: It's for very large blocks and it aborts
      //    the whole loop (even if the loop is fast, it's quadratic).
      //    It's important for the loop break condition (see below) to
      //    check this limit even between two read-only instructions.
      if (DistToSrc >= MaxMemDepDistance ||
          ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
           (IsNonSimpleSrc || NumAliased >= AliasedCheckLimit ||
            SLP->isAliased(Loc1: SrcLoc, Inst1: SrcInst, Inst2: DepDest->getInst())))) {

        // We increment the counter only if the locations are aliased
        // (instead of counting all alias checks). This gives a better
        // balance between reduced runtime and accurate dependencies.
        NumAliased++;

        DepDest->addMemoryDependency(Dep: BundleMember);
        BundleMember->incDependencies();
        if (!DepDest->isScheduled())
          BundleMember->incrementUnscheduledDeps(Incr: 1);
        if (!DepDest->hasValidDependencies() ||
            (InsertInReadyList && DepDest->isReady()))
          WorkList.push_back(Elt: DepDest);
      }

      // Example, explaining the loop break condition: Let's assume our
      // starting instruction is i0 and MaxMemDepDistance = 3.
      //
      //                      +--------v--v--v
      //             i0,i1,i2,i3,i4,i5,i6,i7,i8
      //             +--------^--^--^
      //
      // MaxMemDepDistance let us stop alias-checking at i3 and we add
      // dependencies from i0 to i3,i4,.. (even if they are not aliased).
      // Previously we already added dependencies from i3 to i6,i7,i8
      // (because of MaxMemDepDistance). As we added a dependency from
      // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
      // and we can abort this loop at i6.
      if (DistToSrc >= 2 * MaxMemDepDistance)
        break;
      DistToSrc++;
    }
  };

  assert((Bundle || !ControlDeps.empty()) &&
         "expected at least one instruction to schedule");
  // Seed the worklist with the bundle's first member plus any caller-supplied
  // control-dependent entries, then drain it.
  if (Bundle)
    WorkList.push_back(Elt: Bundle.getBundle().front());
  WorkList.append(in_start: ControlDeps.begin(), in_end: ControlDeps.end());
  SmallPtrSet<ScheduleBundle *, 16> Visited;
  while (!WorkList.empty()) {
    ScheduleEntity *SD = WorkList.pop_back_val();
    SmallVector<ScheduleBundle *, 1> CopyableBundle;
    ArrayRef<ScheduleBundle *> Bundles;
    if (auto *CD = dyn_cast<ScheduleCopyableData>(Val: SD)) {
      CopyableBundle.push_back(Elt: &CD->getBundle());
      Bundles = CopyableBundle;
    } else {
      Bundles = getScheduleBundles(V: SD->getInst());
    }
    if (Bundles.empty()) {
      // A standalone (non-bundled) entity: process it directly.
      if (!SD->hasValidDependencies())
        ProcessNode(SD);
      if (InsertInReadyList && SD->isReady()) {
        ReadyInsts.insert(X: SD);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n");
      }
      continue;
    }
    // Otherwise process every member of every bundle the entity belongs to;
    // Visited ensures each bundle is expanded at most once.
    for (ScheduleBundle *Bundle : Bundles) {
      if (Bundle->hasValidDependencies() || !Visited.insert(Ptr: Bundle).second)
        continue;
      assert(isInSchedulingRegion(*Bundle) &&
             "ScheduleData not in scheduling region");
      for_each(Range: Bundle->getBundle(), F: ProcessNode);
    }
    if (InsertInReadyList && SD->isReady()) {
      for (ScheduleBundle *Bundle : Bundles) {
        assert(isInSchedulingRegion(*Bundle) &&
               "ScheduleData not in scheduling region");
        if (!Bundle->isReady())
          continue;
        ReadyInsts.insert(X: Bundle);
        LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *Bundle
                          << "\n");
      }
    }
  }
}
23319
23320void BoUpSLP::BlockScheduling::resetSchedule() {
23321 assert(ScheduleStart &&
23322 "tried to reset schedule on block which has not been scheduled");
23323 for_each(Range&: ScheduleDataMap, F: [&](auto &P) {
23324 if (BB != P.first->getParent())
23325 return;
23326 ScheduleData *SD = P.second;
23327 if (isInSchedulingRegion(SD: *SD)) {
23328 SD->setScheduled(/*Scheduled=*/false);
23329 SD->resetUnscheduledDeps();
23330 }
23331 });
23332 for_each(Range&: ScheduleCopyableDataMapByInst, F: [&](auto &P) {
23333 for_each(P.second, [&](ScheduleCopyableData *SD) {
23334 if (isInSchedulingRegion(SD: *SD)) {
23335 SD->setScheduled(/*Scheduled=*/false);
23336 SD->resetUnscheduledDeps();
23337 }
23338 });
23339 });
23340 for_each(Range&: ScheduledBundles, F: [&](auto &P) {
23341 for_each(P.second, [&](ScheduleBundle *Bundle) {
23342 if (isInSchedulingRegion(SD: *Bundle))
23343 Bundle->setScheduled(/*Scheduled=*/false);
23344 });
23345 });
23346 // Reset schedule data for copyable elements.
23347 for (auto &P : ScheduleCopyableDataMap) {
23348 if (isInSchedulingRegion(SD: *P.second)) {
23349 P.second->setScheduled(/*Scheduled=*/false);
23350 P.second->resetUnscheduledDeps();
23351 }
23352 }
23353 ReadyInsts.clear();
23354}
23355
// Performs the "real" scheduling for one basic block: recomputes dependencies
// for all entities in the scheduling region, then repeatedly picks the ready
// entity with the lowest original-order priority and physically moves its
// instruction(s) to the current insertion point. Finally clears ScheduleStart
// so the block is never scheduled twice.
void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(const ScheduleEntity *SD1,
                    const ScheduleEntity *SD2) const {
      return SD2->getSchedulingPriority() < SD1->getSchedulingPriority();
    }
  };
  std::set<ScheduleEntity *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  // Idx increases in program order, so priorities encode original position.
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(V: I);
    if (!Bundles.empty()) {
      for (ScheduleBundle *Bundle : Bundles) {
        Bundle->setSchedulingPriority(Idx++);
        if (!Bundle->hasValidDependencies())
          BS->calculateDependencies(Bundle&: *Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
      SmallVector<ScheduleCopyableData *> SDs = BS->getScheduleCopyableData(I);
      for (ScheduleCopyableData *SD : reverse(C&: SDs)) {
        ScheduleBundle &Bundle = SD->getBundle();
        Bundle.setSchedulingPriority(Idx++);
        if (!Bundle.hasValidDependencies())
          BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
      continue;
    }
    SmallVector<ScheduleCopyableData *> CopyableData =
        BS->getScheduleCopyableDataUsers(User: I);
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] ArrayRef<TreeEntry *> SDTEs = getTreeEntries(V: I);
      assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() ||
              SDTEs.front()->doesNotNeedToSchedule() ||
              doesNotNeedToBeScheduled(I)) &&
             "scheduler and vectorizer bundle mismatch");
      SD->setSchedulingPriority(Idx++);
      if (!CopyableData.empty() ||
          any_of(Range: R.ValueToGatherNodes.lookup(Val: I), P: [&](const TreeEntry *TE) {
            assert(TE->isGather() && "expected gather node");
            return TE->hasState() && TE->hasCopyableElements() &&
                   TE->isCopyableElement(V: I);
          })) {
        SD->clearDirectDependencies();
        // Need to calculate deps for these nodes to correctly handle copyable
        // dependencies, even if they were cancelled.
        // If copyables bundle was cancelled, the deps are cleared and need to
        // recalculate them.
        ScheduleBundle Bundle;
        Bundle.add(SD);
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
      }
    }
    for (ScheduleCopyableData *SD : reverse(C&: CopyableData)) {
      ScheduleBundle &Bundle = SD->getBundle();
      Bundle.setSchedulingPriority(Idx++);
      if (!Bundle.hasValidDependencies())
        BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, SLP: this);
    }
  }
  BS->initialFillReadyList(ReadyList&: ReadyInsts);

  // Insertion point: instructions are moved to just before the last
  // scheduled one, so the schedule grows upwards from ScheduleEnd.
  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  SmallPtrSet<Instruction *, 16> Scheduled;
  while (!ReadyInsts.empty()) {
    auto *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(position: ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    if (auto *Bundle = dyn_cast<ScheduleBundle>(Val: Picked)) {
      for (const ScheduleEntity *BundleMember : Bundle->getBundle()) {
        Instruction *PickedInst = BundleMember->getInst();
        // If copyable must be schedule as part of something else, skip it.
        bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(V: PickedInst);
        if ((IsCopyable && BS->getScheduleData(I: PickedInst)) ||
            (!IsCopyable && !Scheduled.insert(Ptr: PickedInst).second))
          continue;
        if (PickedInst->getNextNode() != LastScheduledInst)
          PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
        LastScheduledInst = PickedInst;
      }
      // Remember where this tree entry's code ends up, for vector emission.
      EntryToLastInstruction.try_emplace(Key: Bundle->getTreeEntry(),
                                         Args&: LastScheduledInst);
    } else {
      auto *SD = cast<ScheduleData>(Val: Picked);
      Instruction *PickedInst = SD->getInst();
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveAfter(MovePos: LastScheduledInst->getPrevNode());
      LastScheduledInst = PickedInst;
    }
    auto Invalid = InstructionsState::invalid();
    // Decrement dependency counters of users; may make new entities ready.
    BS->schedule(R, S: Invalid, EI: EdgeInfo(), Data: Picked, ReadyList&: ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ArrayRef<ScheduleBundle *> Bundles = BS->getScheduleBundles(I);
    assert(all_of(Bundles,
                  [](const ScheduleBundle *Bundle) {
                    return Bundle->isScheduled();
                  }) &&
           "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
23493
// Compute the preferred scalar element size (in bits) for the expression
// rooted at V. Where possible, the width is derived from the memory
// operations (loads/extracts) feeding the expression rather than from V's
// own type. Results are memoized in InstrElementSize for every instruction
// visited during the traversal.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(Val: V))
    return DL->getTypeSizeInBits(Ty: Store->getValueOperand()->getType());

  // For an insertelement, the interesting width is that of the inserted
  // scalar (operand 1), not the vector operand.
  if (auto *IEI = dyn_cast<InsertElementInst>(Val: V))
    return getVectorElementSize(V: IEI->getOperand(i_nocapture: 1));

  // Return the cached result if V was analyzed before.
  auto E = InstrElementSize.find(Val: V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  // Worklist items: (instruction, parent block of the chain, recursion depth).
  SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(Val: V)) {
    Worklist.emplace_back(Args&: I, Args: I->getParent(), Args: 0);
    Visited.insert(Ptr: I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  // First non-i1 value seen during the walk; used as a fallback when V itself
  // is i1 and no memory operation was found.
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent, Level] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Val: Ty))
      continue;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;
    // Bound the depth of the traversal.
    if (Level > RecursionMaxDepth)
      continue;

    // If the current instruction is a load (or an extract from a vector or
    // aggregate), update Width to reflect the width of the accessed value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(Val: I))
      Width = std::max<unsigned>(a: Width, b: DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(Val: I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(Val: U.get()))
          if (Visited.insert(Ptr: J).second &&
              (isa<PHINode>(Val: I) || J->getParent() == Parent)) {
            Worklist.emplace_back(Args&: J, Args: J->getParent(), Args: Level + 1);
            continue;
          }
        // Operand not traversed; still remember it as a non-i1 fallback.
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      // Unhandled instruction kind: give up on the traversal.
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return the
  // maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(Ty: V->getType());
  }

  // Cache the computed width for every instruction visited.
  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
23576
/// Checks whether the scalars of the tree entry \p E can be demoted to a
/// narrower integer type of \p BitWidth bits. On success the indices of
/// demotable tree entries are appended to \p ToDemote, and \p BitWidth may be
/// widened to the minimal width proven sufficient for correctness.
/// \p MaxDepthLevel tracks how deep the demotable (sub)tree is;
/// \p Visited / \p NodesToKeepBWs prevent reanalysis of entries; \p
/// IsTruncRoot indicates the analysis was seeded from a trunc root. Returns
/// true if the (sub)tree rooted at \p E can be demoted.
bool BoUpSLP::collectValuesToDemote(
    const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
    SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
    const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
    bool &IsProfitableToDemote, bool IsTruncRoot) const {
  // We can always demote constants.
  if (all_of(Range: E.Scalars, P: IsaPred<Constant>))
    return true;

  unsigned OrigBitWidth =
      DL->getTypeSizeInBits(Ty: E.Scalars.front()->getType()->getScalarType());
  // Already at the target width - trivially demotable.
  if (OrigBitWidth == BitWidth) {
    MaxDepthLevel = 1;
    return true;
  }

  // Check if the node was analyzed already and must keep its original bitwidth.
  if (NodesToKeepBWs.contains(V: E.Idx))
    return false;

  // If the value is not a vectorized instruction in the expression and not used
  // by the insertelement instruction and not used in multiple vector nodes, it
  // cannot be demoted.
  // IsSignedNode: true if any scalar may be negative (sign bit matters).
  bool IsSignedNode = any_of(Range: E.Scalars, P: [&](Value *R) {
    if (isa<PoisonValue>(Val: R))
      return false;
    return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
  });
  // Checks whether a single scalar V can be computed in BitWidth bits;
  // may grow BitWidth (by reference) to the minimal width it can prove.
  auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
    if (isa<PoisonValue>(Val: V))
      return true;
    // A scalar shared between several vector nodes cannot be demoted safely.
    if (getTreeEntries(V).size() > 1)
      return false;
    // For last shuffle of sext/zext with many uses need to check the extra bit
    // for unsigned values, otherwise may have incorrect casting for reused
    // scalars.
    bool IsSignedVal = !isKnownNonNegative(V, SQ: SimplifyQuery(*DL));
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
      APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
      if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
        return true;
    }
    // BitWidth1: minimal width implied by the number of known sign bits
    // (plus one spare bit for potentially-negative nodes).
    unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
    unsigned BitWidth1 = OrigBitWidth - NumSignBits;
    if (IsSignedNode)
      ++BitWidth1;
    if (auto *I = dyn_cast<Instruction>(Val: V)) {
      // BitWidth2: minimal width implied by the demanded bits of I, widened
      // until the dropped high bits are provably zero (unsigned case only).
      APInt Mask = DB->getDemandedBits(I);
      unsigned BitWidth2 =
          std::max<unsigned>(a: 1, b: Mask.getBitWidth() - Mask.countl_zero());
      while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth2 - 1);
        if (MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL)))
          break;
        BitWidth2 *= 2;
      }
      BitWidth1 = std::min(a: BitWidth1, b: BitWidth2);
    }
    BitWidth = std::max(a: BitWidth, b: BitWidth1);
    // Only worthwhile if the width at least halves.
    return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
  };
  // Last-chance analysis: re-check all scalars for truncatability and handle
  // gather nodes (matching vectorized entries / extractelement bases)
  // specially.
  auto FinalAnalysis = [&, TTI = TTI]() {
    if (!IsProfitableToDemote)
      return false;
    bool Res = all_of(
        Range: E.Scalars, P: std::bind(f&: IsPotentiallyTruncated, args: _1, args: std::ref(t&: BitWidth)));
    // Demote gathers.
    if (Res && E.isGather()) {
      if (E.hasState()) {
        // If the gather duplicates a vectorized entry, demote it iff that
        // entry is demotable.
        if (const TreeEntry *SameTE =
                getSameValuesTreeEntry(V: E.getMainOp(), VL: E.Scalars);
            SameTE)
          if (collectValuesToDemote(E: *SameTE, IsProfitableToDemoteRoot, BitWidth,
                                    ToDemote, Visited, NodesToKeepBWs,
                                    MaxDepthLevel, IsProfitableToDemote,
                                    IsTruncRoot)) {
            ToDemote.push_back(Elt: E.Idx);
            return true;
          }
      }
      // Check possible extractelement instructions bases and final vector
      // length.
      SmallPtrSet<Value *, 4> UniqueBases;
      for (Value *V : E.Scalars) {
        auto *EE = dyn_cast<ExtractElementInst>(Val: V);
        if (!EE)
          continue;
        UniqueBases.insert(Ptr: EE->getVectorOperand());
      }
      const unsigned VF = E.Scalars.size();
      Type *OrigScalarTy = E.Scalars.front()->getType();
      // Demote only if few bases are involved or the narrower type does not
      // increase the number of vector registers required.
      if (UniqueBases.size() <= 2 ||
          ::getNumberOfParts(TTI: *TTI, VecTy: getWidenedType(ScalarTy: OrigScalarTy, VF)) >=
              ::getNumberOfParts(
                  TTI: *TTI,
                  VecTy: getWidenedType(
                      ScalarTy: IntegerType::get(C&: OrigScalarTy->getContext(), NumBits: BitWidth),
                      VF))) {
        ToDemote.push_back(Elt: E.Idx);
        return true;
      }
    }
    return Res;
  };
  // Gathers, already-visited entries, and scalars only feeding non-vectorized
  // insertelements get the last-chance analysis only.
  if (E.isGather() || !Visited.insert(V: &E).second ||
      any_of(Range: E.Scalars, P: [&](Value *V) {
        return !isa<Constant>(Val: V) && all_of(Range: V->users(), P: [&](User *U) {
          return isa<InsertElementInst>(Val: U) && !isVectorized(V: U);
        });
      }))
    return FinalAnalysis();

  // Reject entries with scalars that have external (non-vectorized,
  // non-narrow) users, unless the scalar can be truncated anyway.
  if (any_of(Range: E.Scalars, P: [&](Value *V) {
        return !isa<Constant>(Val: V) && !all_of(Range: V->users(), P: [=](User *U) {
          return isVectorized(V: U) ||
                 (E.Idx == 0 && UserIgnoreList &&
                  UserIgnoreList->contains(V: U)) ||
                 (!isa<CmpInst>(Val: U) && U->getType()->isSized() &&
                  !U->getType()->isScalableTy() &&
                  DL->getTypeSizeInBits(Ty: U->getType()) <= BitWidth);
        }) && !IsPotentiallyTruncated(V, BitWidth);
      }))
    return false;

  // Recursively analyzes the given operand entries. Sets NeedToExit when the
  // caller should stop (after running FinalAnalysis); returns false on
  // definite failure.
  auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
                             bool &NeedToExit) {
    NeedToExit = false;
    unsigned InitLevel = MaxDepthLevel;
    for (const TreeEntry *Op : Operands) {
      unsigned Level = InitLevel;
      if (!collectValuesToDemote(E: *Op, IsProfitableToDemoteRoot, BitWidth,
                                 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel&: Level,
                                 IsProfitableToDemote, IsTruncRoot)) {
        if (!IsProfitableToDemote)
          return false;
        NeedToExit = true;
        if (!FinalAnalysis())
          return false;
        continue;
      }
      MaxDepthLevel = std::max(a: MaxDepthLevel, b: Level);
    }
    return true;
  };
  // Searches for the smallest BitWidth (doubling each step) that satisfies
  // Checker; falls back to the best width for which FinalAnalysis succeeded.
  auto AttemptCheckBitwidth =
      [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
        // Try all bitwidth < OrigBitWidth.
        NeedToExit = false;
        unsigned BestFailBitwidth = 0;
        for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
          if (Checker(BitWidth, OrigBitWidth))
            return true;
          if (BestFailBitwidth == 0 && FinalAnalysis())
            BestFailBitwidth = BitWidth;
        }
        if (BitWidth >= OrigBitWidth) {
          if (BestFailBitwidth == 0) {
            BitWidth = OrigBitWidth;
            return false;
          }
          MaxDepthLevel = 1;
          BitWidth = BestFailBitwidth;
          NeedToExit = true;
          return true;
        }
        return false;
      };
  // Common driver for the per-opcode cases below: validates the entry's
  // scalars (and operands, if given) and records E.Idx in ToDemote.
  auto TryProcessInstruction =
      [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
          function_ref<bool(unsigned, unsigned)> Checker = {}) {
        if (Operands.empty()) {
          if (!IsTruncRoot)
            MaxDepthLevel = 1;
          for (Value *V : E.Scalars)
            (void)IsPotentiallyTruncated(V, BitWidth);
        } else {
          // Several vectorized uses? Check if we can truncate it, otherwise -
          // exit.
          if (any_of(Range: E.Scalars, P: [&](Value *V) {
                return !V->hasOneUse() && !IsPotentiallyTruncated(V, BitWidth);
              }))
            return false;
          bool NeedToExit = false;
          if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
          if (!ProcessOperands(Operands, NeedToExit))
            return false;
          if (NeedToExit)
            return true;
        }

        ++MaxDepthLevel;
        // Record the entry that we can demote.
        ToDemote.push_back(Elt: E.Idx);
        return IsProfitableToDemote;
      };

  // Split nodes: process both combined entries as operands.
  if (E.State == TreeEntry::SplitVectorize)
    return TryProcessInstruction(
        BitWidth,
        {VectorizableTree[E.CombinedEntriesWithIndices.front().first].get(),
         VectorizableTree[E.CombinedEntriesWithIndices.back().first].get()});

  if (E.isAltShuffle()) {
    // Combining these opcodes may lead to incorrect analysis, skip for now.
    auto IsDangerousOpcode = [](unsigned Opcode) {
      switch (Opcode) {
      case Instruction::Shl:
      case Instruction::AShr:
      case Instruction::LShr:
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
      return false;
    };
    if (IsDangerousOpcode(E.getAltOpcode()))
      return FinalAnalysis();
  }

  switch (E.getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    if (IsProfitableToDemoteRoot)
      IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);
  case Instruction::ZExt:
  case Instruction::SExt:
    // Do not demote an extension that feeds a bitcast to an FP type; the bit
    // pattern must be preserved exactly.
    if (E.UserTreeIndex.UserTE && E.UserTreeIndex.UserTE->hasState() &&
        E.UserTreeIndex.UserTE->getOpcode() == Instruction::BitCast &&
        E.UserTreeIndex.UserTE->getMainOp()->getType()->isFPOrFPVectorTy())
      return false;
    IsProfitableToDemote = true;
    return TryProcessInstruction(BitWidth);

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)});
  }
  case Instruction::Freeze:
    return TryProcessInstruction(BitWidth, getOperandEntry(E: &E, Idx: 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        if (E.isCopyableElement(V))
          return true;
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, ShlChecker);
  }
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        APInt ShiftedBits = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
        if (E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask: ShiftedBits, SQ: SimplifyQuery(*DL));
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
               MaskedValueIsZero(V: I->getOperand(i: 0), Mask: ShiftedBits,
                                 SQ: SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
        LShrChecker);
  }
  case Instruction::AShr: {
    // If this is a truncate of an arithmetic shr, we can truncate it to a
    // smaller ashr iff we know that all the bits from the sign bit of the
    // original type and the sign bit of the truncate type are similar.
    auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        if (isa<PoisonValue>(Val: V))
          return true;
        auto *I = cast<Instruction>(Val: V);
        KnownBits AmtKnownBits = computeKnownBits(V: I->getOperand(i: 1), DL: *DL);
        unsigned ShiftedBits = OrigBitWidth - BitWidth;
        return AmtKnownBits.getMaxValue().ult(RHS: BitWidth) &&
               ShiftedBits <
                   ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)},
        AShrChecker);
  }
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
        if (E.hasCopyableElements() && E.isCopyableElement(V))
          return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
        auto *I = cast<Instruction>(Val: V);
        return MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)) &&
               MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 0), getOperandEntry(E: &E, Idx: 1)}, Checker);
  }

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(E: &E, Idx: 1), getOperandEntry(E: &E, Idx: 2)});
  }

  // We can demote phis if we can demote all their incoming operands.
  case Instruction::PHI: {
    const unsigned NumOps = E.getNumOperands();
    SmallVector<const TreeEntry *> Ops(NumOps);
    transform(Range: seq<unsigned>(Begin: 0, End: NumOps), d_first: Ops.begin(),
              F: [&](unsigned Idx) { return getOperandEntry(E: &E, Idx); });

    return TryProcessInstruction(BitWidth, Ops);
  }

  case Instruction::Call: {
    // Only a small set of intrinsics (abs and integer min/max) is supported.
    auto *IC = dyn_cast<IntrinsicInst>(Val: E.getMainOp());
    if (!IC)
      break;
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI: IC, TLI);
    if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
        ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
      break;
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(E: &E, Idx: 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    // Checker for two-operand min/max: unsigned variants need the truncated
    // bits to be zero; signed variants need enough sign bits on both operands.
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        auto *I = cast<Instruction>(Val: V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth);
          return MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
                                   SQ: SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 1), DL: *DL, AC, CxtI: nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 0), Mask,
                                  SQ: SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 1), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 1), Mask, SQ: SimplifyQuery(*DL)));
      });
    };
    // Checker for abs: the single operand must have enough sign bits.
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
      return all_of(Range: E.Scalars, P: [&](Value *V) {
        auto *I = cast<Instruction>(Val: V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(Op: I->getOperand(i: 0), DL: *DL, AC, CxtI: nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(V: I->getOperand(i: 0), SQ: SimplifyQuery(*DL))) ||
                MaskedValueIsZero(V: I->getOperand(i: 0), Mask, SQ: SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(Elt: getOperandEntry(E: &E, Idx: 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
    InstructionCost BestCost =
        std::numeric_limits<InstructionCost::CostType>::max();
    unsigned BestBitWidth = BitWidth;
    unsigned VF = E.Scalars.size();
    // Choose the best bitwidth based on cost estimations.
    auto Checker = [&](unsigned BitWidth, unsigned) {
      unsigned MinBW = PowerOf2Ceil(A: BitWidth);
      SmallVector<Type *> ArgTys =
          buildIntrinsicArgTypes(CI: IC, ID, VF, MinBW, TTI);
      auto VecCallCosts = getVectorCallCosts(
          CI: IC, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: IC->getContext(), NumBits: MinBW), VF),
          TTI, TLI, ArgTys);
      InstructionCost Cost = std::min(a: VecCallCosts.first, b: VecCallCosts.second);
      if (Cost < BestCost) {
        BestCost = Cost;
        BestBitWidth = BitWidth;
      }
      // Always return false so AttemptCheckBitwidth scans every width.
      return false;
    };
    [[maybe_unused]] bool NeedToExit;
    (void)AttemptCheckBitwidth(Checker, NeedToExit);
    BitWidth = BestBitWidth;
    return TryProcessInstruction(BitWidth, Operands, CallChecker);
  }

  // Otherwise, conservatively give up.
  default:
    break;
  }
  MaxDepthLevel = 1;
  return FinalAnalysis();
}
24017
24018static RecurKind getRdxKind(Value *V);
24019
24020void BoUpSLP::computeMinimumValueSizes() {
24021 // We only attempt to truncate integer expressions.
24022 bool IsStoreOrInsertElt =
24023 VectorizableTree.front()->hasState() &&
24024 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
24025 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
24026 if ((IsStoreOrInsertElt || UserIgnoreList) &&
24027 ExtraBitWidthNodes.size() <= 1 &&
24028 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
24029 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
24030 return;
24031
24032 unsigned NodeIdx = 0;
24033 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
24034 NodeIdx = 1;
24035
24036 // Ensure the roots of the vectorizable tree don't form a cycle.
24037 assert((VectorizableTree[NodeIdx]->isGather() || NodeIdx != 0 ||
24038 !VectorizableTree[NodeIdx]->UserTreeIndex) &&
24039 "Unexpected tree is graph.");
24040
24041 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
24042 // resize to the final type.
24043 bool IsTruncRoot = false;
24044 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
24045 SmallVector<unsigned> RootDemotes;
24046 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
24047 if (NodeIdx != 0 &&
24048 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24049 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24050 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
24051 IsTruncRoot = true;
24052 RootDemotes.push_back(Elt: NodeIdx);
24053 IsProfitableToDemoteRoot = true;
24054 ++NodeIdx;
24055 }
24056
24057 // Analyzed the reduction already and not profitable - exit.
24058 if (AnalyzedMinBWVals.contains(V: VectorizableTree[NodeIdx]->Scalars.front()))
24059 return;
24060
24061 SmallVector<unsigned> ToDemote;
24062 auto ComputeMaxBitWidth =
24063 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
24064 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
24065 ToDemote.clear();
24066 // Check if the root is trunc and the next node is gather/buildvector, then
24067 // keep trunc in scalars, which is free in most cases.
24068 if (E.isGather() && IsTruncRoot && E.UserTreeIndex &&
24069 !NodesToKeepBWs.contains(V: E.Idx) &&
24070 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
24071 all_of(Range: E.Scalars, P: [&](Value *V) {
24072 return V->hasOneUse() || isa<Constant>(Val: V) ||
24073 (!V->hasNUsesOrMore(N: UsesLimit) &&
24074 none_of(Range: V->users(), P: [&](User *U) {
24075 ArrayRef<TreeEntry *> TEs = getTreeEntries(V: U);
24076 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24077 if (TEs.empty() || is_contained(Range&: TEs, Element: UserTE))
24078 return false;
24079 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24080 SelectInst>(Val: U) ||
24081 isa<SIToFPInst, UIToFPInst>(Val: U) ||
24082 (UserTE->hasState() &&
24083 (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
24084 SelectInst>(Val: UserTE->getMainOp()) ||
24085 isa<SIToFPInst, UIToFPInst>(Val: UserTE->getMainOp()))))
24086 return true;
24087 unsigned UserTESz = DL->getTypeSizeInBits(
24088 Ty: UserTE->Scalars.front()->getType());
24089 if (all_of(Range&: TEs, P: [&](const TreeEntry *TE) {
24090 auto It = MinBWs.find(Val: TE);
24091 return It != MinBWs.end() &&
24092 It->second.first > UserTESz;
24093 }))
24094 return true;
24095 return DL->getTypeSizeInBits(Ty: U->getType()) > UserTESz;
24096 }));
24097 })) {
24098 ToDemote.push_back(Elt: E.Idx);
24099 const TreeEntry *UserTE = E.UserTreeIndex.UserTE;
24100 auto It = MinBWs.find(Val: UserTE);
24101 if (It != MinBWs.end())
24102 return It->second.first;
24103 unsigned MaxBitWidth =
24104 DL->getTypeSizeInBits(Ty: UserTE->Scalars.front()->getType());
24105 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
24106 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24107 MaxBitWidth = 8;
24108 return MaxBitWidth;
24109 }
24110
24111 if (!E.hasState())
24112 return 0u;
24113
24114 unsigned VF = E.getVectorFactor();
24115 Type *ScalarTy = E.Scalars.front()->getType();
24116 unsigned ScalarTyNumElements = getNumElements(Ty: ScalarTy);
24117 auto *TreeRootIT = dyn_cast<IntegerType>(Val: ScalarTy->getScalarType());
24118 if (!TreeRootIT)
24119 return 0u;
24120
24121 if (any_of(Range: E.Scalars,
24122 P: [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
24123 return 0u;
24124
24125 unsigned NumParts = ::getNumberOfParts(
24126 TTI: *TTI, VecTy: getWidenedType(ScalarTy: TreeRootIT, VF: VF * ScalarTyNumElements));
24127
24128 // The maximum bit width required to represent all the values that can be
24129 // demoted without loss of precision. It would be safe to truncate the roots
24130 // of the expression to this width.
24131 unsigned MaxBitWidth = 1u;
24132
24133 // True if the roots can be zero-extended back to their original type,
24134 // rather than sign-extended. We know that if the leading bits are not
24135 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
24136 // True.
24137 // Determine if the sign bit of all the roots is known to be zero. If not,
24138 // IsKnownPositive is set to False.
24139 bool IsKnownPositive = !IsSignedCmp && all_of(Range: E.Scalars, P: [&](Value *R) {
24140 if (isa<PoisonValue>(Val: R))
24141 return true;
24142 KnownBits Known = computeKnownBits(V: R, DL: *DL);
24143 return Known.isNonNegative();
24144 });
24145
24146 if (!IsKnownPositive && !IsTopRoot && E.UserTreeIndex &&
24147 E.UserTreeIndex.UserTE->hasState() &&
24148 E.UserTreeIndex.UserTE->getOpcode() == Instruction::UIToFP)
24149 MaxBitWidth =
24150 std::min(a: DL->getTypeSizeInBits(
24151 Ty: E.UserTreeIndex.UserTE->Scalars.front()->getType()),
24152 b: DL->getTypeSizeInBits(Ty: ScalarTy));
24153
24154 // We first check if all the bits of the roots are demanded. If they're not,
24155 // we can truncate the roots to this narrower type.
24156 for (Value *Root : E.Scalars) {
24157 if (isa<PoisonValue>(Val: Root))
24158 continue;
24159 unsigned NumSignBits = ComputeNumSignBits(Op: Root, DL: *DL, AC, CxtI: nullptr, DT);
24160 TypeSize NumTypeBits =
24161 DL->getTypeSizeInBits(Ty: Root->getType()->getScalarType());
24162 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24163 // If we can't prove that the sign bit is zero, we must add one to the
24164 // maximum bit width to account for the unknown sign bit. This preserves
24165 // the existing sign bit so we can safely sign-extend the root back to the
24166 // original type. Otherwise, if we know the sign bit is zero, we will
24167 // zero-extend the root instead.
24168 //
24169 // FIXME: This is somewhat suboptimal, as there will be cases where adding
24170 // one to the maximum bit width will yield a larger-than-necessary
24171 // type. In general, we need to add an extra bit only if we can't
24172 // prove that the upper bit of the original type is equal to the
24173 // upper bit of the proposed smaller type. If these two bits are
24174 // the same (either zero or one) we know that sign-extending from
24175 // the smaller type will result in the same value. Here, since we
24176 // can't yet prove this, we are just making the proposed smaller
24177 // type larger to ensure correctness.
24178 if (!IsKnownPositive)
24179 ++BitWidth1;
24180
24181 auto *I = dyn_cast<Instruction>(Val: Root);
24182 if (!I) {
24183 MaxBitWidth = std::max(a: BitWidth1, b: MaxBitWidth);
24184 continue;
24185 }
24186 APInt Mask = DB->getDemandedBits(I);
24187 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24188 MaxBitWidth =
24189 std::max<unsigned>(a: std::min(a: BitWidth1, b: BitWidth2), b: MaxBitWidth);
24190 }
24191
24192 if (MaxBitWidth < 8 && MaxBitWidth > 1)
24193 MaxBitWidth = 8;
24194
24195 // If the original type is large, but reduced type does not improve the reg
24196 // use - ignore it.
24197 if (NumParts > 1 &&
24198 NumParts ==
24199 ::getNumberOfParts(
24200 TTI: *TTI, VecTy: getWidenedType(ScalarTy: IntegerType::get(C&: F->getContext(),
24201 NumBits: bit_ceil(Value: MaxBitWidth)),
24202 VF)))
24203 return 0u;
24204
24205 unsigned Opcode = E.getOpcode();
24206 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
24207 Opcode == Instruction::SExt ||
24208 Opcode == Instruction::ZExt || NumParts > 1;
24209 // Conservatively determine if we can actually truncate the roots of the
24210 // expression. Collect the values that can be demoted in ToDemote and
24211 // additional roots that require investigating in Roots.
24212 DenseSet<const TreeEntry *> Visited;
24213 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
24214 bool NeedToDemote = IsProfitableToDemote;
24215
24216 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, BitWidth&: MaxBitWidth,
24217 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
24218 IsProfitableToDemote&: NeedToDemote, IsTruncRoot) ||
24219 (MaxDepthLevel <= Limit &&
24220 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
24221 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
24222 DL->getTypeSizeInBits(Ty: TreeRootIT) /
24223 DL->getTypeSizeInBits(
24224 Ty: E.getMainOp()->getOperand(i: 0)->getType()) >
24225 2)))))
24226 return 0u;
24227 // Round MaxBitWidth up to the next power-of-two.
24228 MaxBitWidth = bit_ceil(Value: MaxBitWidth);
24229
24230 return MaxBitWidth;
24231 };
24232
24233 // If we can truncate the root, we must collect additional values that might
24234 // be demoted as a result. That is, those seeded by truncations we will
24235 // modify.
24236 // Add reduction ops sizes, if any.
24237 if (UserIgnoreList &&
24238 isa<IntegerType>(Val: VectorizableTree.front()->Scalars.front()->getType())) {
24239 // Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n
24240 // x i1> to in)).
24241 if (all_of(Range: *UserIgnoreList,
24242 P: [](Value *V) {
24243 return isa<PoisonValue>(Val: V) ||
24244 cast<Instruction>(Val: V)->getOpcode() == Instruction::Add;
24245 }) &&
24246 VectorizableTree.front()->State == TreeEntry::Vectorize &&
24247 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
24248 cast<CastInst>(Val: VectorizableTree.front()->getMainOp())->getSrcTy() ==
24249 Builder.getInt1Ty()) {
24250 ReductionBitWidth = 1;
24251 } else {
24252 for (Value *V : *UserIgnoreList) {
24253 if (isa<PoisonValue>(Val: V))
24254 continue;
24255 unsigned NumSignBits = ComputeNumSignBits(Op: V, DL: *DL, AC, CxtI: nullptr, DT);
24256 TypeSize NumTypeBits = DL->getTypeSizeInBits(Ty: V->getType());
24257 unsigned BitWidth1 = NumTypeBits - NumSignBits;
24258 if (!isKnownNonNegative(V, SQ: SimplifyQuery(*DL)))
24259 ++BitWidth1;
24260 unsigned BitWidth2 = BitWidth1;
24261 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: ::getRdxKind(V))) {
24262 APInt Mask = DB->getDemandedBits(I: cast<Instruction>(Val: V));
24263 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
24264 }
24265 ReductionBitWidth =
24266 std::max(a: std::min(a: BitWidth1, b: BitWidth2), b: ReductionBitWidth);
24267 }
24268 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
24269 ReductionBitWidth = 8;
24270
24271 ReductionBitWidth = bit_ceil(Value: ReductionBitWidth);
24272 }
24273 }
24274 bool IsTopRoot = NodeIdx == 0;
24275 while (NodeIdx < VectorizableTree.size() &&
24276 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
24277 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
24278 RootDemotes.push_back(Elt: NodeIdx);
24279 ++NodeIdx;
24280 IsTruncRoot = true;
24281 }
24282 bool IsSignedCmp = false;
24283 if (UserIgnoreList &&
24284 all_of(Range: *UserIgnoreList,
24285 P: match_fn(P: m_CombineOr(L: m_SMin(L: m_Value(), R: m_Value()),
24286 R: m_SMax(L: m_Value(), R: m_Value())))))
24287 IsSignedCmp = true;
24288 while (NodeIdx < VectorizableTree.size()) {
24289 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
24290 unsigned Limit = 2;
24291 if (IsTopRoot &&
24292 ReductionBitWidth ==
24293 DL->getTypeSizeInBits(
24294 Ty: VectorizableTree.front()->Scalars.front()->getType()))
24295 Limit = 3;
24296 unsigned MaxBitWidth = ComputeMaxBitWidth(
24297 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
24298 IsTruncRoot, IsSignedCmp);
24299 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
24300 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
24301 ReductionBitWidth = bit_ceil(Value: MaxBitWidth);
24302 else if (MaxBitWidth == 0)
24303 ReductionBitWidth = 0;
24304 }
24305
24306 for (unsigned Idx : RootDemotes) {
24307 if (all_of(Range&: VectorizableTree[Idx]->Scalars, P: [&](Value *V) {
24308 uint32_t OrigBitWidth =
24309 DL->getTypeSizeInBits(Ty: V->getType()->getScalarType());
24310 if (OrigBitWidth > MaxBitWidth) {
24311 APInt Mask = APInt::getBitsSetFrom(numBits: OrigBitWidth, loBit: MaxBitWidth);
24312 return MaskedValueIsZero(V, Mask, SQ: SimplifyQuery(*DL));
24313 }
24314 return false;
24315 }))
24316 ToDemote.push_back(Elt: Idx);
24317 }
24318 RootDemotes.clear();
24319 IsTopRoot = false;
24320 IsProfitableToDemoteRoot = true;
24321
24322 if (ExtraBitWidthNodes.empty()) {
24323 NodeIdx = VectorizableTree.size();
24324 } else {
24325 unsigned NewIdx = 0;
24326 do {
24327 NewIdx = *ExtraBitWidthNodes.begin();
24328 ExtraBitWidthNodes.erase(I: ExtraBitWidthNodes.begin());
24329 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
24330 NodeIdx = NewIdx;
24331 IsTruncRoot =
24332 NodeIdx < VectorizableTree.size() &&
24333 VectorizableTree[NodeIdx]->UserTreeIndex &&
24334 VectorizableTree[NodeIdx]->UserTreeIndex.EdgeIdx == 0 &&
24335 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24336 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24337 Instruction::Trunc &&
24338 !VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->isAltShuffle();
24339 IsSignedCmp =
24340 NodeIdx < VectorizableTree.size() &&
24341 VectorizableTree[NodeIdx]->UserTreeIndex &&
24342 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->hasState() &&
24343 VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->getOpcode() ==
24344 Instruction::ICmp &&
24345 any_of(
24346 Range&: VectorizableTree[NodeIdx]->UserTreeIndex.UserTE->Scalars,
24347 P: [&](Value *V) {
24348 auto *IC = dyn_cast<ICmpInst>(Val: V);
24349 return IC && (IC->isSigned() ||
24350 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 0),
24351 SQ: SimplifyQuery(*DL)) ||
24352 !isKnownNonNegative(V: IC->getOperand(i_nocapture: 1),
24353 SQ: SimplifyQuery(*DL)));
24354 });
24355 }
24356
24357 // If the maximum bit width we compute is less than the width of the roots'
24358 // type, we can proceed with the narrowing. Otherwise, do nothing.
24359 if (MaxBitWidth == 0 ||
24360 MaxBitWidth >=
24361 cast<IntegerType>(Val: TreeRoot.front()->getType()->getScalarType())
24362 ->getBitWidth()) {
24363 if (UserIgnoreList)
24364 AnalyzedMinBWVals.insert_range(R&: TreeRoot);
24365 NodesToKeepBWs.insert_range(R&: ToDemote);
24366 continue;
24367 }
24368
24369 // Finally, map the values we can demote to the maximum bit with we
24370 // computed.
24371 for (unsigned Idx : ToDemote) {
24372 TreeEntry *TE = VectorizableTree[Idx].get();
24373 if (MinBWs.contains(Val: TE))
24374 continue;
24375 bool IsSigned = any_of(Range&: TE->Scalars, P: [&](Value *R) {
24376 if (isa<PoisonValue>(Val: R))
24377 return false;
24378 return !isKnownNonNegative(V: R, SQ: SimplifyQuery(*DL));
24379 });
24380 MinBWs.try_emplace(Key: TE, Args&: MaxBitWidth, Args&: IsSigned);
24381 }
24382 }
24383}
24384
24385PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
24386 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
24387 auto *TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
24388 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(IR&: F);
24389 auto *AA = &AM.getResult<AAManager>(IR&: F);
24390 auto *LI = &AM.getResult<LoopAnalysis>(IR&: F);
24391 auto *DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
24392 auto *AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
24393 auto *DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
24394 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
24395
24396 bool Changed = runImpl(F, SE_: SE, TTI_: TTI, TLI_: TLI, AA_: AA, LI_: LI, DT_: DT, AC_: AC, DB_: DB, ORE_: ORE);
24397 if (!Changed)
24398 return PreservedAnalyses::all();
24399
24400 PreservedAnalyses PA;
24401 PA.preserveSet<CFGAnalyses>();
24402 return PA;
24403}
24404
24405bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
24406 TargetTransformInfo *TTI_,
24407 TargetLibraryInfo *TLI_, AAResults *AA_,
24408 LoopInfo *LI_, DominatorTree *DT_,
24409 AssumptionCache *AC_, DemandedBits *DB_,
24410 OptimizationRemarkEmitter *ORE_) {
24411 if (!RunSLPVectorization)
24412 return false;
24413 SE = SE_;
24414 TTI = TTI_;
24415 TLI = TLI_;
24416 AA = AA_;
24417 LI = LI_;
24418 DT = DT_;
24419 AC = AC_;
24420 DB = DB_;
24421 DL = &F.getDataLayout();
24422
24423 Stores.clear();
24424 GEPs.clear();
24425 bool Changed = false;
24426
24427 // If the target claims to have no vector registers don't attempt
24428 // vectorization.
24429 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true))) {
24430 LLVM_DEBUG(
24431 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
24432 return false;
24433 }
24434
24435 // Don't vectorize when the attribute NoImplicitFloat is used.
24436 if (F.hasFnAttribute(Kind: Attribute::NoImplicitFloat))
24437 return false;
24438
24439 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
24440
24441 // Use the bottom up slp vectorizer to construct chains that start with
24442 // store instructions.
24443 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
24444
24445 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
24446 // delete instructions.
24447
24448 // Update DFS numbers now so that we can use them for ordering.
24449 DT->updateDFSNumbers();
24450
24451 // Scan the blocks in the function in post order.
24452 for (auto *BB : post_order(G: &F.getEntryBlock())) {
24453 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(Val: BB->getTerminator()))
24454 continue;
24455
24456 // Start new block - clear the list of reduction roots.
24457 R.clearReductionData();
24458 collectSeedInstructions(BB);
24459
24460 // Vectorize trees that end at stores.
24461 if (!Stores.empty()) {
24462 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
24463 << " underlying objects.\n");
24464 Changed |= vectorizeStoreChains(R);
24465 }
24466
24467 // Vectorize trees that end at reductions.
24468 Changed |= vectorizeChainsInBlock(BB, R);
24469
24470 // Vectorize the index computations of getelementptr instructions. This
24471 // is primarily intended to catch gather-like idioms ending at
24472 // non-consecutive loads.
24473 if (!GEPs.empty()) {
24474 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
24475 << " underlying objects.\n");
24476 Changed |= vectorizeGEPIndices(BB, R);
24477 }
24478 }
24479
24480 if (Changed) {
24481 R.optimizeGatherSequence();
24482 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
24483 }
24484 return Changed;
24485}
24486
/// Try to vectorize \p Chain, a list of stores already ordered by consecutive
/// pointer distance, which starts at offset \p Idx of the enclosing store
/// sequence.
/// \param MinVF minimum vectorization factor the caller will accept.
/// \param Size [out] bookkeeping value for the caller: 0 if no tree was
///        built, 1 or 2 for early-rejected/cut-off trees, otherwise the
///        canonical graph size of the built tree.
/// \returns true if the chain was vectorized, false if it was analyzed and
///          rejected, std::nullopt if the tree was gathered/not scheduled so
///          the caller may retry with a different slice.
std::optional<bool>
SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                       unsigned Idx, unsigned MinVF,
                                       unsigned &Size) {
  Size = 0;
  LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
                    << "\n");
  const unsigned Sz = R.getVectorElementSize(V: Chain[0]);
  unsigned VF = Chain.size();

  // Reject element sizes / chain lengths the target cannot form a full vector
  // from, unless the non-power-of-2 escape hatch below applies.
  if (!has_single_bit(Value: Sz) ||
      !hasFullVectorsOrPowerOf2(
          TTI: *TTI, Ty: cast<StoreInst>(Val: Chain.front())->getValueOperand()->getType(),
          Sz: VF) ||
      VF < 2 || VF < MinVF) {
    // Check if vectorizing with a non-power-of-2 VF should be considered. At
    // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
    // all vector lanes are used.
    if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
      return false;
  }

  LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
                    << "\n");

  // Collect the unique stored values; these become the would-be tree roots.
  SetVector<Value *> ValOps;
  for (Value *V : Chain)
    ValOps.insert(X: cast<StoreInst>(Val: V)->getValueOperand());
  // Operands are not same/alt opcodes or non-power-of-2 uniques - exit.
  InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
  InstructionsState S = Analysis.buildInstructionsState(
      VL: ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true);
  if (all_of(Range&: ValOps, P: IsaPred<Instruction>) && ValOps.size() > 1) {
    DenseSet<Value *> Stores(Chain.begin(), Chain.end());
    bool IsAllowedSize =
        hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ValOps.front()->getType(),
                                 Sz: ValOps.size()) ||
        (VectorizeNonPowerOf2 && has_single_bit(Value: ValOps.size() + 1));
    // Bail out early when the operands either have disallowed sizes with
    // non-removable main op / external uses, or too many unique values with
    // no common state - building the tree would be wasted work.
    if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
         (!S.getMainOp()->isSafeToRemove() ||
          any_of(Range: ValOps.getArrayRef(),
                 P: [&](Value *V) {
                   return !isa<ExtractElementInst>(Val: V) &&
                          (V->getNumUses() > Chain.size() ||
                           any_of(Range: V->users(), P: [&](User *U) {
                             return !Stores.contains(V: U);
                           }));
                 }))) ||
        (ValOps.size() > Chain.size() / 2 && !S)) {
      Size = (!IsAllowedSize && S) ? 1 : 2;
      return false;
    }
  }
  R.buildTree(Roots: Chain);
  // Check if tree tiny and store itself or its value is not vectorized.
  if (R.isTreeTinyAndNotFullyVectorizable()) {
    // Tri-state result: nullopt tells the caller the slice was unschedulable
    // or gathered, so a different slice may still succeed.
    if (R.isGathered(V: Chain.front()) ||
        R.isNotScheduled(V: cast<StoreInst>(Val: Chain.front())->getValueOperand()))
      return std::nullopt;
    Size = R.getCanonicalGraphSize();
    return false;
  }
  if (R.isProfitableToReorder()) {
    R.reorderTopToBottom();
    R.reorderBottomToTop();
  }
  R.transformNodes();
  R.computeMinimumValueSizes();

  InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
  R.buildExternalUses();

  Size = R.getCanonicalGraphSize();
  if (S && S.getOpcode() == Instruction::Load)
    Size = 2; // cut off masked gather small trees
  InstructionCost Cost = R.getTreeCost(TreeCost);

  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
  // Negative cost means the vector form is cheaper; require the saving to
  // beat the configured threshold before rewriting the IR.
  if (Cost < -SLPCostThreshold) {
    LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");

    using namespace ore;

    R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "StoresVectorized",
                                          cast<StoreInst>(Val: Chain[0]))
                     << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                     << " and with tree size "
                     << NV("TreeSize", R.getTreeSize()));

    R.vectorizeTree();
    return true;
  }

  return false;
}
24582
24583/// Checks if the quadratic mean deviation is less than 90% of the mean size.
24584static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes) {
24585 unsigned Num = 0;
24586 uint64_t Sum = std::accumulate(
24587 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
24588 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24589 unsigned Size = Val.first;
24590 if (Size == 1)
24591 return V;
24592 ++Num;
24593 return V + Size;
24594 });
24595 if (Num == 0)
24596 return true;
24597 uint64_t Mean = Sum / Num;
24598 if (Mean == 0)
24599 return true;
24600 uint64_t Dev = std::accumulate(
24601 first: Sizes.begin(), last: Sizes.end(), init: static_cast<uint64_t>(0),
24602 binary_op: [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
24603 unsigned P = Val.first;
24604 if (P == 1)
24605 return V;
24606 return V + (P - Mean) * (P - Mean);
24607 }) /
24608 Num;
24609 return Dev * 96 / (Mean * Mean) == 0;
24610}
24611
24612namespace {
24613
24614/// A group of stores that we'll try to bundle together using vector ops.
24615/// They are ordered using the signed distance of their address operand to the
24616/// address of this group's BaseInstr.
24617class RelatedStoreInsts {
24618public:
24619 RelatedStoreInsts(unsigned BaseInstrIdx, ArrayRef<StoreInst *> AllStores)
24620 : AllStores(AllStores) {
24621 reset(NewBaseInstr: BaseInstrIdx);
24622 }
24623
24624 void reset(unsigned NewBaseInstr) {
24625 assert(NewBaseInstr < AllStores.size() &&
24626 "Instruction index out of bounds");
24627 BaseInstrIdx = NewBaseInstr;
24628 Instrs.clear();
24629 insertOrLookup(InstrIdx: NewBaseInstr, PtrDist: 0);
24630 }
24631
24632 /// Tries to insert \p InstrIdx as the store with a pointer distance of
24633 /// \p PtrDist.
24634 /// Does nothing if there is already a store with that \p PtrDist.
24635 /// \returns The previously associated Instruction index, or std::nullopt
24636 std::optional<unsigned> insertOrLookup(unsigned InstrIdx, int64_t PtrDist) {
24637 auto [It, Inserted] = Instrs.emplace(args&: PtrDist, args&: InstrIdx);
24638 return Inserted ? std::nullopt : std::make_optional(t&: It->second);
24639 }
24640
24641 using DistToInstMap = std::map<int64_t, unsigned>;
24642 const DistToInstMap &getStores() const { return Instrs; }
24643
24644 /// If \p SI is related to this group of stores, return the distance of its
24645 /// pointer operand to the one the group's BaseInstr.
24646 std::optional<int64_t> getPointerDiff(StoreInst &SI, const DataLayout &DL,
24647 ScalarEvolution &SE) const {
24648 StoreInst &BaseStore = *AllStores[BaseInstrIdx];
24649 return getPointersDiff(
24650 ElemTyA: BaseStore.getValueOperand()->getType(), PtrA: BaseStore.getPointerOperand(),
24651 ElemTyB: SI.getValueOperand()->getType(), PtrB: SI.getPointerOperand(), DL, SE,
24652 /*StrictCheck=*/true);
24653 }
24654
24655 /// Recompute the pointer distances to be based on \p NewBaseInstIdx.
24656 /// Stores whose index is less than \p MinSafeIdx will be dropped.
24657 void rebase(unsigned MinSafeIdx, unsigned NewBaseInstIdx,
24658 int64_t DistFromCurBase) {
24659 DistToInstMap PrevSet = std::move(Instrs);
24660 reset(NewBaseInstr: NewBaseInstIdx);
24661
24662 // Re-insert stores that come after MinSafeIdx to try and vectorize them
24663 // again. Their distance will be "rebased" to use NewBaseInstIdx as
24664 // reference.
24665 for (auto [Dist, InstIdx] : PrevSet) {
24666 if (InstIdx >= MinSafeIdx)
24667 insertOrLookup(InstrIdx: InstIdx, PtrDist: Dist - DistFromCurBase);
24668 }
24669 }
24670
24671 /// Remove all stores that have been vectorized from this group.
24672 void clearVectorizedStores(const BoUpSLP::ValueSet &VectorizedStores) {
24673 DistToInstMap::reverse_iterator LastVectorizedStore = find_if(
24674 Range: reverse(C&: Instrs), P: [&](const std::pair<int64_t, unsigned> &DistAndIdx) {
24675 return VectorizedStores.contains(Ptr: AllStores[DistAndIdx.second]);
24676 });
24677
24678 // Get a forward iterator pointing after the last vectorized store and erase
24679 // all stores before it so we don't try to vectorize them again.
24680 DistToInstMap::iterator VectorizedStoresEnd = LastVectorizedStore.base();
24681 Instrs.erase(first: Instrs.begin(), last: VectorizedStoresEnd);
24682 }
24683
24684private:
24685 /// The index of the Base instruction, i.e. the one with a 0 pointer distance.
24686 unsigned BaseInstrIdx;
24687
24688 /// Maps a pointer distance from \p BaseInstrIdx to an instruction index.
24689 DistToInstMap Instrs;
24690
24691 /// Reference to all the stores in the BB being analyzed.
24692 ArrayRef<StoreInst *> AllStores;
24693};
24694
24695} // end anonymous namespace
24696
/// Groups the given \p Stores into chains of consecutive pointer distances
/// and tries to vectorize each chain with progressively smaller (and, on
/// later attempts, larger) vectorization factors. \p Visited memoizes
/// already-tried (front/back store, front/back value, size) combinations
/// across calls. Returns true if any store was vectorized.
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Walks a distance-sorted map of stores, cutting a chain at every gap in
  // the pointer distances, and attempts vectorization on each maximal
  // consecutive run.
  auto TryToVectorize = [&](const RelatedStoreInsts::DistToInstMap &StoreSeq) {
    int64_t PrevDist = -1;
    BoUpSLP::ValueList Operands;
    // Collect the chain into a list.
    for (auto [Idx, Data] : enumerate(First: StoreSeq)) {
      auto &[Dist, InstIdx] = Data;
      if (Operands.empty() || Dist - PrevDist == 1) {
        Operands.push_back(Elt: Stores[InstIdx]);
        PrevDist = Dist;
        if (Idx != StoreSeq.size() - 1)
          continue;
      }
      // On every exit path below, restart the chain from the current store.
      llvm::scope_exit E([&, &Dist = Dist, &InstIdx = InstIdx]() {
        Operands.clear();
        Operands.push_back(Elt: Stores[InstIdx]);
        PrevDist = Dist;
      });

      // Skip trivially-short chains and chains we already tried.
      if (Operands.size() <= 1 ||
          !Visited
               .insert(V: {Operands.front(),
                          cast<StoreInst>(Val: Operands.front())->getValueOperand(),
                          Operands.back(),
                          cast<StoreInst>(Val: Operands.back())->getValueOperand(),
                          Operands.size()})
               .second)
        continue;

      unsigned MaxVecRegSize = R.getMaxVecRegSize();
      unsigned EltSize = R.getVectorElementSize(V: Operands[0]);
      unsigned MaxElts = llvm::bit_floor(Value: MaxVecRegSize / EltSize);

      unsigned MaxVF =
          std::min(a: R.getMaximumVF(ElemWidth: EltSize, Opcode: Instruction::Store), b: MaxElts);
      auto *Store = cast<StoreInst>(Val: Operands[0]);
      Type *StoreTy = Store->getValueOperand()->getType();
      Type *ValueTy = StoreTy;
      if (auto *Trunc = dyn_cast<TruncInst>(Val: Store->getValueOperand()))
        ValueTy = Trunc->getSrcTy();
      // When REVEC is enabled, StoreTy and ValueTy may be FixedVectorType. But
      // getStoreMinimumVF only support scalar type as arguments. As a result,
      // we need to use the element type of StoreTy and ValueTy to retrieve the
      // VF and then transform it back.
      // Remember: VF is defined as the number we want to vectorize, not the
      // number of elements in the final vector.
      Type *StoreScalarTy = StoreTy->getScalarType();
      unsigned MinVF = PowerOf2Ceil(A: TTI->getStoreMinimumVF(
          VF: R.getMinVF(Sz: DL->getTypeStoreSizeInBits(Ty: StoreScalarTy)), ScalarMemTy: StoreScalarTy,
          ScalarValTy: ValueTy->getScalarType()));
      MinVF /= getNumElements(Ty: StoreTy);
      MinVF = std::max<unsigned>(a: 2, b: MinVF);

      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      unsigned NonPowerOf2VF = 0;
      if (VectorizeNonPowerOf2) {
        // First try vectorizing with a non-power-of-2 VF. At the moment, only
        // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
        // lanes are used.
        unsigned CandVF = std::clamp<unsigned>(val: Operands.size(), lo: MinVF, hi: MaxVF);
        if (has_single_bit(Value: CandVF + 1)) {
          NonPowerOf2VF = CandVF;
          assert(NonPowerOf2VF != MaxVF &&
                 "Non-power-of-2 VF should not be equal to MaxVF");
        }
      }

      // MaxRegVF represents the number of instructions (scalar, or vector in
      // case of revec) that can be vectorized to naturally fit in a vector
      // register.
      unsigned MaxRegVF = MaxVF;

      MaxVF = std::min<unsigned>(a: MaxVF, b: bit_floor(Value: Operands.size()));
      if (MaxVF < MinVF) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                          << ") < "
                          << "MinVF (" << MinVF << ")\n");
        continue;
      }

      // Candidate VFs, from largest to smallest (halving each step).
      SmallVector<unsigned> CandidateVFs;
      for (unsigned VF = std::max(a: MaxVF, b: NonPowerOf2VF); VF >= MinVF;
           VF = divideCeil(Numerator: VF, Denominator: 2))
        CandidateVFs.push_back(Elt: VF);

      unsigned End = Operands.size();
      unsigned Repeat = 0;
      constexpr unsigned MaxAttempts = 4;
      // first: the best TreeSize from all prior loops over CandidateVFs, gets
      // updated after looping through CandidateVFs
      // second: the best TreeSize from all prior loops including the current
      // one
      llvm::SmallVector<std::pair<unsigned, unsigned>> RangeSizesStorage(
          Operands.size(), {1, 1});
      // The `slice` and `drop_front` interfaces are convenient
      const auto RangeSizes = MutableArrayRef(RangeSizesStorage);
      DenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
      // first == 0 marks a store that has already been vectorized.
      auto IsNotVectorized = [](const std::pair<unsigned, unsigned> &P) {
        return P.first > 0;
      };
      auto IsVectorized = [](const std::pair<unsigned, unsigned> &P) {
        return P.first == 0;
      };
      auto VFIsProfitable = [](unsigned Size,
                               const std::pair<unsigned, unsigned> &P) {
        return Size >= P.first;
      };
      auto FirstSizeSame = [](unsigned Size,
                              const std::pair<unsigned, unsigned> &P) {
        return Size == P.first;
      };
      while (true) {
        ++Repeat;
        bool RepeatChanged = false;
        bool AnyProfitableGraph = false;
        for (unsigned VF : CandidateVFs) {
          AnyProfitableGraph = false;
          unsigned FirstUnvecStore = std::distance(
              first: RangeSizes.begin(), last: find_if(Range: RangeSizes, P: IsNotVectorized));

          // Form slices of size VF starting from FirstUnvecStore and try to
          // vectorize them.
          while (FirstUnvecStore < End) {
            unsigned FirstVecStore = std::distance(
                first: RangeSizes.begin(),
                last: find_if(Range: RangeSizes.drop_front(N: FirstUnvecStore), P: IsVectorized));
            unsigned MaxSliceEnd = FirstVecStore >= End ? End : FirstVecStore;
            for (unsigned SliceStartIdx = FirstUnvecStore;
                 SliceStartIdx + VF <= MaxSliceEnd;) {
              // Skip slices whose recorded tree sizes are too non-uniform.
              if (!checkTreeSizes(Sizes: RangeSizes.slice(N: SliceStartIdx, M: VF))) {
                ++SliceStartIdx;
                continue;
              }
              ArrayRef<Value *> Slice =
                  ArrayRef(Operands).slice(N: SliceStartIdx, M: VF);
              assert(all_of(Slice,
                            [&](Value *V) {
                              return cast<StoreInst>(V)
                                         ->getValueOperand()
                                         ->getType() ==
                                     cast<StoreInst>(Slice.front())
                                         ->getValueOperand()
                                         ->getType();
                            }) &&
                     "Expected all operands of same type.");
              if (!NonSchedulable.empty()) {
                auto [NonSchedSizeMax, NonSchedSizeMin] =
                    NonSchedulable.lookup(Val: Slice.front());
                if (NonSchedSizeMax > 0 && NonSchedSizeMin <= VF) {
                  // VF is too ambitious. Try to vectorize another slice before
                  // trying a smaller VF.
                  SliceStartIdx += NonSchedSizeMax;
                  continue;
                }
              }
              unsigned TreeSize;
              std::optional<bool> Res =
                  vectorizeStoreChain(Chain: Slice, R, Idx: SliceStartIdx, MinVF, Size&: TreeSize);
              if (!Res) {
                // Update the range of non schedulable VFs for slices starting
                // at SliceStartIdx.
                NonSchedulable
                    .try_emplace(Key: Slice.front(), Args: std::make_pair(x&: VF, y&: VF))
                    .first->getSecond()
                    .second = VF;
              } else if (*Res) {
                // Mark the vectorized stores so that we don't vectorize them
                // again.
                VectorizedStores.insert_range(R&: Slice);
                AnyProfitableGraph = RepeatChanged = Changed = true;
                // If we vectorized initial block, no need to try to vectorize
                // it again.
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(N: SliceStartIdx, M: VF))
                  P.first = P.second = 0;
                if (SliceStartIdx < FirstUnvecStore + MinVF) {
                  // The leftover prefix is too short to ever form a slice;
                  // mark it vectorized so it is not revisited.
                  for (std::pair<unsigned, unsigned> &P : RangeSizes.slice(
                           N: FirstUnvecStore, M: SliceStartIdx - FirstUnvecStore))
                    P.first = P.second = 0;
                  FirstUnvecStore = SliceStartIdx + VF;
                }
                if (SliceStartIdx > MaxSliceEnd - VF - MinVF) {
                  // Same for a too-short suffix; shrink the working range.
                  for (std::pair<unsigned, unsigned> &P :
                       RangeSizes.slice(N: SliceStartIdx + VF,
                                        M: MaxSliceEnd - (SliceStartIdx + VF)))
                    P.first = P.second = 0;
                  if (MaxSliceEnd == End)
                    End = SliceStartIdx;
                  MaxSliceEnd = SliceStartIdx;
                }
                SliceStartIdx += VF;
                continue;
              }
              if (VF > 2 && Res &&
                  !all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
                          P: std::bind(f&: VFIsProfitable, args&: TreeSize, args: _1))) {
                SliceStartIdx += VF;
                continue;
              }
              // Check for the very big VFs that we're not rebuilding same
              // trees, just with larger number of elements.
              if (VF > MaxRegVF && TreeSize > 1 &&
                  all_of(Range: RangeSizes.slice(N: SliceStartIdx, M: VF),
                         P: std::bind(f&: FirstSizeSame, args&: TreeSize, args: _1))) {
                SliceStartIdx += VF;
                while (SliceStartIdx != MaxSliceEnd &&
                       RangeSizes[SliceStartIdx].first == TreeSize)
                  ++SliceStartIdx;
                continue;
              }
              if (TreeSize > 1)
                for (std::pair<unsigned, unsigned> &P :
                     RangeSizes.slice(N: SliceStartIdx, M: VF))
                  P.second = std::max(a: P.second, b: TreeSize);
              ++SliceStartIdx;
              AnyProfitableGraph = true;
            }
            if (FirstUnvecStore >= End)
              break;
            if (MaxSliceEnd - FirstUnvecStore < VF &&
                MaxSliceEnd - FirstUnvecStore >= MinVF)
              AnyProfitableGraph = true;
            FirstUnvecStore = std::distance(
                first: RangeSizes.begin(),
                last: find_if(Range: RangeSizes.drop_front(N: MaxSliceEnd), P: IsNotVectorized));
          }
          if (!AnyProfitableGraph && VF >= MaxRegVF && has_single_bit(Value: VF))
            break;
          // For the MaxRegVF case, save RangeSizes to limit compile time
          if (VF == MaxRegVF)
            for (std::pair<unsigned, unsigned> &P : RangeSizes)
              if (P.first != 0)
                P.first = std::max(a: P.second, b: P.first);
        }
        // All values vectorized - exit.
        if (all_of(Range: RangeSizes, P: IsVectorized))
          break;
        // Check if tried all attempts or no need for the last attempts at all.
        if (Repeat >= MaxAttempts ||
            (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
          break;
        constexpr unsigned StoresLimit = 64;
        const unsigned MaxTotalNum = std::min<unsigned>(
            a: Operands.size(),
            b: static_cast<unsigned>(
                End -
                std::distance(first: RangeSizes.begin(),
                              last: find_if(Range: RangeSizes, P: IsNotVectorized)) +
                1));
        unsigned VF = bit_ceil(Value: CandidateVFs.front()) * 2;
        if (VF > MaxTotalNum || VF >= StoresLimit)
          break;
        for (std::pair<unsigned, unsigned> &P : RangeSizes) {
          if (P.first != 0)
            P.first = std::max(a: P.second, b: P.first);
        }
        // Attempt again to vectorize even larger chains if all previous
        // attempts were unsuccessful because of the cost issues.
        CandidateVFs.clear();
        unsigned Limit =
            getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: StoreTy, Sz: MaxTotalNum);
        if (bit_floor(Value: Limit) == VF && Limit != VF)
          CandidateVFs.push_back(Elt: Limit);
        CandidateVFs.push_back(Elt: VF);
      }
    }
  };

  /// Groups of stores to vectorize
  SmallVector<RelatedStoreInsts> SortedStores;

  // Inserts the specified store SI with the given index Idx to the set of the
  // stores. If the store with the same distance is found already - stop
  // insertion, try to vectorize already found stores. If some stores from this
  // sequence were not vectorized - try to vectorize them with the new store
  // later. But this logic is applied only to the stores, that come before the
  // previous store with the same distance.
  // Example:
  // 1. store x, %p
  // 2. store y, %p+1
  // 3. store z, %p+2
  // 4. store a, %p
  // 5. store b, %p+3
  // - Scan this from the last to first store. The very first bunch of stores is
  // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
  // vector).
  // - The next store in the list - #1 - has the same distance from store #5 as
  // the store #4.
  // - Try to vectorize sequence of stores 4,2,3,5.
  // - If all these stores are vectorized - just drop them.
  // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
  // - Start new stores sequence.
  // The new bunch of stores is {1, {1, 0}}.
  // - Add the stores from previous sequence, that were not vectorized.
  // Here we consider the stores in the reversed order, rather they are used in
  // the IR (Stores are reversed already, see vectorizeStoreChains() function).
  // Store #3 can be added -> comes after store #4 with the same distance as
  // store #1.
  // Store #5 cannot be added - comes before store #4.
  // This logic allows to improve the compile time, we assume that the stores
  // after previous store with the same distance most likely have memory
  // dependencies and no need to waste compile time to try to vectorize them.
  // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
  auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
    // Find the first existing group SI has a computable pointer distance to.
    std::optional<int64_t> PtrDist;
    auto *RelatedStores = find_if(
        Range&: SortedStores, P: [&PtrDist, SI, this](const RelatedStoreInsts &StoreSeq) {
          PtrDist = StoreSeq.getPointerDiff(SI&: *SI, DL: *DL, SE&: *SE);
          return PtrDist.has_value();
        });

    // We did not find a comparable store, start a new group.
    if (RelatedStores == SortedStores.end()) {
      SortedStores.emplace_back(Args&: Idx, Args&: Stores);
      return;
    }

    // If there is already a store in the group with the same PtrDiff, try to
    // vectorize the existing instructions before adding the current store.
    // Otherwise, insert this store and keep collecting.
    if (std::optional<unsigned> PrevInst =
            RelatedStores->insertOrLookup(InstrIdx: Idx, PtrDist: *PtrDist)) {
      TryToVectorize(RelatedStores->getStores());
      RelatedStores->clearVectorizedStores(VectorizedStores);
      RelatedStores->rebase(/*MinSafeIdx=*/*PrevInst + 1,
                            /*NewBaseInstIdx=*/Idx,
                            /*DistFromCurBase=*/*PtrDist);
    }
  };
  Type *PrevValTy = nullptr;
  for (auto [I, SI] : enumerate(First&: Stores)) {
    if (R.isDeleted(I: SI))
      continue;
    if (!PrevValTy)
      PrevValTy = SI->getValueOperand()->getType();
    // Check that we do not try to vectorize stores of different types.
    if (PrevValTy != SI->getValueOperand()->getType()) {
      for (RelatedStoreInsts &StoreSeq : SortedStores)
        TryToVectorize(StoreSeq.getStores());
      SortedStores.clear();
      PrevValTy = SI->getValueOperand()->getType();
    }
    FillStoresSet(I, SI);
  }

  // Final vectorization attempt.
  for (RelatedStoreInsts &StoreSeq : SortedStores)
    TryToVectorize(StoreSeq.getStores());

  return Changed;
}
25062
25063void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
25064 // Initialize the collections. We will make a single pass over the block.
25065 Stores.clear();
25066 GEPs.clear();
25067
25068 // Visit the store and getelementptr instructions in BB and organize them in
25069 // Stores and GEPs according to the underlying objects of their pointer
25070 // operands.
25071 for (Instruction &I : *BB) {
25072 // Ignore store instructions that are volatile or have a pointer operand
25073 // that doesn't point to a scalar type.
25074 if (auto *SI = dyn_cast<StoreInst>(Val: &I)) {
25075 if (!SI->isSimple())
25076 continue;
25077 if (!isValidElementType(Ty: SI->getValueOperand()->getType()))
25078 continue;
25079 Stores[getUnderlyingObject(V: SI->getPointerOperand())].push_back(Elt: SI);
25080 }
25081
25082 // Ignore getelementptr instructions that have more than one index, a
25083 // constant index, or a pointer operand that doesn't point to a scalar
25084 // type.
25085 else if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: &I)) {
25086 if (GEP->getNumIndices() != 1)
25087 continue;
25088 Value *Idx = GEP->idx_begin()->get();
25089 if (isa<Constant>(Val: Idx))
25090 continue;
25091 if (!isValidElementType(Ty: Idx->getType()))
25092 continue;
25093 if (GEP->getType()->isVectorTy())
25094 continue;
25095 GEPs[GEP->getPointerOperand()].push_back(Elt: GEP);
25096 }
25097 }
25098}
25099
/// Tries to vectorize the list of scalars \p VL as one or more bundles,
/// starting from the maximum vectorization factor and shrinking it until a
/// profitable SLP tree is found. \p MaxVFOnly restricts attempts to
/// full-width bundles only. Returns true if any vectorization happened.
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           bool MaxVFOnly) {
  // Nothing to vectorize with fewer than two scalars.
  if (VL.size() < 2)
    return false;

  LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
                    << VL.size() << ".\n");

  // Check that all of the parts are instructions of the same type,
  // we permit an alternate opcode via InstructionsState.
  InstructionsState S = getSameOpcode(VL, TLI: *TLI);
  if (!S)
    return false;

  Instruction *I0 = S.getMainOp();
  // Make sure invalid types (including vector type) are rejected before
  // determining vectorization factor for scalar instructions.
  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isa<InsertElementInst>(Val: V) && !isValidElementType(Ty)) {
      // NOTE: the following will give user internal llvm type name, which may
      // not be useful.
      R.getORE()->emit(RemarkBuilder: [&]() {
        std::string TypeStr;
        llvm::raw_string_ostream OS(TypeStr);
        Ty->print(O&: OS);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << TypeStr + " is unsupported by vectorizer";
      });
      return false;
    }
  }

  // Compute the range of vectorization factors to try: clamp the widest
  // full-register width to the target's maximum VF for this opcode.
  Type *ScalarTy = getValueType(V: VL[0]);
  unsigned Sz = R.getVectorElementSize(V: I0);
  unsigned MinVF = R.getMinVF(Sz);
  unsigned MaxVF = std::max<unsigned>(
      a: getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: ScalarTy, Sz: VL.size()), b: MinVF);
  MaxVF = std::min(a: R.getMaximumVF(ElemWidth: Sz, Opcode: S.getOpcode()), b: MaxVF);
  if (MaxVF < 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  bool Changed = false;
  bool CandidateFound = false;
  InstructionCost MinCost = SLPCostThreshold.getValue();

  // Outer loop walks VF downwards; inner loop slides a window of ActualVF
  // scalars across VL starting at the first not-yet-vectorized element.
  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF = getFloorFullVectorNumberOfElements(TTI: *TTI, Ty: I0->getType(), Sz: VF - 1)) {
    // No actual vectorization should happen, if number of parts is the same as
    // provided vectorization factor (i.e. the scalar type is used for vector
    // code during codegen).
    auto *VecTy = getWidenedType(ScalarTy, VF);
    if (TTI->getNumberOfParts(Tp: VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned ActualVF = std::min(a: MaxInst - I, b: VF);

      if (!hasFullVectorsOrPowerOf2(TTI: *TTI, Ty: ScalarTy, Sz: ActualVF))
        continue;

      if (MaxVFOnly && ActualVF < MaxVF)
        break;
      if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
        break;

      // Collect ActualVF live (not deleted) scalars into the candidate bundle.
      SmallVector<Value *> Ops(ActualVF, nullptr);
      unsigned Idx = 0;
      for (Value *V : VL.drop_front(N: I)) {
        // Check that a previous iteration of this loop did not delete the
        // Value.
        if (auto *Inst = dyn_cast<Instruction>(Val: V);
            !Inst || !R.isDeleted(I: Inst)) {
          Ops[Idx] = V;
          ++Idx;
          if (Idx == ActualVF)
            break;
        }
      }
      // Not enough vectorizable instructions - exit.
      if (Idx != ActualVF)
        break;

      LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                        << "\n");

      // Build the SLP tree for the bundle and evaluate its cost.
      R.buildTree(Roots: Ops);
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;
      if (R.isProfitableToReorder()) {
        R.reorderTopToBottom();
        R.reorderBottomToTop(IgnoreReorder: !isa<InsertElementInst>(Val: Ops.front()));
      }
      R.transformNodes();
      R.computeMinimumValueSizes();
      InstructionCost TreeCost = R.calculateTreeCostAndTrimNonProfitable();
      R.buildExternalUses();

      InstructionCost Cost = R.getTreeCost(TreeCost);
      CandidateFound = true;
      MinCost = std::min(a: MinCost, b: Cost);

      LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                        << " for VF=" << ActualVF << "\n");
      // Negative cost (below -SLPCostThreshold) means vectorization pays off.
      if (Cost < -SLPCostThreshold) {
        LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptDiag: OptimizationRemark(SV_NAME, "VectorizedList",
                                                  cast<Instruction>(Val: Ops[0]))
                                 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                                 << " and with tree size "
                                 << ore::NV("TreeSize", R.getTreeSize()));

        R.vectorizeTree();
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  // Emit a remark explaining why nothing was vectorized.
  if (!Changed && CandidateFound) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}
25244
namespace {

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction instructions that has values
/// that can be put into a vector as its leaves. For example:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its leaf values and "+" as its reduction
/// instructions. A reduction can feed into a store or a binary operation
/// feeding a phi.
///    ...
///    \  /
///     +
///     |
///  phi +=
///
///  Or:
///    ...
///    \  /
///     +
///     |
///   *p =
///
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  /// Reduction operations, one list per role: for cmp+select min/max
  /// reductions list 0 holds compares and list 1 holds selects; otherwise a
  /// single list of the reduction instructions.
  ReductionOpsListType ReductionOps;
  /// List of possibly reduced values.
  SmallVector<SmallVector<Value *>> ReducedVals;
  /// Maps reduced value to the corresponding reduction operation.
  SmallDenseMap<Value *, SmallVector<Instruction *>, 16> ReducedValsToOps;
  /// Root of the matched reduction tree (tracked weakly so deletion of the
  /// instruction invalidates the handle rather than dangling).
  WeakTrackingVH ReductionRoot;
  /// The type of reduction operation.
  RecurKind RdxKind;
  /// Checks if the optimization of original scalar identity operations on
  /// matched horizontal reductions is enabled and allowed.
  bool IsSupportedHorRdxIdentityOp = false;
  /// The minimum number of the reduced values.
  const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
  /// Contains vector values for reduction including their scale factor and
  /// signedness. The last bool is true, if the value was reduced in-tree.
  SmallVector<std::tuple<WeakTrackingVH, unsigned, bool, bool>>
      VectorValuesAndScales;
25293
25294 static bool isCmpSelMinMax(Instruction *I) {
25295 return match(V: I, P: m_Select(C: m_Cmp(), L: m_Value(), R: m_Value())) &&
25296 RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: getRdxKind(V: I));
25297 }
25298
25299 // And/or are potentially poison-safe logical patterns like:
25300 // select x, y, false
25301 // select x, true, y
25302 static bool isBoolLogicOp(Instruction *I) {
25303 return isa<SelectInst>(Val: I) &&
25304 (match(V: I, P: m_LogicalAnd()) || match(V: I, P: m_LogicalOr()));
25305 }
25306
25307 /// Checks if instruction is associative and can be vectorized.
25308 static bool isVectorizable(RecurKind Kind, Instruction *I,
25309 bool TwoElementReduction = false) {
25310 if (Kind == RecurKind::None)
25311 return false;
25312
25313 // Integer ops that map to select instructions or intrinsics are fine.
25314 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
25315 isBoolLogicOp(I))
25316 return true;
25317
25318 // No need to check for associativity, if 2 reduced values.
25319 if (TwoElementReduction)
25320 return true;
25321
25322 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
25323 // FP min/max are associative except for NaN and -0.0. We do not
25324 // have to rule out -0.0 here because the intrinsic semantics do not
25325 // specify a fixed result for it.
25326 return I->getFastMathFlags().noNaNs();
25327 }
25328
25329 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
25330 return true;
25331
25332 return I->isAssociative();
25333 }
25334
25335 static Value *getRdxOperand(Instruction *I, unsigned Index) {
25336 // Poison-safe 'or' takes the form: select X, true, Y
25337 // To make that work with the normal operand processing, we skip the
25338 // true value operand.
25339 // TODO: Change the code and data structures to handle this without a hack.
25340 if (getRdxKind(V: I) == RecurKind::Or && isa<SelectInst>(Val: I) && Index == 1)
25341 return I->getOperand(i: 2);
25342 return I->getOperand(i: Index);
25343 }
25344
  /// Creates reduction operation with the current opcode.
  /// \param UseSelect if true, min/max and logical and/or are emitted as
  /// cmp+select (or select with constant arm) instead of intrinsics/binops.
  static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
                         Value *RHS, const Twine &Name, bool UseSelect) {
    Type *OpTy = LHS->getType();
    assert(OpTy == RHS->getType() && "Expected LHS and RHS of same type");
    switch (Kind) {
    case RecurKind::Or: {
      // Poison-safe form: select LHS, true, RHS. Only valid when the operand
      // type matches the cmp result type (i1 or vector of i1).
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
        return Builder.CreateSelectWithUnknownProfile(
            C: LHS, True: ConstantInt::getAllOnesValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
            False: RHS, DEBUG_TYPE, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::And: {
      // Poison-safe form: select LHS, RHS, false.
      if (UseSelect && OpTy == CmpInst::makeCmpResultType(opnd_type: OpTy))
        return Builder.CreateSelectWithUnknownProfile(
            C: LHS, True: RHS,
            False: ConstantInt::getNullValue(Ty: CmpInst::makeCmpResultType(opnd_type: OpTy)),
            DEBUG_TYPE, Name);
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      // Plain associative binops map directly to their IR opcode.
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
      return Builder.CreateBinOp(Opc: (Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                 Name);
    }
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
      // Integer min/max: either cmp+select (when requested) or fall through
      // to the min/max intrinsic form below.
      if (UseSelect) {
        CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(RK: Kind);
        Value *Cmp = Builder.CreateICmp(P: Pred, LHS, RHS, Name);
        return Builder.CreateSelectWithUnknownProfile(C: Cmp, True: LHS, False: RHS, DEBUG_TYPE,
                                                      Name);
      }
      [[fallthrough]];
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum: {
      Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(RK: Kind);
      return Builder.CreateBinaryIntrinsic(ID: Id, LHS, RHS);
    }
    default:
      llvm_unreachable("Unknown reduction operation.");
    }
  }
25403
  /// Creates reduction operation with the current opcode with the IR flags
  /// from \p ReductionOps, dropping nuw/nsw flags.
  static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
                         Value *RHS, const Twine &Name,
                         const ReductionOpsListType &ReductionOps) {
    // Use the select form when the reduction was matched as cmp+select pairs
    // (two op lists) or as poison-safe logical and/or (selects in list 0).
    bool UseSelect = ReductionOps.size() == 2 ||
                     // Logical or/and.
                     (ReductionOps.size() == 1 &&
                      any_of(Range: ReductionOps.front(), P: IsaPred<SelectInst>));
    assert((!UseSelect || ReductionOps.size() != 2 ||
            isa<SelectInst>(ReductionOps[1][0])) &&
           "Expected cmp + select pairs for reduction");
    Value *Op = createOp(Builder, Kind: RdxKind, LHS, RHS, Name, UseSelect);
    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind: RdxKind)) {
      if (auto *Sel = dyn_cast<SelectInst>(Val: Op)) {
        // For cmp+select min/max, propagate flags to the compare from the
        // original compares (list 0) and to the select from the selects
        // (list 1).
        propagateIRFlags(I: Sel->getCondition(), VL: ReductionOps[0], OpValue: nullptr,
                         /*IncludeWrapFlags=*/false);
        propagateIRFlags(I: Op, VL: ReductionOps[1], OpValue: nullptr,
                         /*IncludeWrapFlags=*/false);
        return Op;
      }
    }
    propagateIRFlags(I: Op, VL: ReductionOps[0], OpValue: nullptr, /*IncludeWrapFlags=*/false);
    return Op;
  }
25429
25430public:
  /// Determines the reduction kind \p V participates in, or RecurKind::None
  /// if \p V is not a recognized reduction operation. Matches plain binops,
  /// poison-safe logical and/or, min/max intrinsics, cmp+select min/max, and
  /// (as a fallback) selects over instructions producing identical values.
  static RecurKind getRdxKind(Value *V) {
    auto *I = dyn_cast<Instruction>(Val: V);
    if (!I)
      return RecurKind::None;
    if (match(V: I, P: m_Add(L: m_Value(), R: m_Value())))
      return RecurKind::Add;
    if (match(V: I, P: m_Mul(L: m_Value(), R: m_Value())))
      return RecurKind::Mul;
    if (match(V: I, P: m_And(L: m_Value(), R: m_Value())) ||
        match(V: I, P: m_LogicalAnd(L: m_Value(), R: m_Value())))
      return RecurKind::And;
    if (match(V: I, P: m_Or(L: m_Value(), R: m_Value())) ||
        match(V: I, P: m_LogicalOr(L: m_Value(), R: m_Value())))
      return RecurKind::Or;
    if (match(V: I, P: m_Xor(L: m_Value(), R: m_Value())))
      return RecurKind::Xor;
    if (match(V: I, P: m_FAdd(L: m_Value(), R: m_Value())))
      return RecurKind::FAdd;
    if (match(V: I, P: m_FMul(L: m_Value(), R: m_Value())))
      return RecurKind::FMul;

    if (match(V: I, P: m_Intrinsic<Intrinsic::maxnum>(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMax;
    if (match(V: I, P: m_Intrinsic<Intrinsic::minnum>(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMin;

    if (match(V: I, P: m_FMaximum(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMaximum;
    if (match(V: I, P: m_FMinimum(Op0: m_Value(), Op1: m_Value())))
      return RecurKind::FMinimum;
    // This matches either cmp+select or intrinsics. SLP is expected to handle
    // either form.
    // TODO: If we are canonicalizing to intrinsics, we can remove several
    // special-case paths that deal with selects.
    if (match(V: I, P: m_SMax(L: m_Value(), R: m_Value())))
      return RecurKind::SMax;
    if (match(V: I, P: m_SMin(L: m_Value(), R: m_Value())))
      return RecurKind::SMin;
    if (match(V: I, P: m_UMax(L: m_Value(), R: m_Value())))
      return RecurKind::UMax;
    if (match(V: I, P: m_UMin(L: m_Value(), R: m_Value())))
      return RecurKind::UMin;

    if (auto *Select = dyn_cast<SelectInst>(Val: I)) {
      // Try harder: look for min/max pattern based on instructions producing
      // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
      // During the intermediate stages of SLP, it's very common to have
      // pattern like this (since optimizeGatherSequence is run only once
      // at the end):
      // %1  = extractelement <2 x i32> %a, i32 0
      // %2  = extractelement <2 x i32> %a, i32 1
      // %cond = icmp sgt i32 %1, %2
      // %3  = extractelement <2 x i32> %a, i32 0
      // %4  = extractelement <2 x i32> %a, i32 1
      // %select = select i1 %cond, i32 %3, i32 %4
      CmpPredicate Pred;
      Instruction *L1;
      Instruction *L2;

      Value *LHS = Select->getTrueValue();
      Value *RHS = Select->getFalseValue();
      Value *Cond = Select->getCondition();

      // TODO: Support inverse predicates.
      // Require that the cmp operands are extractelements identical to the
      // select arms (exactly one side may be a m_Specific match).
      if (match(V: Cond, P: m_Cmp(Pred, L: m_Specific(V: LHS), R: m_Instruction(I&: L2)))) {
        if (!isa<ExtractElementInst>(Val: RHS) ||
            !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
          return RecurKind::None;
      } else if (match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Specific(V: RHS)))) {
        if (!isa<ExtractElementInst>(Val: LHS) ||
            !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)))
          return RecurKind::None;
      } else {
        if (!isa<ExtractElementInst>(Val: LHS) || !isa<ExtractElementInst>(Val: RHS))
          return RecurKind::None;
        if (!match(V: Cond, P: m_Cmp(Pred, L: m_Instruction(I&: L1), R: m_Instruction(I&: L2))) ||
            !L1->isIdenticalTo(I: cast<Instruction>(Val: LHS)) ||
            !L2->isIdenticalTo(I: cast<Instruction>(Val: RHS)))
          return RecurKind::None;
      }

      // The compare predicate determines the min/max flavor. Both strict and
      // non-strict forms map to the same recurrence kind.
      switch (Pred) {
      default:
        return RecurKind::None;
      case CmpInst::ICMP_SGT:
      case CmpInst::ICMP_SGE:
        return RecurKind::SMax;
      case CmpInst::ICMP_SLT:
      case CmpInst::ICMP_SLE:
        return RecurKind::SMin;
      case CmpInst::ICMP_UGT:
      case CmpInst::ICMP_UGE:
        return RecurKind::UMax;
      case CmpInst::ICMP_ULT:
      case CmpInst::ICMP_ULE:
        return RecurKind::UMin;
      }
    }
    return RecurKind::None;
  }
25531
25532 /// Get the index of the first operand.
25533 static unsigned getFirstOperandIndex(Instruction *I) {
25534 return isCmpSelMinMax(I) ? 1 : 0;
25535 }
25536
25537private:
25538 /// Total number of operands in the reduction operation.
25539 static unsigned getNumberOfOperands(Instruction *I) {
25540 return isCmpSelMinMax(I) ? 3 : 2;
25541 }
25542
25543 /// Checks if the instruction is in basic block \p BB.
25544 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
25545 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
25546 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
25547 auto *Sel = cast<SelectInst>(Val: I);
25548 auto *Cmp = dyn_cast<Instruction>(Val: Sel->getCondition());
25549 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
25550 }
25551 return I->getParent() == BB;
25552 }
25553
25554 /// Expected number of uses for reduction operations/reduced values.
25555 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
25556 if (IsCmpSelMinMax) {
25557 // SelectInst must be used twice while the condition op must have single
25558 // use only.
25559 if (auto *Sel = dyn_cast<SelectInst>(Val: I))
25560 return Sel->hasNUses(N: 2) && Sel->getCondition()->hasOneUse();
25561 return I->hasNUses(N: 2);
25562 }
25563
25564 // Arithmetic reduction operation must be used once only.
25565 return I->hasOneUse();
25566 }
25567
25568 /// Initializes the list of reduction operations.
25569 void initReductionOps(Instruction *I) {
25570 if (isCmpSelMinMax(I))
25571 ReductionOps.assign(NumElts: 2, Elt: ReductionOpsType());
25572 else
25573 ReductionOps.assign(NumElts: 1, Elt: ReductionOpsType());
25574 }
25575
25576 /// Add all reduction operations for the reduction instruction \p I.
25577 void addReductionOps(Instruction *I) {
25578 if (isCmpSelMinMax(I)) {
25579 ReductionOps[0].emplace_back(Args: cast<SelectInst>(Val: I)->getCondition());
25580 ReductionOps[1].emplace_back(Args&: I);
25581 } else {
25582 ReductionOps[0].emplace_back(Args&: I);
25583 }
25584 }
25585
25586 static bool isGoodForReduction(ArrayRef<Value *> Data) {
25587 int Sz = Data.size();
25588 auto *I = dyn_cast<Instruction>(Val: Data.front());
25589 return Sz > 1 || isConstant(V: Data.front()) ||
25590 (I && !isa<LoadInst>(Val: I) && isValidForAlternation(Opcode: I->getOpcode()));
25591 }
25592
  /// Optimizes original placement of the reduced values for the reduction tree.
  /// For example, if there is a zext i1 + selects, we can merge select
  /// into zext and improve emission of the reductions.
  void optimizeReducedVals() {
    // Map opcode -> index of the first ReducedVals group led by that opcode.
    SmallDenseMap<unsigned, unsigned> UsedReductionOpIds;
    for (const auto [Idx, Vals] : enumerate(First&: ReducedVals)) {
      if (auto *I = dyn_cast<Instruction>(Val: Vals.front()))
        UsedReductionOpIds.try_emplace(Key: I->getOpcode(), Args&: Idx);
    }
    // Check if zext i1 can be merged with select.
    auto ZExtIt = UsedReductionOpIds.find(Val: Instruction::ZExt);
    auto SelectIt = UsedReductionOpIds.find(Val: Instruction::Select);
    if (ZExtIt != UsedReductionOpIds.end() &&
        SelectIt != UsedReductionOpIds.end()) {
      unsigned ZExtIdx = ZExtIt->second;
      unsigned SelectIdx = SelectIt->second;
      auto *ZExt = cast<ZExtInst>(Val: ReducedVals[ZExtIdx].front());
      // ZExt is compatible with Select? Merge select to zext, if so.
      // Compatible means: zext extends an i1 and produces the select's type.
      if (ZExt->getSrcTy()->isIntegerTy(Bitwidth: 1) &&
          ZExt->getType() == ReducedVals[SelectIdx].front()->getType()) {
        // Append first, then erase the select group (append does not
        // invalidate the erase position computed below).
        ReducedVals[ZExtIdx].append(RHS: ReducedVals[SelectIdx]);
        ReducedVals.erase(CI: std::next(x: ReducedVals.begin(), n: SelectIdx));
      }
    }
  }
25618
25619public:
  HorizontalReduction() = default;
  /// Constructs a pre-matched reduction rooted at \p I whose reduced values
  /// are exactly \p Ops. ReductionLimit is lowered to 2 since the operand
  /// list is already known.
  HorizontalReduction(Instruction *I, ArrayRef<Value *> Ops)
      : ReductionRoot(I), ReductionLimit(2) {
    RdxKind = HorizontalReduction::getRdxKind(V: I);
    ReductionOps.emplace_back().push_back(Elt: I);
    ReducedVals.emplace_back().assign(in_start: Ops.begin(), in_end: Ops.end());
    for (Value *V : Ops)
      ReducedValsToOps[V].push_back(Elt: I);
  }
25629
25630 bool matchReductionForOperands() const {
25631 // Analyze "regular" integer/FP types for reductions - no target-specific
25632 // types or pointers.
25633 assert(ReductionRoot && "Reduction root is not set!");
25634 if (!isVectorizable(Kind: RdxKind, I: cast<Instruction>(Val: ReductionRoot),
25635 TwoElementReduction: all_of(Range: ReducedVals, P: [](ArrayRef<Value *> Ops) {
25636 return Ops.size() == 2;
25637 })))
25638 return false;
25639
25640 return true;
25641 }
25642
  /// Try to find a reduction tree.
  /// Walks the use-def chain from \p Root, collecting reduction operations
  /// (same RecurKind) and reduced values (leaves), then groups and sorts the
  /// leaves for better vectorization. Returns false if \p Root is not a
  /// vectorizable reduction.
  bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                 ScalarEvolution &SE, const DataLayout &DL,
                                 const TargetLibraryInfo &TLI) {
    RdxKind = HorizontalReduction::getRdxKind(V: Root);
    if (!isVectorizable(Kind: RdxKind, I: Root))
      return false;

    // Analyze "regular" integer/FP types for reductions - no target-specific
    // types or pointers.
    Type *Ty = Root->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;

    // Though the ultimate reduction may have multiple uses, its condition must
    // have only single use.
    if (auto *Sel = dyn_cast<SelectInst>(Val: Root))
      if (!Sel->getCondition()->hasOneUse())
        return false;

    ReductionRoot = Root;

    // Iterate through all the operands of the possible reduction tree and
    // gather all the reduced values, sorting them by their value id.
    BasicBlock *BB = Root->getParent();
    bool IsCmpSelMinMax = isCmpSelMinMax(I: Root);
    SmallVector<std::pair<Instruction *, unsigned>> Worklist(
        1, std::make_pair(x&: Root, y: 0));
    // Checks if the operands of the \p TreeN instruction are also reduction
    // operations or should be treated as reduced values or an extra argument,
    // which is not part of the reduction.
    auto CheckOperands = [&](Instruction *TreeN,
                             SmallVectorImpl<Value *> &PossibleReducedVals,
                             SmallVectorImpl<Instruction *> &ReductionOps,
                             unsigned Level) {
      for (int I : reverse(C: seq<int>(Begin: getFirstOperandIndex(I: TreeN),
                                     End: getNumberOfOperands(I: TreeN)))) {
        Value *EdgeVal = getRdxOperand(I: TreeN, Index: I);
        ReducedValsToOps[EdgeVal].push_back(Elt: TreeN);
        auto *EdgeInst = dyn_cast<Instruction>(Val: EdgeVal);
        // If the edge is not an instruction, or it is different from the main
        // reduction opcode or has too many uses - possible reduced value.
        // Also, do not try to reduce const values, if the operation is not
        // foldable.
        if (!EdgeInst || Level > RecursionMaxDepth ||
            getRdxKind(V: EdgeInst) != RdxKind ||
            IsCmpSelMinMax != isCmpSelMinMax(I: EdgeInst) ||
            !hasRequiredNumberOfUses(IsCmpSelMinMax, I: EdgeInst) ||
            !isVectorizable(Kind: RdxKind, I: EdgeInst) ||
            (R.isAnalyzedReductionRoot(I: EdgeInst) &&
             all_of(Range: EdgeInst->operands(), P: IsaPred<Constant>))) {
          PossibleReducedVals.push_back(Elt: EdgeVal);
          continue;
        }
        ReductionOps.push_back(Elt: EdgeInst);
      }
    };
    // Try to regroup reduced values so that it gets more profitable to try to
    // reduce them. Values are grouped by their value ids, instructions - by
    // instruction op id and/or alternate op id, plus do extra analysis for
    // loads (grouping them by the distance between pointers) and cmp
    // instructions (grouping them by the predicate).
    SmallMapVector<
        size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
        8>
        PossibleReducedVals;
    initReductionOps(I: Root);
    DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
    SmallSet<size_t, 2> LoadKeyUsed;

    // Produces a sub-key for a load so that loads with provably related
    // pointers (constant distance or compatible pointers) hash together.
    auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
      Key = hash_combine(args: hash_value(ptr: LI->getParent()), args: Key);
      Value *Ptr =
          getUnderlyingObject(V: LI->getPointerOperand(), MaxLookup: RecursionMaxDepth);
      if (!LoadKeyUsed.insert(V: Key).second) {
        auto LIt = LoadsMap.find(Val: std::make_pair(x&: Key, y&: Ptr));
        if (LIt != LoadsMap.end()) {
          // Prefer a load at a known constant pointer distance.
          for (LoadInst *RLI : LIt->second) {
            if (getPointersDiff(ElemTyA: RLI->getType(), PtrA: RLI->getPointerOperand(),
                                ElemTyB: LI->getType(), PtrB: LI->getPointerOperand(), DL, SE,
                                /*StrictCheck=*/true))
              return hash_value(ptr: RLI->getPointerOperand());
          }
          // Otherwise any compatible pointer will do.
          for (LoadInst *RLI : LIt->second) {
            if (arePointersCompatible(Ptr1: RLI->getPointerOperand(),
                                      Ptr2: LI->getPointerOperand(), TLI)) {
              hash_code SubKey = hash_value(ptr: RLI->getPointerOperand());
              return SubKey;
            }
          }
          if (LIt->second.size() > 2) {
            hash_code SubKey =
                hash_value(ptr: LIt->second.back()->getPointerOperand());
            return SubKey;
          }
        }
      }
      LoadsMap.try_emplace(Key: std::make_pair(x&: Key, y&: Ptr))
          .first->second.push_back(Elt: LI);
      return hash_value(ptr: LI->getPointerOperand());
    };

    // Worklist traversal of the reduction tree; Level tracks how far an op is
    // from the root's block to bound recursion depth.
    while (!Worklist.empty()) {
      auto [TreeN, Level] = Worklist.pop_back_val();
      SmallVector<Value *> PossibleRedVals;
      SmallVector<Instruction *> PossibleReductionOps;
      CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
      addReductionOps(I: TreeN);
      // Add reduction values. The values are sorted for better vectorization
      // results.
      for (Value *V : PossibleRedVals) {
        size_t Key, Idx;
        std::tie(args&: Key, args&: Idx) = generateKeySubkey(V, TLI: &TLI, LoadsSubkeyGenerator: GenerateLoadsSubkey,
                                                       /*AllowAlternate=*/false);
        ++PossibleReducedVals[Key][Idx].try_emplace(Key: V, Args: 0).first->second;
      }
      for (Instruction *I : reverse(C&: PossibleReductionOps))
        Worklist.emplace_back(Args&: I, Args: I->getParent() == BB ? 0 : Level + 1);
    }
    auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
    // Sort values by the total number of values kinds to start the reduction
    // from the longest possible reduced values sequences.
    for (auto &PossibleReducedVals : PossibleReducedValsVect) {
      auto PossibleRedVals = PossibleReducedVals.second.takeVector();
      SmallVector<SmallVector<Value *>> PossibleRedValsVect;
      for (auto &Slice : PossibleRedVals) {
        PossibleRedValsVect.emplace_back();
        auto RedValsVect = Slice.second.takeVector();
        // Sort by occurrence count so repeated values end up adjacent.
        stable_sort(Range&: RedValsVect, C: llvm::less_second());
        for (const std::pair<Value *, unsigned> &Data : RedValsVect)
          PossibleRedValsVect.back().append(NumInputs: Data.second, Elt: Data.first);
      }
      stable_sort(Range&: PossibleRedValsVect, C: [](const auto &P1, const auto &P2) {
        return P1.size() > P2.size();
      });
      bool First = true;
      for (ArrayRef<Value *> Data : PossibleRedValsVect) {
        if (First) {
          First = false;
          ReducedVals.emplace_back();
        } else if (!isGoodForReduction(Data)) {
          // Merge weak singleton groups into the previous group only when
          // both are loads off the same underlying object.
          auto *LI = dyn_cast<LoadInst>(Val: Data.front());
          auto *LastLI = dyn_cast<LoadInst>(Val: ReducedVals.back().front());
          if (!LI || !LastLI ||
              getUnderlyingObject(V: LI->getPointerOperand()) !=
                  getUnderlyingObject(V: LastLI->getPointerOperand()))
            ReducedVals.emplace_back();
        }
        ReducedVals.back().append(in_start: Data.rbegin(), in_end: Data.rend());
      }
    }
    // Post optimize reduced values to get better reduction sequences and sort
    // them by size.
    optimizeReducedVals();
    // Sort the reduced values by number of same/alternate opcode and/or pointer
    // operand.
    stable_sort(Range&: ReducedVals, C: [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
      return P1.size() > P2.size();
    });
    return true;
  }
25804
25805 /// Attempt to vectorize the tree found by matchAssociativeReduction.
25806 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
25807 const TargetLibraryInfo &TLI, AssumptionCache *AC,
25808 DominatorTree &DT) {
25809 constexpr unsigned RegMaxNumber = 4;
25810 constexpr unsigned RedValsMaxNumber = 128;
25811 // If there are a sufficient number of reduction values, reduce
25812 // to a nearby power-of-2. We can safely generate oversized
25813 // vectors and rely on the backend to split them to legal sizes.
25814 if (unsigned NumReducedVals = std::accumulate(
25815 first: ReducedVals.begin(), last: ReducedVals.end(), init: 0,
25816 binary_op: [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
25817 if (!isGoodForReduction(Data: Vals))
25818 return Num;
25819 return Num + Vals.size();
25820 });
25821 NumReducedVals < ReductionLimit &&
25822 all_of(Range&: ReducedVals, P: [](ArrayRef<Value *> RedV) {
25823 return RedV.size() < 2 || !allConstant(VL: RedV) || !isSplat(VL: RedV);
25824 })) {
25825 for (ReductionOpsType &RdxOps : ReductionOps)
25826 for (Value *RdxOp : RdxOps)
25827 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
25828 return nullptr;
25829 }
25830
25831 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
25832 TargetFolder(DL));
25833 Builder.SetInsertPoint(cast<Instruction>(Val&: ReductionRoot));
25834
25835 // Track the reduced values in case if they are replaced by extractelement
25836 // because of the vectorization.
25837 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
25838 ReducedVals.front().size());
25839
25840 // The compare instruction of a min/max is the insertion point for new
25841 // instructions and may be replaced with a new compare instruction.
25842 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
25843 assert(isa<SelectInst>(RdxRootInst) &&
25844 "Expected min/max reduction to have select root instruction");
25845 Value *ScalarCond = cast<SelectInst>(Val: RdxRootInst)->getCondition();
25846 assert(isa<Instruction>(ScalarCond) &&
25847 "Expected min/max reduction to have compare condition");
25848 return cast<Instruction>(Val: ScalarCond);
25849 };
25850
25851 bool AnyBoolLogicOp = any_of(Range&: ReductionOps.back(), P: [](Value *V) {
25852 return isBoolLogicOp(I: cast<Instruction>(Val: V));
25853 });
25854 // Return new VectorizedTree, based on previous value.
25855 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
25856 if (VectorizedTree) {
25857 // Update the final value in the reduction.
25858 Builder.SetCurrentDebugLocation(
25859 cast<Instruction>(Val: ReductionOps.front().front())->getDebugLoc());
25860 if (AnyBoolLogicOp) {
25861 auto It = ReducedValsToOps.find(Val: VectorizedTree);
25862 auto It1 = ReducedValsToOps.find(Val: Res);
25863 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
25864 isGuaranteedNotToBePoison(V: VectorizedTree, AC) ||
25865 (It != ReducedValsToOps.end() &&
25866 any_of(Range&: It->getSecond(), P: [&](Instruction *I) {
25867 return isBoolLogicOp(I) &&
25868 getRdxOperand(I, Index: 0) == VectorizedTree;
25869 }))) {
25870 ;
25871 } else if (isGuaranteedNotToBePoison(V: Res, AC) ||
25872 (It1 != ReducedValsToOps.end() &&
25873 any_of(Range&: It1->getSecond(), P: [&](Instruction *I) {
25874 return isBoolLogicOp(I) && getRdxOperand(I, Index: 0) == Res;
25875 }))) {
25876 std::swap(a&: VectorizedTree, b&: Res);
25877 } else {
25878 VectorizedTree = Builder.CreateFreeze(V: VectorizedTree);
25879 }
25880 }
25881
25882 return createOp(Builder, RdxKind, LHS: VectorizedTree, RHS: Res, Name: "op.rdx",
25883 ReductionOps);
25884 }
25885 // Initialize the final value in the reduction.
25886 return Res;
25887 };
25888 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
25889 ReductionOps.front().size());
25890 for (ReductionOpsType &RdxOps : ReductionOps)
25891 for (Value *RdxOp : RdxOps) {
25892 if (!RdxOp)
25893 continue;
25894 IgnoreList.insert(V: RdxOp);
25895 }
25896 // Intersect the fast-math-flags from all reduction operations.
25897 FastMathFlags RdxFMF;
25898 RdxFMF.set();
25899 for (Value *U : IgnoreList)
25900 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: U))
25901 RdxFMF &= FPMO->getFastMathFlags();
25902 bool IsCmpSelMinMax = isCmpSelMinMax(I: cast<Instruction>(Val&: ReductionRoot));
25903
25904 // Need to track reduced vals, they may be changed during vectorization of
25905 // subvectors.
25906 for (ArrayRef<Value *> Candidates : ReducedVals)
25907 for (Value *V : Candidates)
25908 TrackedVals.try_emplace(Key: V, Args&: V);
25909
25910 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
25911 Value *V) -> unsigned & {
25912 auto *It = MV.find(Key: V);
25913 assert(It != MV.end() && "Unable to find given key.");
25914 return It->second;
25915 };
25916
25917 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
25918 // List of the values that were reduced in other trees as part of gather
25919 // nodes and thus requiring extract if fully vectorized in other trees.
25920 SmallPtrSet<Value *, 4> RequiredExtract;
25921 WeakTrackingVH VectorizedTree = nullptr;
25922 bool CheckForReusedReductionOps = false;
25923 // Try to vectorize elements based on their type.
25924 SmallVector<InstructionsState> States;
25925 SmallVector<SmallVector<Value *>> LocalReducedVals;
25926 // Try merge consecutive reduced values into a single vectorizable group and
25927 // check, if they can be vectorized as copyables.
25928 for (ArrayRef<Value *> RV : ReducedVals) {
25929 // Loads are not very compatible with undefs.
25930 if (isa<UndefValue>(Val: RV.front()) &&
25931 (States.empty() || !States.back() ||
25932 States.back().getOpcode() == Instruction::Load)) {
25933 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25934 States.push_back(Elt: InstructionsState::invalid());
25935 continue;
25936 }
25937 if (!LocalReducedVals.empty() &&
25938 isa<UndefValue>(Val: LocalReducedVals.back().front()) &&
25939 isa<LoadInst>(Val: RV.front())) {
25940 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25941 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
25942 continue;
25943 }
25944 SmallVector<Value *> Ops;
25945 if (!LocalReducedVals.empty())
25946 Ops = LocalReducedVals.back();
25947 Ops.append(in_start: RV.begin(), in_end: RV.end());
25948 InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
25949 InstructionsState OpS =
25950 Analysis.buildInstructionsState(VL: Ops, R: V, TryCopyableElementsVectorization: VectorizeCopyableElements);
25951 if (LocalReducedVals.empty()) {
25952 LocalReducedVals.push_back(Elt: Ops);
25953 States.push_back(Elt: OpS);
25954 continue;
25955 }
25956 if (OpS) {
25957 LocalReducedVals.back().swap(RHS&: Ops);
25958 States.back() = OpS;
25959 continue;
25960 }
25961 LocalReducedVals.emplace_back().append(in_start: RV.begin(), in_end: RV.end());
25962 States.push_back(Elt: getSameOpcode(VL: RV, TLI));
25963 }
25964 ReducedVals.swap(RHS&: LocalReducedVals);
25965 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
25966 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
25967 InstructionsState S = States[I];
25968 SmallVector<Value *> Candidates;
25969 Candidates.reserve(N: 2 * OrigReducedVals.size());
25970 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
25971 for (Value *ReducedVal : OrigReducedVals) {
25972 Value *RdxVal = TrackedVals.at(Val: ReducedVal);
      // Check if the reduction value was not overridden by the extractelement
25974 // instruction because of the vectorization and exclude it, if it is not
25975 // compatible with other values.
25976 // Also check if the instruction was folded to constant/other value.
25977 auto *Inst = dyn_cast<Instruction>(Val: RdxVal);
25978 if ((Inst && isVectorLikeInstWithConstOps(V: Inst) &&
25979 (!S || (!S.getMatchingMainOpOrAltOp(I: Inst) &&
25980 !S.isCopyableElement(V: Inst)))) ||
25981 (S && !Inst && !isa<PoisonValue>(Val: RdxVal) &&
25982 !S.isCopyableElement(V: RdxVal)))
25983 continue;
25984 Candidates.push_back(Elt: RdxVal);
25985 TrackedToOrig.try_emplace(Key: RdxVal, Args&: ReducedVal);
25986 }
25987 bool ShuffledExtracts = false;
25988 // Try to handle shuffled extractelements.
25989 if (S && S.getOpcode() == Instruction::ExtractElement &&
25990 !S.isAltShuffle() && I + 1 < E) {
25991 SmallVector<Value *> CommonCandidates(Candidates);
25992 for (Value *RV : ReducedVals[I + 1]) {
25993 Value *RdxVal = TrackedVals.at(Val: RV);
        // Check if the reduction value was not overridden by the
25995 // extractelement instruction because of the vectorization and
25996 // exclude it, if it is not compatible with other values.
25997 auto *Inst = dyn_cast<ExtractElementInst>(Val: RdxVal);
25998 if (!Inst)
25999 continue;
26000 CommonCandidates.push_back(Elt: RdxVal);
26001 TrackedToOrig.try_emplace(Key: RdxVal, Args&: RV);
26002 }
26003 SmallVector<int> Mask;
26004 if (isFixedVectorShuffle(VL: CommonCandidates, Mask, AC)) {
26005 ++I;
26006 Candidates.swap(RHS&: CommonCandidates);
26007 ShuffledExtracts = true;
26008 }
26009 }
26010
26011 // Emit code for constant values.
26012 if (Candidates.size() > 1 && allConstant(VL: Candidates)) {
26013 Value *Res = Candidates.front();
26014 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
26015 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
26016 for (Value *VC : ArrayRef(Candidates).drop_front()) {
26017 Res = createOp(Builder, RdxKind, LHS: Res, RHS: VC, Name: "const.rdx", ReductionOps);
26018 Value *OrigV = TrackedToOrig.at(Val: VC);
26019 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
26020 if (auto *ResI = dyn_cast<Instruction>(Val: Res))
26021 V.analyzedReductionRoot(I: ResI);
26022 }
26023 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
26024 continue;
26025 }
26026
26027 unsigned NumReducedVals = Candidates.size();
26028 if (NumReducedVals < ReductionLimit &&
26029 (NumReducedVals < 2 || !isSplat(VL: Candidates)))
26030 continue;
26031
26032 // Check if we support repeated scalar values processing (optimization of
26033 // original scalar identity operations on matched horizontal reductions).
26034 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
26035 RdxKind != RecurKind::FMul &&
26036 RdxKind != RecurKind::FMulAdd;
26037 // Gather same values.
26038 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
26039 if (IsSupportedHorRdxIdentityOp)
26040 for (Value *V : Candidates) {
26041 Value *OrigV = TrackedToOrig.at(Val: V);
26042 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
26043 }
26044 // Used to check if the reduced values used same number of times. In this
26045 // case the compiler may produce better code. E.g. if reduced values are
26046 // aabbccdd (8 x values), then the first node of the tree will have a node
26047 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
26048 // Plus, the final reduction will be performed on <8 x aabbccdd>.
26049 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
26050 // x abcd) * 2.
26051 // Currently it only handles add/fadd/xor. and/or/min/max do not require
26052 // this analysis, other operations may require an extra estimation of
26053 // the profitability.
26054 bool SameScaleFactor = false;
26055 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
26056 SameValuesCounter.size() != Candidates.size();
26057 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
26058 if (OptReusedScalars) {
26059 SameScaleFactor =
26060 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
26061 RdxKind == RecurKind::Xor) &&
26062 all_of(Range: drop_begin(RangeOrContainer&: SameValuesCounter),
26063 P: [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
26064 return P.second == SameValuesCounter.front().second;
26065 });
26066 Candidates.resize(N: SameValuesCounter.size());
26067 transform(Range&: SameValuesCounter, d_first: Candidates.begin(),
26068 F: [&](const auto &P) { return TrackedVals.at(P.first); });
26069 NumReducedVals = Candidates.size();
26070 // Have a reduction of the same element.
26071 if (NumReducedVals == 1) {
26072 Value *OrigV = TrackedToOrig.at(Val: Candidates.front());
26073 unsigned Cnt = At(SameValuesCounter, OrigV);
26074 Value *RedVal =
26075 emitScaleForReusedOps(VectorizedValue: Candidates.front(), Builder, Cnt);
26076 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
26077 VectorizedVals.try_emplace(Key: OrigV, Args&: Cnt);
26078 ExternallyUsedValues.insert(V: OrigV);
26079 continue;
26080 }
26081 }
26082
26083 unsigned MaxVecRegSize = V.getMaxVecRegSize();
26084 unsigned EltSize = V.getVectorElementSize(V: Candidates[0]);
26085 const unsigned MaxElts = std::clamp<unsigned>(
26086 val: llvm::bit_floor(Value: MaxVecRegSize / EltSize), lo: RedValsMaxNumber,
26087 hi: RegMaxNumber * RedValsMaxNumber);
26088
26089 unsigned ReduxWidth = NumReducedVals;
26090 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
26091 unsigned NumParts, NumRegs;
26092 Type *ScalarTy = Candidates.front()->getType();
26093 ReduxWidth =
26094 getFloorFullVectorNumberOfElements(TTI, Ty: ScalarTy, Sz: ReduxWidth);
26095 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
26096 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
26097 NumRegs =
26098 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
26099 while (NumParts > NumRegs) {
26100 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
26101 ReduxWidth = bit_floor(Value: ReduxWidth - 1);
26102 VectorType *Tp = getWidenedType(ScalarTy, VF: ReduxWidth);
26103 NumParts = ::getNumberOfParts(TTI, VecTy: Tp);
26104 NumRegs =
26105 TTI.getNumberOfRegisters(ClassID: TTI.getRegisterClassForType(Vector: true, Ty: Tp));
26106 }
26107 if (NumParts > NumRegs / 2)
26108 ReduxWidth = bit_floor(Value: ReduxWidth);
26109 return ReduxWidth;
26110 };
26111 if (!VectorizeNonPowerOf2 || !has_single_bit(Value: ReduxWidth + 1))
26112 ReduxWidth = GetVectorFactor(ReduxWidth);
26113 ReduxWidth = std::min(a: ReduxWidth, b: MaxElts);
26114
26115 unsigned Start = 0;
26116 unsigned Pos = Start;
26117 // Restarts vectorization attempt with lower vector factor.
26118 unsigned PrevReduxWidth = ReduxWidth;
26119 bool CheckForReusedReductionOpsLocal = false;
26120 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
26121 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(Vals: IgnoreList);
26122 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
26123 // Check if any of the reduction ops are gathered. If so, worth
26124 // trying again with less number of reduction ops.
26125 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
26126 }
26127 ++Pos;
26128 if (Pos < NumReducedVals - ReduxWidth + 1)
26129 return IsAnyRedOpGathered;
26130 Pos = Start;
26131 --ReduxWidth;
26132 if (ReduxWidth > 1)
26133 ReduxWidth = GetVectorFactor(ReduxWidth);
26134 return IsAnyRedOpGathered;
26135 };
26136 bool AnyVectorized = false;
26137 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
26138 while (Pos < NumReducedVals - ReduxWidth + 1 &&
26139 ReduxWidth >= ReductionLimit) {
26140 // Dependency in tree of the reduction ops - drop this attempt, try
26141 // later.
26142 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
26143 Start == 0) {
26144 CheckForReusedReductionOps = true;
26145 break;
26146 }
26147 PrevReduxWidth = ReduxWidth;
26148 ArrayRef<Value *> VL(std::next(x: Candidates.begin(), n: Pos), ReduxWidth);
26149 // Been analyzed already - skip.
26150 if (IgnoredCandidates.contains(V: std::make_pair(x&: Pos, y&: ReduxWidth)) ||
26151 (!has_single_bit(Value: ReduxWidth) &&
26152 (IgnoredCandidates.contains(
26153 V: std::make_pair(x&: Pos, y: bit_floor(Value: ReduxWidth))) ||
26154 IgnoredCandidates.contains(
26155 V: std::make_pair(x: Pos + (ReduxWidth - bit_floor(Value: ReduxWidth)),
26156 y: bit_floor(Value: ReduxWidth))))) ||
26157 V.areAnalyzedReductionVals(VL)) {
26158 (void)AdjustReducedVals(/*IgnoreVL=*/true);
26159 continue;
26160 }
26161 // Early exit if any of the reduction values were deleted during
26162 // previous vectorization attempts.
26163 if (any_of(Range&: VL, P: [&V](Value *RedVal) {
26164 auto *RedValI = dyn_cast<Instruction>(Val: RedVal);
26165 return RedValI && V.isDeleted(I: RedValI);
26166 }))
26167 break;
26168 V.buildTree(Roots: VL, UserIgnoreLst: IgnoreList);
26169 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
26170 if (!AdjustReducedVals())
26171 V.analyzedReductionVals(VL);
26172 continue;
26173 }
26174 V.reorderTopToBottom();
26175 // No need to reorder the root node at all for reassociative reduction.
26176 V.reorderBottomToTop(/*IgnoreReorder=*/RdxFMF.allowReassoc() ||
26177 VL.front()->getType()->isIntOrIntVectorTy() ||
26178 ReductionLimit > 2);
26179 // Keep extracted other reduction values, if they are used in the
26180 // vectorization trees.
26181 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
26182 ExternallyUsedValues);
26183 // The reduction root is used as the insertion point for new
26184 // instructions, so set it as externally used to prevent it from being
26185 // deleted.
26186 LocalExternallyUsedValues.insert(V: ReductionRoot);
26187 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
26188 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
26189 continue;
26190 for (Value *V : ReducedVals[Cnt])
26191 if (isa<Instruction>(Val: V))
26192 LocalExternallyUsedValues.insert(V: TrackedVals[V]);
26193 }
26194 if (!IsSupportedHorRdxIdentityOp) {
26195 // Number of uses of the candidates in the vector of values.
26196 assert(SameValuesCounter.empty() &&
26197 "Reused values counter map is not empty");
26198 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26199 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26200 continue;
26201 Value *V = Candidates[Cnt];
26202 Value *OrigV = TrackedToOrig.at(Val: V);
26203 ++SameValuesCounter.try_emplace(Key: OrigV).first->second;
26204 }
26205 }
26206 V.transformNodes();
26207 V.computeMinimumValueSizes();
26208 InstructionCost TreeCost = V.calculateTreeCostAndTrimNonProfitable(VectorizedVals: VL);
26209
26210 SmallPtrSet<Value *, 4> VLScalars(llvm::from_range, VL);
26211 // Gather externally used values.
26212 SmallPtrSet<Value *, 4> Visited;
26213 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
26214 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
26215 continue;
26216 Value *RdxVal = Candidates[Cnt];
26217 if (auto It = TrackedVals.find(Val: RdxVal); It != TrackedVals.end())
26218 RdxVal = It->second;
26219 if (!Visited.insert(Ptr: RdxVal).second)
26220 continue;
26221 // Check if the scalar was vectorized as part of the vectorization
26222 // tree but not the top node.
26223 if (!VLScalars.contains(Ptr: RdxVal) && V.isVectorized(V: RdxVal)) {
26224 LocalExternallyUsedValues.insert(V: RdxVal);
26225 continue;
26226 }
26227 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
26228 unsigned NumOps =
26229 VectorizedVals.lookup(Val: OrigV) + At(SameValuesCounter, OrigV);
26230 if (NumOps != ReducedValsToOps.at(Val: OrigV).size())
26231 LocalExternallyUsedValues.insert(V: RdxVal);
26232 }
26233 // Do not need the list of reused scalars in regular mode anymore.
26234 if (!IsSupportedHorRdxIdentityOp)
26235 SameValuesCounter.clear();
26236 for (Value *RdxVal : VL)
26237 if (RequiredExtract.contains(Ptr: RdxVal))
26238 LocalExternallyUsedValues.insert(V: RdxVal);
26239 V.buildExternalUses(ExternallyUsedValues: LocalExternallyUsedValues);
26240
26241 // Estimate cost.
26242 InstructionCost ReductionCost;
26243 if (V.isReducedBitcastRoot() || V.isReducedCmpBitcastRoot())
26244 ReductionCost = 0;
26245 else
26246 ReductionCost =
26247 getReductionCost(TTI, ReducedVals: VL, IsCmpSelMinMax, FMF: RdxFMF, R: V, DT, DL, TLI);
26248 InstructionCost Cost = V.getTreeCost(TreeCost, VectorizedVals: VL, ReductionCost);
26249 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
26250 << " for reduction\n");
26251 if (!Cost.isValid())
26252 break;
26253 if (Cost >= -SLPCostThreshold) {
26254 V.getORE()->emit(RemarkBuilder: [&]() {
26255 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
26256 ReducedValsToOps.at(Val: VL[0]).front())
26257 << "Vectorizing horizontal reduction is possible "
26258 << "but not beneficial with cost " << ore::NV("Cost", Cost)
26259 << " and threshold "
26260 << ore::NV("Threshold", -SLPCostThreshold);
26261 });
26262 if (!AdjustReducedVals()) {
26263 V.analyzedReductionVals(VL);
26264 unsigned Offset = Pos == Start ? Pos : Pos - 1;
26265 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
26266 // Add subvectors of VL to the list of the analyzed values.
26267 for (unsigned VF = getFloorFullVectorNumberOfElements(
26268 TTI: *TTI, Ty: VL.front()->getType(), Sz: ReduxWidth - 1);
26269 VF >= ReductionLimit;
26270 VF = getFloorFullVectorNumberOfElements(
26271 TTI: *TTI, Ty: VL.front()->getType(), Sz: VF - 1)) {
26272 if (has_single_bit(Value: VF) &&
26273 V.getCanonicalGraphSize() != V.getTreeSize())
26274 continue;
26275 for (unsigned Idx : seq<unsigned>(Size: ReduxWidth - VF))
26276 IgnoredCandidates.insert(V: std::make_pair(x: Offset + Idx, y&: VF));
26277 }
26278 }
26279 }
26280 continue;
26281 }
26282
26283 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
26284 << Cost << ". (HorRdx)\n");
26285 V.getORE()->emit(RemarkBuilder: [&]() {
26286 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
26287 ReducedValsToOps.at(Val: VL[0]).front())
26288 << "Vectorized horizontal reduction with cost "
26289 << ore::NV("Cost", Cost) << " and with tree size "
26290 << ore::NV("TreeSize", V.getTreeSize());
26291 });
26292
26293 Builder.setFastMathFlags(RdxFMF);
26294
26295 // Emit a reduction. If the root is a select (min/max idiom), the insert
26296 // point is the compare condition of that select.
26297 Instruction *RdxRootInst = cast<Instruction>(Val&: ReductionRoot);
26298 Instruction *InsertPt = RdxRootInst;
26299 if (IsCmpSelMinMax)
26300 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
26301
26302 // Vectorize a tree.
26303 Value *VectorizedRoot = V.vectorizeTree(
26304 ExternallyUsedValues: LocalExternallyUsedValues, ReductionRoot: InsertPt, VectorValuesAndScales);
26305 // Update TrackedToOrig mapping, since the tracked values might be
26306 // updated.
26307 for (Value *RdxVal : Candidates) {
26308 Value *OrigVal = TrackedToOrig.at(Val: RdxVal);
26309 Value *TransformedRdxVal = TrackedVals.at(Val: OrigVal);
26310 if (TransformedRdxVal != RdxVal)
26311 TrackedToOrig.try_emplace(Key: TransformedRdxVal, Args&: OrigVal);
26312 }
26313
26314 Builder.SetInsertPoint(InsertPt);
26315
26316 // To prevent poison from leaking across what used to be sequential,
26317 // safe, scalar boolean logic operations, the reduction operand must be
26318 // frozen.
26319 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(V: VectorizedRoot, AC))
26320 VectorizedRoot = Builder.CreateFreeze(V: VectorizedRoot);
26321
26322 // Emit code to correctly handle reused reduced values, if required.
26323 if (OptReusedScalars && !SameScaleFactor) {
26324 VectorizedRoot = emitReusedOps(VectorizedValue: VectorizedRoot, Builder, R&: V,
26325 SameValuesCounter, TrackedToOrig);
26326 }
26327
26328 Type *ScalarTy = VL.front()->getType();
26329 Type *VecTy = VectorizedRoot->getType();
26330 Type *RedScalarTy = VecTy->getScalarType();
26331 VectorValuesAndScales.emplace_back(
26332 Args&: VectorizedRoot,
26333 Args: OptReusedScalars && SameScaleFactor
26334 ? SameValuesCounter.front().second
26335 : 1,
26336 Args: RedScalarTy != ScalarTy->getScalarType()
26337 ? V.isSignedMinBitwidthRootNode()
26338 : true,
26339 Args: V.isReducedBitcastRoot() || V.isReducedCmpBitcastRoot());
26340
26341 // Count vectorized reduced values to exclude them from final reduction.
26342 for (Value *RdxVal : VL) {
26343 Value *OrigV = TrackedToOrig.at(Val: RdxVal);
26344 if (IsSupportedHorRdxIdentityOp) {
26345 VectorizedVals.try_emplace(Key: OrigV, Args&: At(SameValuesCounter, OrigV));
26346 continue;
26347 }
26348 ++VectorizedVals.try_emplace(Key: OrigV).first->getSecond();
26349 if (!V.isVectorized(V: RdxVal))
26350 RequiredExtract.insert(Ptr: RdxVal);
26351 }
26352 Pos += ReduxWidth;
26353 Start = Pos;
26354 ReduxWidth = NumReducedVals - Pos;
26355 if (ReduxWidth > 1)
26356 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
26357 AnyVectorized = true;
26358 }
26359 if (OptReusedScalars && !AnyVectorized) {
26360 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
26361 Value *RdxVal = TrackedVals.at(Val: P.first);
26362 Value *RedVal = emitScaleForReusedOps(VectorizedValue: RdxVal, Builder, Cnt: P.second);
26363 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
26364 VectorizedVals.try_emplace(Key: P.first, Args: P.second);
26365 }
26366 continue;
26367 }
26368 }
26369 if (!VectorValuesAndScales.empty())
26370 VectorizedTree = GetNewVectorizedTree(
26371 VectorizedTree,
26372 emitReduction(Builder, TTI: *TTI, DestTy: ReductionRoot->getType()));
26373
26374 if (!VectorizedTree) {
26375 if (!CheckForReusedReductionOps) {
26376 for (ReductionOpsType &RdxOps : ReductionOps)
26377 for (Value *RdxOp : RdxOps)
26378 V.analyzedReductionRoot(I: cast<Instruction>(Val: RdxOp));
26379 }
26380 return nullptr;
26381 }
26382
26383 // Reorder operands of bool logical op in the natural order to avoid
26384 // possible problem with poison propagation. If not possible to reorder
26385 // (both operands are originally RHS), emit an extra freeze instruction
26386 // for the LHS operand.
26387 // I.e., if we have original code like this:
26388 // RedOp1 = select i1 ?, i1 LHS, i1 false
26389 // RedOp2 = select i1 RHS, i1 ?, i1 false
26390
26391 // Then, we swap LHS/RHS to create a new op that matches the poison
26392 // semantics of the original code.
26393
26394 // If we have original code like this and both values could be poison:
26395 // RedOp1 = select i1 ?, i1 LHS, i1 false
26396 // RedOp2 = select i1 ?, i1 RHS, i1 false
26397
26398 // Then, we must freeze LHS in the new op.
26399 auto FixBoolLogicalOps =
26400 [&, VectorizedTree](Value *&LHS, Value *&RHS, Instruction *RedOp1,
26401 Instruction *RedOp2, bool InitStep) {
26402 if (!AnyBoolLogicOp)
26403 return;
26404 if (isBoolLogicOp(I: RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
26405 getRdxOperand(I: RedOp1, Index: 0) == LHS ||
26406 isGuaranteedNotToBePoison(V: LHS, AC)))
26407 return;
26408 bool NeedFreeze = LHS != VectorizedTree;
26409 if (isBoolLogicOp(I: RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
26410 getRdxOperand(I: RedOp2, Index: 0) == RHS ||
26411 isGuaranteedNotToBePoison(V: RHS, AC))) {
26412 // If RedOp2 was used as a second operand - do not swap.
26413 if ((InitStep || RHS != VectorizedTree) &&
26414 getRdxOperand(I: RedOp2, Index: 0) == RHS &&
26415 ((isBoolLogicOp(I: RedOp1) &&
26416 getRdxOperand(I: RedOp1, Index: 1) == RedOp2) ||
26417 any_of(Range&: ReductionOps, P: [&](ArrayRef<Value *> Ops) {
26418 return any_of(Range&: Ops, P: [&](Value *Op) {
26419 auto *OpI = dyn_cast<Instruction>(Val: Op);
26420 return OpI && isBoolLogicOp(I: OpI) &&
26421 getRdxOperand(I: OpI, Index: 1) == RedOp2;
26422 });
26423 }))) {
26424 NeedFreeze = false;
26425 } else {
26426 std::swap(a&: LHS, b&: RHS);
26427 return;
26428 }
26429 }
26430 if (NeedFreeze)
26431 LHS = Builder.CreateFreeze(V: LHS);
26432 };
26433 // Finish the reduction.
26434 // Need to add extra arguments and not vectorized possible reduction values.
26435 // Try to avoid dependencies between the scalar remainders after reductions.
26436 auto FinalGen = [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
26437 bool InitStep) {
26438 unsigned Sz = InstVals.size();
26439 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 + Sz % 2);
26440 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
26441 Instruction *RedOp = InstVals[I + 1].first;
26442 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
26443 Value *RdxVal1 = InstVals[I].second;
26444 Value *StableRdxVal1 = RdxVal1;
26445 auto It1 = TrackedVals.find(Val: RdxVal1);
26446 if (It1 != TrackedVals.end())
26447 StableRdxVal1 = It1->second;
26448 Value *RdxVal2 = InstVals[I + 1].second;
26449 Value *StableRdxVal2 = RdxVal2;
26450 auto It2 = TrackedVals.find(Val: RdxVal2);
26451 if (It2 != TrackedVals.end())
26452 StableRdxVal2 = It2->second;
26453 // To prevent poison from leaking across what used to be sequential,
26454 // safe, scalar boolean logic operations, the reduction operand must be
26455 // frozen.
26456 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
26457 RedOp, InitStep);
26458 Value *ExtraRed = createOp(Builder, RdxKind, LHS: StableRdxVal1,
26459 RHS: StableRdxVal2, Name: "op.rdx", ReductionOps);
26460 ExtraReds[I / 2] = std::make_pair(x: InstVals[I].first, y&: ExtraRed);
26461 }
26462 if (Sz % 2 == 1)
26463 ExtraReds[Sz / 2] = InstVals.back();
26464 return ExtraReds;
26465 };
26466 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
26467 ExtraReductions.emplace_back(Args: cast<Instruction>(Val&: ReductionRoot),
26468 Args&: VectorizedTree);
26469 SmallPtrSet<Value *, 8> Visited;
26470 for (ArrayRef<Value *> Candidates : ReducedVals) {
26471 for (Value *RdxVal : Candidates) {
26472 if (!Visited.insert(Ptr: RdxVal).second)
26473 continue;
26474 unsigned NumOps = VectorizedVals.lookup(Val: RdxVal);
26475 for (Instruction *RedOp :
26476 ArrayRef(ReducedValsToOps.at(Val: RdxVal)).drop_back(N: NumOps))
26477 ExtraReductions.emplace_back(Args&: RedOp, Args&: RdxVal);
26478 }
26479 }
26480 // Iterate through all not-vectorized reduction values/extra arguments.
26481 bool InitStep = true;
26482 while (ExtraReductions.size() > 1) {
26483 SmallVector<std::pair<Instruction *, Value *>> NewReds =
26484 FinalGen(ExtraReductions, InitStep);
26485 ExtraReductions.swap(RHS&: NewReds);
26486 InitStep = false;
26487 }
26488 VectorizedTree = ExtraReductions.front().second;
26489
26490 ReductionRoot->replaceAllUsesWith(V: VectorizedTree);
26491
26492 // The original scalar reduction is expected to have no remaining
26493 // uses outside the reduction tree itself. Assert that we got this
26494 // correct, replace internal uses with undef, and mark for eventual
26495 // deletion.
26496#ifndef NDEBUG
26497 SmallPtrSet<Value *, 4> IgnoreSet;
26498 for (ArrayRef<Value *> RdxOps : ReductionOps)
26499 IgnoreSet.insert_range(RdxOps);
26500#endif
26501 for (ArrayRef<Value *> RdxOps : ReductionOps) {
26502 for (Value *Ignore : RdxOps) {
26503 if (!Ignore)
26504 continue;
26505#ifndef NDEBUG
26506 for (auto *U : Ignore->users()) {
26507 assert(IgnoreSet.count(U) &&
26508 "All users must be either in the reduction ops list.");
26509 }
26510#endif
26511 if (!Ignore->use_empty()) {
26512 Value *P = PoisonValue::get(T: Ignore->getType());
26513 Ignore->replaceAllUsesWith(V: P);
26514 }
26515 }
26516 V.removeInstructionsAndOperands(DeadVals: RdxOps, VectorValuesAndScales);
26517 }
26518 return VectorizedTree;
26519 }
26520
26521private:
26522 /// Creates the reduction from the given \p Vec vector value with the given
26523 /// scale \p Scale and signedness \p IsSigned.
26524 Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
26525 Value *Vec, unsigned Scale, bool IsSigned, Type *DestTy,
26526 bool ReducedInTree) {
26527 Value *Rdx;
26528 if (ReducedInTree) {
26529 Rdx = Vec;
26530 } else if (auto *VecTy = dyn_cast<FixedVectorType>(Val: DestTy)) {
26531 unsigned DestTyNumElements = getNumElements(Ty: VecTy);
26532 unsigned VF = getNumElements(Ty: Vec->getType()) / DestTyNumElements;
26533 Rdx = PoisonValue::get(
26534 T: getWidenedType(ScalarTy: Vec->getType()->getScalarType(), VF: DestTyNumElements));
26535 for (unsigned I : seq<unsigned>(Size: DestTyNumElements)) {
26536 // Do reduction for each lane.
26537 // e.g., do reduce add for
26538 // VL[0] = <4 x Ty> <a, b, c, d>
26539 // VL[1] = <4 x Ty> <e, f, g, h>
26540 // Lane[0] = <2 x Ty> <a, e>
26541 // Lane[1] = <2 x Ty> <b, f>
26542 // Lane[2] = <2 x Ty> <c, g>
26543 // Lane[3] = <2 x Ty> <d, h>
26544 // result[0] = reduce add Lane[0]
26545 // result[1] = reduce add Lane[1]
26546 // result[2] = reduce add Lane[2]
26547 // result[3] = reduce add Lane[3]
26548 SmallVector<int, 16> Mask = createStrideMask(Start: I, Stride: DestTyNumElements, VF);
26549 Value *Lane = Builder.CreateShuffleVector(V: Vec, Mask);
26550 Rdx = Builder.CreateInsertElement(
26551 Vec: Rdx, NewElt: emitReduction(VectorizedValue: Lane, Builder, TTI: &TTI, DestTy), Idx: I);
26552 }
26553 } else {
26554 Rdx = emitReduction(VectorizedValue: Vec, Builder, TTI: &TTI, DestTy);
26555 }
26556 if (Rdx->getType() != DestTy)
26557 Rdx = Builder.CreateIntCast(V: Rdx, DestTy, isSigned: IsSigned);
26558 // Improved analysis for add/fadd/xor reductions with same scale
26559 // factor for all operands of reductions. We can emit scalar ops for
26560 // them instead.
26561 if (Scale > 1)
26562 Rdx = emitScaleForReusedOps(VectorizedValue: Rdx, Builder, Cnt: Scale);
26563 return Rdx;
26564 }
26565
  /// Calculate the cost of a reduction.
  /// Returns (vector cost - scalar cost); a negative value means the
  /// vectorized reduction is profitable. The vector side accounts for the
  /// reduction op (or, when a reduction was already emitted, just the extra
  /// vector arithmetic plus casts); the scalar side sums the costs of the
  /// N-1 scalar reduction ops that become dead.
  InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                   ArrayRef<Value *> ReducedVals,
                                   bool IsCmpSelMinMax, FastMathFlags FMF,
                                   const BoUpSLP &R, DominatorTree &DT,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo &TLI) {
    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    Type *ScalarTy = ReducedVals.front()->getType();
    unsigned ReduxWidth = ReducedVals.size();
    FixedVectorType *VectorTy = R.getReductionType();
    InstructionCost VectorCost = 0, ScalarCost;
    // If all of the reduced values are constant, the vector cost is 0, since
    // the reduction value can be calculated at the compile time.
    bool AllConsts = allConstant(VL: ReducedVals);
    // Sums the cost of the scalar reduction ops that vectorization removes.
    // \p GenCostFn yields the generic cost of one scalar reduction op; it is
    // the fallback when the finer, user-based estimate below is invalid.
    auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
      InstructionCost Cost = 0;
      // Scalar cost is repeated for N-1 elements.
      int Cnt = ReducedVals.size();
      for (Value *RdxVal : ReducedVals) {
        if (!isa<Instruction>(Val: RdxVal))
          continue;
        // Only N-1 scalar ops are needed for an N-wide reduction.
        if (Cnt == 1)
          break;
        --Cnt;
        // A value with extra (non-reduction) uses keeps its op alive; charge
        // the generic per-op cost for it.
        if (RdxVal->hasNUsesOrMore(N: IsCmpSelMinMax ? 3 : 2)) {
          Cost += GenCostFn();
          continue;
        }
        InstructionCost ScalarCost = 0;
        for (User *U : RdxVal->users()) {
          auto *RdxOp = cast<Instruction>(Val: U);
          if (hasRequiredNumberOfUses(IsCmpSelMinMax, I: RdxOp)) {
            if (RdxKind == RecurKind::FAdd) {
              // If this fadd user can fuse with a preceding fmul into an FMA,
              // cost it as (FMA - scalar fmul) instead of a plain fadd.
              InstructionCost FMACost = canConvertToFMA(
                  VL: RdxOp, S: getSameOpcode(VL: RdxOp, TLI), DT, DL, TTI&: *TTI, TLI);
              if (FMACost.isValid()) {
                LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
                if (auto *I = dyn_cast<Instruction>(Val: RdxVal)) {
                  // Also, exclude scalar fmul cost.
                  InstructionCost FMulCost =
                      TTI->getInstructionCost(U: I, CostKind);
                  LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
                  FMACost -= FMulCost;
                }
                ScalarCost += FMACost;
                continue;
              }
            }
            ScalarCost += TTI->getInstructionCost(U: RdxOp, CostKind);
            continue;
          }
          // Unexpected use count invalidates the user-based estimate.
          ScalarCost = InstructionCost::getInvalid();
          break;
        }
        if (ScalarCost.isValid())
          Cost += ScalarCost;
        else
          Cost += GenCostFn();
      }
      return Cost;
    };
    // Require reduction cost if:
    // 1. This type is not a full register type and no other vectors with the
    // same type in the storage (first vector with small type).
    // 2. The storage does not have any vector with full vector use (first
    // vector with full register use).
    bool DoesRequireReductionOp = !AllConsts && VectorValuesAndScales.empty();
    switch (RdxKind) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind: RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          if (auto *VecTy = dyn_cast<FixedVectorType>(Val: ScalarTy)) {
            // Revectorization (SLPReVec): each "scalar" is itself a vector;
            // reduce each lane group through a strided shuffle + reduction.
            assert(SLPReVec && "FixedVectorType is not expected.");
            unsigned ScalarTyNumElements = VecTy->getNumElements();
            for (unsigned I : seq<unsigned>(Size: ReducedVals.size())) {
              VectorCost += TTI->getShuffleCost(
                  Kind: TTI::SK_PermuteSingleSrc,
                  DstTy: FixedVectorType::get(ElementType: VecTy->getScalarType(),
                                              NumElts: ReducedVals.size()),
                  SrcTy: VectorTy,
                  Mask: createStrideMask(Start: I, Stride: ScalarTyNumElements, VF: ReducedVals.size()));
              VectorCost += TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VecTy,
                                                            FMF, CostKind);
            }
            VectorCost += TTI->getScalarizationOverhead(
                Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: ScalarTyNumElements), /*Insert*/ true,
                /*Extract*/ false, CostKind: TTI::TCK_RecipThroughput);
          } else {
            Type *RedTy = VectorTy->getElementType();
            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
                u: std::make_pair(x&: RedTy, y: true));
            if (RType == RedTy) {
              VectorCost = TTI->getArithmeticReductionCost(Opcode: RdxOpcode, Ty: VectorTy,
                                                           FMF, CostKind);
            } else {
              // Root is computed in a narrower type; use the extended
              // reduction cost, which folds the extension in.
              VectorCost = TTI->getExtendedReductionCost(
                  Opcode: RdxOpcode, IsUnsigned: !IsSigned, ResTy: RedTy,
                  Ty: getWidenedType(ScalarTy: RType, VF: ReduxWidth), FMF, CostKind);
            }
          }
        } else {
          // A reduction is already accounted for elsewhere; this vector adds
          // only one vector arithmetic op (possibly an FMA) plus a cast.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              u: std::make_pair(x&: RedTy, y: true));
          VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
          InstructionCost FMACost = InstructionCost::getInvalid();
          if (RdxKind == RecurKind::FAdd) {
            // Check if the reduction operands can be converted to FMA.
            SmallVector<Value *> Ops;
            FastMathFlags FMF;
            FMF.set();
            for (Value *RdxVal : ReducedVals) {
              // Multi-use values cannot be folded into the FMA; give up.
              if (!RdxVal->hasOneUse()) {
                Ops.clear();
                break;
              }
              if (auto *FPCI = dyn_cast<FPMathOperator>(Val: RdxVal))
                FMF &= FPCI->getFastMathFlags();
              Ops.push_back(Elt: RdxVal->user_back());
            }
            if (!Ops.empty()) {
              FMACost = canConvertToFMA(VL: Ops, S: getSameOpcode(VL: Ops, TLI), DT, DL,
                                        TTI&: *TTI, TLI);
              if (FMACost.isValid()) {
                // Calculate actual FMAD cost.
                IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
                                            {RVecTy, RVecTy, RVecTy}, FMF);
                FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);

                LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
                // Also, exclude vector fmul cost.
                InstructionCost FMulCost = TTI->getArithmeticInstrCost(
                    Opcode: Instruction::FMul, Ty: RVecTy, CostKind);
                LLVM_DEBUG(dbgs()
                           << "Minus vector FMul cost: " << FMulCost << "\n");
                FMACost -= FMulCost;
              }
            }
          }
          if (FMACost.isValid())
            VectorCost += FMACost;
          else
            VectorCost +=
                TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: RVecTy, CostKind);
          if (RType != RedTy) {
            // Add the cost of casting between the narrowed and root types.
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        return TTI->getArithmeticInstrCost(Opcode: RdxOpcode, Ty: ScalarTy, CostKind);
      });
      break;
    }
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin: {
      Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK: RdxKind);
      if (!AllConsts) {
        if (DoesRequireReductionOp) {
          VectorCost = TTI->getMinMaxReductionCost(IID: Id, Ty: VectorTy, FMF, CostKind);
        } else {
          // Check if the previous reduction already exists and account it as
          // series of operations + single reduction.
          Type *RedTy = VectorTy->getElementType();
          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
              u: std::make_pair(x&: RedTy, y: true));
          VectorType *RVecTy = getWidenedType(ScalarTy: RType, VF: ReduxWidth);
          IntrinsicCostAttributes ICA(Id, RVecTy, {RVecTy, RVecTy}, FMF);
          VectorCost += TTI->getIntrinsicInstrCost(ICA, CostKind);
          if (RType != RedTy) {
            // Add the cost of casting between the narrowed and root types.
            unsigned Opcode = Instruction::Trunc;
            if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
              Opcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
            VectorCost += TTI->getCastInstrCost(
                Opcode, Dst: VectorTy, Src: RVecTy, CCH: TTI::CastContextHint::None, CostKind);
          }
        }
      }
      ScalarCost = EvaluateScalarCost([&]() {
        IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
        return TTI->getIntrinsicInstrCost(ICA, CostKind);
      });
      break;
    }
    default:
      llvm_unreachable("Expected arithmetic or min/max reduction operation");
    }

    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
                      << " for reduction of " << shortBundleName(ReducedVals)
                      << " (It is a splitting reduction)\n");
    return VectorCost - ScalarCost;
  }
26777
  /// Splits the values, stored in VectorValuesAndScales, into registers/free
  /// sub-registers, combines them with the given reduction operation as a
  /// vector operation and then performs single (small enough) reduction.
  /// Each entry of VectorValuesAndScales is (vector, repeat-count, signed?,
  /// already-reduced?). Returns the final combined reduction value.
  Value *emitReduction(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                       Type *DestTy) {
    Value *ReducedSubTree = nullptr;
    // Creates reduction and combines with the previous reduction.
    auto CreateSingleOp = [&](Value *Vec, unsigned Scale, bool IsSigned,
                              bool ReducedInTree) {
      Value *Rdx = createSingleOp(Builder, TTI, Vec, Scale, IsSigned, DestTy,
                                  ReducedInTree);
      if (ReducedSubTree)
        ReducedSubTree = createOp(Builder, RdxKind, LHS: ReducedSubTree, RHS: Rdx,
                                  Name: "op.rdx", ReductionOps);
      else
        ReducedSubTree = Rdx;
    };
    if (VectorValuesAndScales.size() == 1) {
      // Single vector: no cross-register combining needed.
      const auto &[Vec, Scale, IsSigned, ReducedInTree] =
          VectorValuesAndScales.front();
      CreateSingleOp(Vec, Scale, IsSigned, ReducedInTree);
      return ReducedSubTree;
    }
    // Scales Vec using given Cnt scale factor and then performs vector combine
    // with previous value of VecOp.
    Value *VecRes = nullptr;
    // Signedness of the value currently held in VecRes (used for casts).
    bool VecResSignedness = false;
    auto CreateVecOp = [&](Value *Vec, unsigned Cnt, bool IsSigned,
                           bool ReducedInTree) {
      if (ReducedInTree) {
        // Value was already reduced inside the tree: fold it in scalarly.
        CreateSingleOp(Vec, Cnt, IsSigned, ReducedInTree);
        return;
      }
      Type *ScalarTy = Vec->getType()->getScalarType();
      // Scale Vec using given Cnt scale factor.
      if (Cnt > 1) {
        ElementCount EC = cast<VectorType>(Val: Vec->getType())->getElementCount();
        switch (RdxKind) {
        case RecurKind::Add: {
          if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy) {
            // i1 add-reduction becomes a ctpop later; repeat the mask lanes
            // Cnt times via a shuffle instead of multiplying.
            unsigned VF = getNumElements(Ty: Vec->getType());
            LLVM_DEBUG(dbgs() << "SLP: ctpop " << Cnt << "of " << Vec
                              << ". (HorRdx)\n");
            SmallVector<int> Mask(Cnt * VF, PoisonMaskElem);
            for (unsigned I : seq<unsigned>(Size: Cnt))
              std::iota(first: std::next(x: Mask.begin(), n: VF * I),
                        last: std::next(x: Mask.begin(), n: VF * (I + 1)), value: 0);
            ++NumVectorInstructions;
            Vec = Builder.CreateShuffleVector(V: Vec, Mask);
            break;
          }
          // res = mul vv, n
          if (ScalarTy != DestTy->getScalarType())
            Vec = Builder.CreateIntCast(
                V: Vec, DestTy: getWidenedType(ScalarTy: DestTy, VF: getNumElements(Ty: Vec->getType())),
                isSigned: IsSigned);
          Value *Scale = ConstantVector::getSplat(
              EC, Elt: ConstantInt::get(Ty: DestTy->getScalarType(), V: Cnt));
          LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateMul(LHS: Vec, RHS: Scale);
          break;
        }
        case RecurKind::Xor: {
          // res = n % 2 ? 0 : vv
          // An even number of xors of the same value cancels to zero.
          LLVM_DEBUG(dbgs()
                     << "SLP: Xor " << Cnt << "of " << Vec << ". (HorRdx)\n");
          if (Cnt % 2 == 0)
            Vec = Constant::getNullValue(Ty: Vec->getType());
          break;
        }
        case RecurKind::FAdd: {
          // res = fmul v, n
          Value *Scale =
              ConstantVector::getSplat(EC, Elt: ConstantFP::get(Ty: ScalarTy, V: Cnt));
          LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of " << Vec
                            << ". (HorRdx)\n");
          ++NumVectorInstructions;
          Vec = Builder.CreateFMul(L: Vec, R: Scale);
          break;
        }
        case RecurKind::And:
        case RecurKind::Or:
        case RecurKind::SMax:
        case RecurKind::SMin:
        case RecurKind::UMax:
        case RecurKind::UMin:
        case RecurKind::FMax:
        case RecurKind::FMin:
        case RecurKind::FMaximum:
        case RecurKind::FMinimum:
          // res = vv
          // Idempotent reductions: repeating a value does not change the
          // result, so no scaling is required.
          break;
        case RecurKind::Sub:
        case RecurKind::AddChainWithSubs:
        case RecurKind::Mul:
        case RecurKind::FMul:
        case RecurKind::FMulAdd:
        case RecurKind::AnyOf:
        case RecurKind::FindIV:
        case RecurKind::FindLast:
        case RecurKind::FMaxNum:
        case RecurKind::FMinNum:
        case RecurKind::FMaximumNum:
        case RecurKind::FMinimumNum:
        case RecurKind::None:
          llvm_unreachable("Unexpected reduction kind for repeated scalar.");
        }
      }
      // Combine Vec with the previous VecOp.
      if (!VecRes) {
        VecRes = Vec;
        VecResSignedness = IsSigned;
      } else {
        ++NumVectorInstructions;
        if (ScalarTy == Builder.getInt1Ty() && ScalarTy != DestTy &&
            VecRes->getType()->getScalarType() == Builder.getInt1Ty()) {
          // Handle ctpop.
          // Concatenate both i1 masks so the final popcount sees all bits.
          unsigned VecResVF = getNumElements(Ty: VecRes->getType());
          unsigned VecVF = getNumElements(Ty: Vec->getType());
          SmallVector<int> Mask(VecResVF + VecVF, PoisonMaskElem);
          std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
          // Ensure that VecRes is always larger than Vec
          if (VecResVF < VecVF) {
            std::swap(a&: VecRes, b&: Vec);
            std::swap(a&: VecResVF, b&: VecVF);
          }
          if (VecResVF != VecVF) {
            // Widen the narrower vector with poison lanes to match VecRes.
            SmallVector<int> ResizeMask(VecResVF, PoisonMaskElem);
            std::iota(first: ResizeMask.begin(), last: std::next(x: ResizeMask.begin(), n: VecVF), value: 0);
            Vec = Builder.CreateShuffleVector(V: Vec, Mask: ResizeMask);
          }
          VecRes = Builder.CreateShuffleVector(V1: VecRes, V2: Vec, Mask, Name: "rdx.op");
          return;
        }
        // Normalize both operands to DestTy's scalar type before combining.
        if (VecRes->getType()->getScalarType() != DestTy->getScalarType()) {
          assert(getNumElements(VecRes->getType()) % getNumElements(DestTy) ==
                     0 &&
                 "Expected the number of elements in VecRes to be a multiple "
                 "of the number of elements in DestTy");
          VecRes = Builder.CreateIntCast(
              V: VecRes,
              DestTy: getWidenedType(ScalarTy: DestTy->getScalarType(),
                                     VF: getNumElements(Ty: VecRes->getType())),
              isSigned: VecResSignedness);
        }
        if (ScalarTy != DestTy->getScalarType())
          Vec = Builder.CreateIntCast(
              V: Vec,
              DestTy: getWidenedType(ScalarTy: DestTy->getScalarType(),
                                     VF: getNumElements(Ty: Vec->getType())),
              isSigned: IsSigned);
        unsigned VecResVF = getNumElements(Ty: VecRes->getType());
        unsigned VecVF = getNumElements(Ty: Vec->getType());
        // Ensure that VecRes is always larger than Vec
        if (VecResVF < VecVF) {
          std::swap(a&: VecRes, b&: Vec);
          std::swap(a&: VecResVF, b&: VecVF);
        }
        // extract + op + insert
        // For mismatched widths, combine only the low VecVF lanes and put
        // the result back into the wider vector.
        Value *Op = VecRes;
        if (VecResVF != VecVF)
          Op = createExtractVector(Builder, Vec: VecRes, SubVecVF: VecVF, /*Index=*/0);
        Op = createOp(Builder, RdxKind, LHS: Op, RHS: Vec, Name: "rdx.op", ReductionOps);
        if (VecResVF != VecVF)
          Op = createInsertVector(Builder, Vec: VecRes, V: Op, /*Index=*/0);
        VecRes = Op;
      }
    };
    for (auto [Vec, Scale, IsSigned, ReducedInTree] : VectorValuesAndScales)
      CreateVecOp(Vec, Scale, IsSigned, ReducedInTree);
    // Finally reduce the combined vector down to a single scalar value.
    CreateSingleOp(VecRes, /*Scale=*/1, /*IsSigned=*/false,
                   /*ReducedInTree=*/false);

    return ReducedSubTree;
  }
26955
26956 /// Emit a horizontal reduction of the vectorized value.
26957 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
26958 const TargetTransformInfo *TTI, Type *DestTy) {
26959 assert(VectorizedValue && "Need to have a vectorized tree node");
26960 assert(RdxKind != RecurKind::FMulAdd &&
26961 "A call to the llvm.fmuladd intrinsic is not handled yet");
26962
26963 auto *FTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
26964 if (FTy->getScalarType() == Builder.getInt1Ty() &&
26965 RdxKind == RecurKind::Add &&
26966 DestTy->getScalarType() != FTy->getScalarType()) {
26967 // Convert vector_reduce_add(ZExt(<n x i1>)) to
26968 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
26969 Value *V = Builder.CreateBitCast(
26970 V: VectorizedValue, DestTy: Builder.getIntNTy(N: FTy->getNumElements()));
26971 ++NumVectorInstructions;
26972 return Builder.CreateUnaryIntrinsic(ID: Intrinsic::ctpop, V);
26973 }
26974 ++NumVectorInstructions;
26975 return createSimpleReduction(B&: Builder, Src: VectorizedValue, RdxKind);
26976 }
26977
  /// Emits optimized code for unique scalar value reused \p Cnt times.
  /// The vectorized reduction contains the value once; the \p Cnt repeats
  /// are folded back algebraically (add -> mul by Cnt, fadd -> fmul, xor ->
  /// parity) instead of widening the vector.
  Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
                               unsigned Cnt) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    if (Cnt == 1)
      return VectorizedValue;
    switch (RdxKind) {
    case RecurKind::Add: {
      // res = mul vv, n
      Value *Scale =
          ConstantInt::get(Ty: VectorizedValue->getType(), V: Cnt,
                           /*IsSigned=*/false, /*ImplicitTrunc=*/true);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
    }
    case RecurKind::Xor: {
      // res = n % 2 ? 0 : vv
      // An even number of xors of the same value cancels out to zero.
      LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
                        << ". (HorRdx)\n");
      if (Cnt % 2 == 0)
        return Constant::getNullValue(Ty: VectorizedValue->getType());
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // res = fmul v, n
      Value *Scale = ConstantFP::get(Ty: VectorizedValue->getType(), V: Cnt);
      LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateFMul(L: VectorizedValue, R: Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // res = vv
      // Idempotent reductions: repeats do not change the result.
      return VectorizedValue;
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindIV:
    case RecurKind::FindLast:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for repeated scalar.");
    }
    return nullptr;
  }
27040
  /// Emits actual operation for the scalar identity values, found during
  /// horizontal reduction analysis.
  /// \p SameValuesCounter maps each (original) scalar to the number of times
  /// it is repeated in the reduction; \p TrackedToOrig maps tracked values
  /// back to their original scalars. Per-lane repeat counts are folded in as
  /// a constant-vector multiply (add/fadd), a parity shuffle (xor), or a
  /// no-op (idempotent min/max/and/or kinds).
  Value *
  emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
                const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
                const DenseMap<Value *, Value *> &TrackedToOrig) {
    assert(IsSupportedHorRdxIdentityOp &&
           "The optimization of matched scalar identity horizontal reductions "
           "must be supported.");
    ArrayRef<Value *> VL = R.getRootNodeScalars();
    auto *VTy = cast<FixedVectorType>(Val: VectorizedValue->getType());
    if (VTy->getElementType() != VL.front()->getType()) {
      // Cast back from the min-bitwidth type to the original scalar type.
      VectorizedValue = Builder.CreateIntCast(
          V: VectorizedValue,
          DestTy: getWidenedType(ScalarTy: VL.front()->getType(), VF: VTy->getNumElements()),
          isSigned: R.isSignedMinBitwidthRootNode());
    }
    switch (RdxKind) {
    case RecurKind::Add: {
      // root = mul prev_root, <1, 1, n, 1>
      // Each lane is multiplied by its repeat count.
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
        Vals.push_back(Elt: ConstantInt::get(Ty: V->getType(), V: Cnt, /*IsSigned=*/false));
      }
      auto *Scale = ConstantVector::get(V: Vals);
      LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
                        << VectorizedValue << ". (HorRdx)\n");
      return Builder.CreateMul(LHS: VectorizedValue, RHS: Scale);
    }
    case RecurKind::And:
    case RecurKind::Or:
      // No need for multiple or/and(s).
      LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::SMax:
    case RecurKind::SMin:
    case RecurKind::UMax:
    case RecurKind::UMin:
    case RecurKind::FMax:
    case RecurKind::FMin:
    case RecurKind::FMaximum:
    case RecurKind::FMinimum:
      // No need for multiple min/max(s) of the same value.
      LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
                        << ". (HorRdx)\n");
      return VectorizedValue;
    case RecurKind::Xor: {
      // Replace values with even number of repeats with 0, since
      // x xor x = 0.
      // root = shuffle prev_root, zeroinitalizer, <0, 1, 2, vf, 4, vf, 5, 6,
      // 7>, if elements 4th and 6th elements have even number of repeats.
      SmallVector<int> Mask(
          cast<FixedVectorType>(Val: VectorizedValue->getType())->getNumElements(),
          PoisonMaskElem);
      std::iota(first: Mask.begin(), last: Mask.end(), value: 0);
      bool NeedShuffle = false;
      for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
        Value *V = VL[I];
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
        if (Cnt % 2 == 0) {
          // Index VF selects the zero lane from the second shuffle operand.
          Mask[I] = VF;
          NeedShuffle = true;
        }
      }
      LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
                                              : Mask) dbgs()
                                         << I << " ";
                 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
      if (NeedShuffle)
        VectorizedValue = Builder.CreateShuffleVector(
            V1: VectorizedValue,
            V2: ConstantVector::getNullValue(Ty: VectorizedValue->getType()), Mask);
      return VectorizedValue;
    }
    case RecurKind::FAdd: {
      // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
      SmallVector<Constant *> Vals;
      for (Value *V : VL) {
        unsigned Cnt = SameValuesCounter.lookup(Key: TrackedToOrig.at(Val: V));
        Vals.push_back(Elt: ConstantFP::get(Ty: V->getType(), V: Cnt));
      }
      auto *Scale = ConstantVector::get(V: Vals);
      return Builder.CreateFMul(L: VectorizedValue, R: Scale);
    }
    case RecurKind::Sub:
    case RecurKind::AddChainWithSubs:
    case RecurKind::Mul:
    case RecurKind::FMul:
    case RecurKind::FMulAdd:
    case RecurKind::AnyOf:
    case RecurKind::FindIV:
    case RecurKind::FindLast:
    case RecurKind::FMaxNum:
    case RecurKind::FMinNum:
    case RecurKind::FMaximumNum:
    case RecurKind::FMinimumNum:
    case RecurKind::None:
      llvm_unreachable("Unexpected reduction kind for reused scalars.");
    }
    return nullptr;
  }
27144};
27145} // end anonymous namespace
27146
/// Gets recurrence kind from the specified value.
static RecurKind getRdxKind(Value *V) {
  // File-scope convenience wrapper around the HorizontalReduction helper.
  return HorizontalReduction::getRdxKind(V);
}
27151static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
27152 if (auto *IE = dyn_cast<InsertElementInst>(Val: InsertInst))
27153 return cast<FixedVectorType>(Val: IE->getType())->getNumElements();
27154
27155 unsigned AggregateSize = 1;
27156 auto *IV = cast<InsertValueInst>(Val: InsertInst);
27157 Type *CurrentType = IV->getType();
27158 do {
27159 if (auto *ST = dyn_cast<StructType>(Val: CurrentType)) {
27160 for (auto *Elt : ST->elements())
27161 if (Elt != ST->getElementType(N: 0)) // check homogeneity
27162 return std::nullopt;
27163 AggregateSize *= ST->getNumElements();
27164 CurrentType = ST->getElementType(N: 0);
27165 } else if (auto *AT = dyn_cast<ArrayType>(Val: CurrentType)) {
27166 AggregateSize *= AT->getNumElements();
27167 CurrentType = AT->getElementType();
27168 } else if (auto *VT = dyn_cast<FixedVectorType>(Val: CurrentType)) {
27169 AggregateSize *= VT->getNumElements();
27170 return AggregateSize;
27171 } else if (CurrentType->isSingleValueType()) {
27172 return AggregateSize;
27173 } else {
27174 return std::nullopt;
27175 }
27176 } while (true);
27177}
27178
/// Walks a chain of insertelement/insertvalue instructions ending at
/// \p LastInsertInst, recording each inserted scalar (by its flattened
/// aggregate index) in \p BuildVectorOpds and the inserting instruction in
/// \p InsertElts. \p OperandOffset is the index offset of the current
/// aggregate nesting level. The walk stops at instructions already deleted
/// by the vectorizer (\p R), at unknown indices, or at multi-use inserts.
static void findBuildAggregateRec(Instruction *LastInsertInst,
                                  TargetTransformInfo *TTI,
                                  SmallVectorImpl<Value *> &BuildVectorOpds,
                                  SmallVectorImpl<Value *> &InsertElts,
                                  unsigned OperandOffset, const BoUpSLP &R) {
  do {
    // Operand 1 is the value being inserted at this level.
    Value *InsertedOperand = LastInsertInst->getOperand(i: 1);
    std::optional<unsigned> OperandIndex =
        getElementIndex(Inst: LastInsertInst, Offset: OperandOffset);
    if (!OperandIndex || R.isDeleted(I: LastInsertInst))
      return;
    if (isa<InsertElementInst, InsertValueInst>(Val: InsertedOperand)) {
      // Nested aggregate: recurse with the adjusted flattened index.
      findBuildAggregateRec(LastInsertInst: cast<Instruction>(Val: InsertedOperand), TTI,
                            BuildVectorOpds, InsertElts, OperandOffset: *OperandIndex, R);

    } else {
      BuildVectorOpds[*OperandIndex] = InsertedOperand;
      InsertElts[*OperandIndex] = LastInsertInst;
    }
    // Continue up the chain through the aggregate operand (operand 0).
    LastInsertInst = dyn_cast<Instruction>(Val: LastInsertInst->getOperand(i: 0));
  } while (LastInsertInst != nullptr &&
           isa<InsertValueInst, InsertElementInst>(Val: LastInsertInst) &&
           LastInsertInst->hasOneUse());
}
27203
27204/// Recognize construction of vectors like
27205/// %ra = insertelement <4 x float> poison, float %s0, i32 0
27206/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
27207/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
27208/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
27209/// starting from the last insertelement or insertvalue instruction.
27210///
27211/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
27212/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
27213/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
27214///
27215/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
27216///
27217/// \return true if it matches.
27218static bool findBuildAggregate(Instruction *LastInsertInst,
27219 TargetTransformInfo *TTI,
27220 SmallVectorImpl<Value *> &BuildVectorOpds,
27221 SmallVectorImpl<Value *> &InsertElts,
27222 const BoUpSLP &R) {
27223
27224 assert((isa<InsertElementInst>(LastInsertInst) ||
27225 isa<InsertValueInst>(LastInsertInst)) &&
27226 "Expected insertelement or insertvalue instruction!");
27227
27228 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
27229 "Expected empty result vectors!");
27230
27231 std::optional<unsigned> AggregateSize = getAggregateSize(InsertInst: LastInsertInst);
27232 if (!AggregateSize)
27233 return false;
27234 BuildVectorOpds.resize(N: *AggregateSize);
27235 InsertElts.resize(N: *AggregateSize);
27236
27237 findBuildAggregateRec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, OperandOffset: 0, R);
27238 llvm::erase(C&: BuildVectorOpds, V: nullptr);
27239 llvm::erase(C&: InsertElts, V: nullptr);
27240 if (BuildVectorOpds.size() >= 2)
27241 return true;
27242
27243 return false;
27244}
27245
27246/// Try and get a reduction instruction from a phi node.
27247///
27248/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
27249/// if they come from either \p ParentBB or a containing loop latch.
27250///
27251/// \returns A candidate reduction value if possible, or \code nullptr \endcode
27252/// if not possible.
27253static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
27254 BasicBlock *ParentBB, LoopInfo *LI) {
27255 // There are situations where the reduction value is not dominated by the
27256 // reduction phi. Vectorizing such cases has been reported to cause
27257 // miscompiles. See PR25787.
27258 auto DominatedReduxValue = [&](Value *R) {
27259 return isa<Instruction>(Val: R) &&
27260 DT->dominates(A: P->getParent(), B: cast<Instruction>(Val: R)->getParent());
27261 };
27262
27263 Instruction *Rdx = nullptr;
27264
27265 // Return the incoming value if it comes from the same BB as the phi node.
27266 if (P->getIncomingBlock(i: 0) == ParentBB) {
27267 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
27268 } else if (P->getIncomingBlock(i: 1) == ParentBB) {
27269 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
27270 }
27271
27272 if (Rdx && DominatedReduxValue(Rdx))
27273 return Rdx;
27274
27275 // Otherwise, check whether we have a loop latch to look at.
27276 Loop *BBL = LI->getLoopFor(BB: ParentBB);
27277 if (!BBL)
27278 return nullptr;
27279 BasicBlock *BBLatch = BBL->getLoopLatch();
27280 if (!BBLatch)
27281 return nullptr;
27282
27283 // There is a loop latch, return the incoming value if it comes from
27284 // that. This reduction pattern occasionally turns up.
27285 if (P->getIncomingBlock(i: 0) == BBLatch) {
27286 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 0));
27287 } else if (P->getIncomingBlock(i: 1) == BBLatch) {
27288 Rdx = dyn_cast<Instruction>(Val: P->getIncomingValue(i: 1));
27289 }
27290
27291 if (Rdx && DominatedReduxValue(Rdx))
27292 return Rdx;
27293
27294 return nullptr;
27295}
27296
27297static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
27298 if (match(V: I, P: m_BinOp(L: m_Value(V&: V0), R: m_Value(V&: V1))))
27299 return true;
27300 if (match(V: I, P: m_FMaxNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27301 return true;
27302 if (match(V: I, P: m_FMinNum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27303 return true;
27304 if (match(V: I, P: m_FMaximum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27305 return true;
27306 if (match(V: I, P: m_FMinimum(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27307 return true;
27308 if (match(V: I, P: m_Intrinsic<Intrinsic::smax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27309 return true;
27310 if (match(V: I, P: m_Intrinsic<Intrinsic::smin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27311 return true;
27312 if (match(V: I, P: m_Intrinsic<Intrinsic::umax>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27313 return true;
27314 if (match(V: I, P: m_Intrinsic<Intrinsic::umin>(Op0: m_Value(V&: V0), Op1: m_Value(V&: V1))))
27315 return true;
27316 return false;
27317}
27318
27319/// We could have an initial reduction that is not an add.
27320/// r *= v1 + v2 + v3 + v4
27321/// In such a case start looking for a tree rooted in the first '+'.
27322/// \Returns the new root if found, which may be nullptr if not an instruction.
27323static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
27324 Instruction *Root) {
27325 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
27326 isa<IntrinsicInst>(Root)) &&
27327 "Expected binop, select, or intrinsic for reduction matching");
27328 Value *LHS =
27329 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root));
27330 Value *RHS =
27331 Root->getOperand(i: HorizontalReduction::getFirstOperandIndex(I: Root) + 1);
27332 if (LHS == Phi)
27333 return dyn_cast<Instruction>(Val: RHS);
27334 if (RHS == Phi)
27335 return dyn_cast<Instruction>(Val: LHS);
27336 return nullptr;
27337}
27338
27339/// \p Returns the first operand of \p I that does not match \p Phi. If
27340/// operand is not an instruction it returns nullptr.
27341static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
27342 Value *Op0 = nullptr;
27343 Value *Op1 = nullptr;
27344 if (!matchRdxBop(I, V0&: Op0, V1&: Op1))
27345 return nullptr;
27346 return dyn_cast<Instruction>(Val: Op0 == Phi ? Op1 : Op0);
27347}
27348
27349/// \Returns true if \p I is a candidate instruction for reduction vectorization.
27350static bool isReductionCandidate(Instruction *I) {
27351 bool IsSelect = match(V: I, P: m_Select(C: m_Value(), L: m_Value(), R: m_Value()));
27352 Value *B0 = nullptr, *B1 = nullptr;
27353 bool IsBinop = matchRdxBop(I, V0&: B0, V1&: B1);
27354 return IsBinop || IsSelect;
27355}
27356
/// Tries to match and vectorize a horizontal reduction rooted at \p Root in
/// block \p BB. \p P, when non-null, is a phi node associated with the
/// reduction; it enables using the operands of \p Root as new reduction
/// seeds. Instructions that could not be vectorized as part of a reduction
/// are appended to \p PostponedInsts so the caller can retry them as plain
/// vectorization seeds later. \returns true if anything was vectorized.
bool SLPVectorizerPass::vectorizeHorReduction(
    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
  if (!ShouldVectorizeHor)
    return false;
  // Only use the operands of Root as new seeds when Root is a binary op
  // associated with a phi node.
  bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Val: Root);

  if (Root->getParent() != BB || isa<PHINode>(Val: Root))
    return false;

  // If we can find a secondary reduction root, use that instead.
  auto SelectRoot = [&]() {
    if (TryOperandsAsNewSeeds && isReductionCandidate(I: Root) &&
        HorizontalReduction::getRdxKind(V: Root) != RecurKind::None)
      if (Instruction *NewRoot = tryGetSecondaryReductionRoot(Phi: P, Root))
        return NewRoot;
    return Root;
  };

  // Start analysis starting from Root instruction. If horizontal reduction is
  // found, try to vectorize it. If it is not a horizontal reduction or
  // vectorization is not possible or not effective, and currently analyzed
  // instruction is a binary operation, try to vectorize the operands, using
  // pre-order DFS traversal order. If the operands were not vectorized, repeat
  // the same procedure considering each operand as a possible root of the
  // horizontal reduction.
  // Interrupt the process if the Root instruction itself was vectorized or all
  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
  // If a horizontal reduction was not matched or vectorized we collect
  // instructions for possible later attempts for vectorization.
  // NOTE(review): despite the name, this is a std::queue (FIFO), so the
  // traversal is effectively breadth-first.
  std::queue<std::pair<Instruction *, unsigned>> Stack;
  Stack.emplace(args: SelectRoot(), args: 0);
  SmallPtrSet<Value *, 8> VisitedInstrs;
  bool Res = false;
  // Tries to vectorize Inst as a reduction root; returns the value the
  // reduction was reduced to, or nullptr if no reduction was matched or
  // vectorized.
  auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
    if (R.isAnalyzedReductionRoot(I: Inst))
      return nullptr;
    if (!isReductionCandidate(I: Inst))
      return nullptr;
    HorizontalReduction HorRdx;
    if (!HorRdx.matchAssociativeReduction(R, Root: Inst, SE&: *SE, DL: *DL, TLI: *TLI))
      return nullptr;
    return HorRdx.tryToReduce(V&: R, DL: *DL, TTI, TLI: *TLI, AC, DT&: *DT);
  };
  // Records FutureSeed (or, when it is the root, its non-phi operand) for a
  // later plain vectorization attempt. Returns false only if no usable seed
  // could be found.
  auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
    if (TryOperandsAsNewSeeds && FutureSeed == Root) {
      FutureSeed = getNonPhiOperand(I: Root, Phi: P);
      if (!FutureSeed)
        return false;
    }
    // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
    // analysis is done separately.
    if (!isa<CmpInst, InsertElementInst, InsertValueInst>(Val: FutureSeed))
      PostponedInsts.push_back(Elt: FutureSeed);
    return true;
  };

  while (!Stack.empty()) {
    Instruction *Inst;
    unsigned Level;
    std::tie(args&: Inst, args&: Level) = Stack.front();
    Stack.pop();
    // Do not try to analyze instruction that has already been vectorized.
    // This may happen when we vectorize instruction operands on a previous
    // iteration while stack was populated before that happened.
    if (R.isDeleted(I: Inst))
      continue;
    if (Value *VectorizedV = TryToReduce(Inst)) {
      Res = true;
      if (auto *I = dyn_cast<Instruction>(Val: VectorizedV)) {
        // Try to find another reduction.
        Stack.emplace(args&: I, args&: Level);
        continue;
      }
      if (R.isDeleted(I: Inst))
        continue;
    } else {
      // We could not vectorize `Inst` so try to use it as a future seed.
      if (!TryAppendToPostponedInsts(Inst)) {
        assert(Stack.empty() && "Expected empty stack");
        break;
      }
    }

    // Try to vectorize operands.
    // Continue analysis for the instruction from the same basic block only to
    // save compile time.
    if (++Level < RecursionMaxDepth)
      for (auto *Op : Inst->operand_values())
        if (VisitedInstrs.insert(Ptr: Op).second)
          if (auto *I = dyn_cast<Instruction>(Val: Op))
            // Do not try to vectorize CmpInst operands, this is done
            // separately.
            if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(Val: I) &&
                !R.isDeleted(I) && I->getParent() == BB)
              Stack.emplace(args&: I, args&: Level);
  }
  return Res;
}
27456
/// Tries to vectorize a two-element bundle rooted at binary/compare
/// instruction \p I: either as a small (two-lane) reduction over its
/// operands, or as a plain two-element list. Also considers "skipping" one
/// level of a single-use binary operand to find a better pairing.
/// \returns true if anything was vectorized.
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(Val: I) || isa<VectorType>(Val: I->getType()))
    return false;
  // Skip potential FMA candidates.
  if ((I->getOpcode() == Instruction::FAdd ||
       I->getOpcode() == Instruction::FSub) &&
      canConvertToFMA(VL: I, S: getSameOpcode(VL: I, TLI: *TLI), DT&: *DT, DL: *DL, TTI&: *TTI, TLI: *TLI)
          .isValid())
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
  auto *Op1 = dyn_cast<Instruction>(Val: I->getOperand(i: 1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(I: Op0) || R.isDeleted(I: Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Args&: Op0, Args&: Op1);

  auto *A = dyn_cast<BinaryOperator>(Val: Op0);
  auto *B = dyn_cast<BinaryOperator>(Val: Op1);
  // Try to skip B. Pair A with B's operands instead, if B has a single use
  // (so skipping it leaves no extra scalar work).
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 0));
    auto *B1 = dyn_cast<BinaryOperator>(Val: B->getOperand(i_nocapture: 1));
    if (B0 && B0->getParent() == P && !R.isDeleted(I: B0))
      Candidates.emplace_back(Args&: A, Args&: B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(I: B1))
      Candidates.emplace_back(Args&: A, Args&: B1);
  }
  // Try to skip A. Symmetric to the case above.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 0));
    auto *A1 = dyn_cast<BinaryOperator>(Val: A->getOperand(i_nocapture: 1));
    if (A0 && A0->getParent() == P && !R.isDeleted(I: A0))
      Candidates.emplace_back(Args&: A0, Args&: B);
    if (A1 && A1->getParent() == P && !R.isDeleted(I: A1))
      Candidates.emplace_back(Args&: A1, Args&: B);
  }

  // Tries to reduce Ops (the operands of Inst) as a small reduction, gated by
  // a cost comparison against the scalar form. Returns true on success.
  auto TryToReduce = [this, &R, &TTI = *TTI](Instruction *Inst,
                                             ArrayRef<Value *> Ops) {
    if (!isReductionCandidate(I: Inst))
      return false;
    Type *Ty = Inst->getType();
    if (!isValidElementType(Ty) || Ty->isPointerTy())
      return false;
    HorizontalReduction HorRdx(Inst, Ops);
    if (!HorRdx.matchReductionForOperands())
      return false;
    // Check the cost of operations.
    // Scalar cost = cost of extracting all lanes + the scalar op itself.
    VectorType *VecTy = getWidenedType(ScalarTy: Ty, VF: Ops.size());
    constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
    InstructionCost ScalarCost =
        TTI.getScalarizationOverhead(
            Ty: VecTy, DemandedElts: APInt::getAllOnes(numBits: getNumElements(Ty: VecTy)), /*Insert=*/false,
            /*Extract=*/true, CostKind) +
        TTI.getInstructionCost(U: Inst, CostKind);
    InstructionCost RedCost;
    switch (::getRdxKind(V: Inst)) {
    case RecurKind::Add:
    case RecurKind::Mul:
    case RecurKind::Or:
    case RecurKind::And:
    case RecurKind::Xor:
    case RecurKind::FAdd:
    case RecurKind::FMul: {
      FastMathFlags FMF;
      if (auto *FPCI = dyn_cast<FPMathOperator>(Val: Inst))
        FMF = FPCI->getFastMathFlags();
      RedCost = TTI.getArithmeticReductionCost(Opcode: Inst->getOpcode(), Ty: VecTy, FMF,
                                               CostKind);
      break;
    }
    default:
      // Other reduction kinds are not handled here.
      return false;
    }
    // Only reduce if the vector reduction is strictly cheaper.
    if (RedCost >= ScalarCost)
      return false;

    return HorRdx.tryToReduce(V&: R, DL: *DL, TTI: &TTI, TLI: *TLI, AC, DT&: *DT) != nullptr;
  };
  if (Candidates.size() == 1)
    return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList(VL: {Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  // Note: the reduction path is only attempted for the original (Op0, Op1)
  // pair, which is always candidate 0.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return (*BestCandidate == 0 &&
          TryToReduce(I, {Candidates[*BestCandidate].first,
                          Candidates[*BestCandidate].second})) ||
         tryToVectorizeList(VL: {Candidates[*BestCandidate].first,
                               Candidates[*BestCandidate].second},
                            R);
}
27560
27561bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
27562 BasicBlock *BB, BoUpSLP &R) {
27563 SmallVector<WeakTrackingVH> PostponedInsts;
27564 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
27565 Res |= tryToVectorize(Insts: PostponedInsts, R);
27566 return Res;
27567}
27568
27569bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
27570 BoUpSLP &R) {
27571 bool Res = false;
27572 for (Value *V : Insts)
27573 if (auto *Inst = dyn_cast<Instruction>(Val: V); Inst && !R.isDeleted(I: Inst))
27574 Res |= tryToVectorize(I: Inst, R);
27575 return Res;
27576}
27577
/// Tries to vectorize the aggregate-build sequence ending at insertvalue
/// \p IVI as a vectorized list of the inserted operands. With \p MaxVFOnly
/// set, two-element sequences are skipped (reduction matching gets the first
/// chance at those). \returns true on successful vectorization.
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
                                                 BasicBlock *BB, BoUpSLP &R,
                                                 bool MaxVFOnly) {
  // The aggregate type must be representable as a vector.
  if (!R.canMapToVector(T: IVI->getType()))
    return false;

  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<Value *, 16> BuildVectorInsts;
  if (!findBuildAggregate(LastInsertInst: IVI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R))
    return false;

  if (MaxVFOnly && BuildVectorOpds.size() == 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
             << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
  // Aggregate value is unlikely to be processed in vector register.
  return tryToVectorizeList(VL: BuildVectorOpds, R, MaxVFOnly);
}
27601
/// Tries to vectorize the buildvector sequence ending at insertelement
/// \p IEI as a vectorized list of the inserted scalars. With \p MaxVFOnly
/// set, two-element buildvectors are skipped (reduction matching gets the
/// first chance at those). \returns true on successful vectorization.
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                   BasicBlock *BB, BoUpSLP &R,
                                                   bool MaxVFOnly) {
  SmallVector<Value *, 16> BuildVectorInsts;
  SmallVector<Value *, 16> BuildVectorOpds;
  SmallVector<int> Mask;
  // Bail out if this is not a buildvector, or if all the inserted operands
  // are extractelements/undefs that already form a fixed-vector shuffle
  // (presumably a pattern handled elsewhere — nothing to gain here).
  if (!findBuildAggregate(LastInsertInst: IEI, TTI, BuildVectorOpds, InsertElts&: BuildVectorInsts, R) ||
      (all_of(Range&: BuildVectorOpds, P: IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(VL: BuildVectorOpds, Mask, AC)))
    return false;

  if (MaxVFOnly && BuildVectorInsts.size() == 2) {
    R.getORE()->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
             << "Cannot SLP vectorize list: only 2 elements of buildvector, "
                "trying reduction first.";
    });
    return false;
  }
  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
  return tryToVectorizeList(VL: BuildVectorInsts, R, MaxVFOnly);
}
27624
/// Groups the (sorted) values in \p Incoming into runs of mutually compatible
/// elements (per \p AreCompatible) and feeds each run to
/// \p TryToVectorizeHelper. Elements left over from unsuccessful runs of the
/// same type are pooled into a second, type-only attempt; with \p MaxVFOnly
/// a final small-vector retry over that pool is performed. \returns true if
/// any call to the helper succeeded.
template <typename T>
static bool tryToVectorizeSequence(
    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
    function_ref<bool(ArrayRef<T *>, T *)> AreCompatible,
    function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
    bool MaxVFOnly, BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, parent, operands.
  stable_sort(Incoming, Comparator);

  // Try to vectorize elements base on their type.
  SmallVector<T *> Candidates;
  SmallVector<T *> VL;
  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
       VL.clear()) {
    // Look for the next elements with the same type, parent and operand
    // kinds.
    auto *I = dyn_cast<Instruction>(*IncIt);
    if (!I || R.isDeleted(I)) {
      ++IncIt;
      continue;
    }
    auto *SameTypeIt = IncIt;
    // Advance past deleted/non-instruction entries too, so a stale element
    // does not split an otherwise-compatible run.
    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
                               R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
                               AreCompatible(VL, *SameTypeIt))) {
      auto *I = dyn_cast<Instruction>(*SameTypeIt);
      ++SameTypeIt;
      if (I && !R.isDeleted(I))
        VL.push_back(cast<T>(I));
    }

    // Try to vectorize them.
    unsigned NumElts = VL.size();
    LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                      << NumElts << ")\n");
    // The vectorization is a 3-state attempt:
    // 1. Try to vectorize instructions with the same/alternate opcodes with the
    // size of maximal register at first.
    // 2. Try to vectorize remaining instructions with the same type, if
    // possible. This may result in the better vectorization results rather than
    // if we try just to vectorize instructions with the same/alternate opcodes.
    // 3. Final attempt to try to vectorize all instructions with the
    // same/alternate ops only, this may result in some extra final
    // vectorization.
    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
      // Success start over because instructions might have been changed.
      Changed = true;
      // Keep the survivors (not deleted by the successful attempt) as
      // candidates for the type-only pass.
      VL.swap(Candidates);
      Candidates.clear();
      for (T *V : VL) {
        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
          Candidates.push_back(V);
      }
    } else {
      /// \Returns the minimum number of elements that we will attempt to
      /// vectorize.
      auto GetMinNumElements = [&R](Value *V) {
        unsigned EltSize = R.getVectorElementSize(V);
        return std::max(a: 2U, b: R.getMaxVecRegSize() / EltSize);
      };
      // Pool short runs of the same type for a combined later attempt.
      if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
        for (T *V : VL) {
          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
            Candidates.push_back(V);
        }
      }
    }
    // Final attempt to vectorize instructions with the same types.
    // Triggered once the type of the upcoming run changes (or at the end).
    if (Candidates.size() > 1 &&
        (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
      if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
        // Success start over because instructions might have been changed.
        Changed = true;
      } else if (MaxVFOnly) {
        // Try to vectorize using small vectors.
        SmallVector<T *> VL;
        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
             VL.clear()) {
          auto *I = dyn_cast<Instruction>(*It);
          if (!I || R.isDeleted(I)) {
            ++It;
            continue;
          }
          auto *SameTypeIt = It;
          // NOTE(review): AreCompatible takes (ArrayRef<T *>, T *); the call
          // below relies on ArrayRef's implicit single-element conversion, so
          // compatibility here is checked against *It only.
          while (SameTypeIt != End &&
                 (!isa<Instruction>(*SameTypeIt) ||
                  R.isDeleted(I: cast<Instruction>(*SameTypeIt)) ||
                  AreCompatible(*SameTypeIt, *It))) {
            auto *I = dyn_cast<Instruction>(*SameTypeIt);
            ++SameTypeIt;
            if (I && !R.isDeleted(I))
              VL.push_back(cast<T>(I));
          }
          unsigned NumElts = VL.size();
          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                  /*MaxVFOnly=*/false))
            Changed = true;
          It = SameTypeIt;
        }
      }
      Candidates.clear();
    }

    // Start over at the next instruction of a different type (or the end).
    IncIt = SameTypeIt;
  }
  return Changed;
}
27736
/// Compare two cmp instructions. If IsCompatibility is true, function returns
/// true if 2 cmps have same/swapped predicates and most compatible
/// corresponding operands. If IsCompatibility is false, function implements
/// strict weak ordering relation between two cmp instructions, returning true
/// if the first instruction is "less" than the second, i.e. its predicate is
/// less than the predicate of the second or the operands IDs are less than
/// the operands IDs of the second cmp instruction.
template <bool IsCompatibility>
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
                       const DominatorTree &DT) {
  assert(isValidElementType(V->getType()) &&
         isValidElementType(V2->getType()) &&
         "Expected valid element types only.");
  if (V == V2)
    return IsCompatibility;
  auto *CI1 = cast<CmpInst>(Val: V);
  auto *CI2 = cast<CmpInst>(Val: V2);
  // Order primarily by operand type ID, then by scalar bit width.
  if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() <
      CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
    return !IsCompatibility;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getTypeID() >
      CI2->getOperand(i_nocapture: 0)->getType()->getTypeID())
    return false;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() <
      CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
    return !IsCompatibility;
  if (CI1->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits() >
      CI2->getOperand(i_nocapture: 0)->getType()->getScalarSizeInBits())
    return false;
  // Canonicalize each predicate to min(pred, swapped pred) so that a cmp and
  // its operand-swapped form compare as equal.
  CmpInst::Predicate Pred1 = CI1->getPredicate();
  CmpInst::Predicate Pred2 = CI2->getPredicate();
  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(pred: Pred1);
  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(pred: Pred2);
  CmpInst::Predicate BasePred1 = std::min(a: Pred1, b: SwapPred1);
  CmpInst::Predicate BasePred2 = std::min(a: Pred2, b: SwapPred2);
  if (BasePred1 < BasePred2)
    return !IsCompatibility;
  if (BasePred1 > BasePred2)
    return false;
  // Compare operands.
  // If a cmp uses the swapped predicate, walk its operands in reverse so that
  // corresponding operands line up.
  bool CI1Preds = Pred1 == BasePred1;
  bool CI2Preds = Pred2 == BasePred1;
  for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
    auto *Op1 = CI1->getOperand(i_nocapture: CI1Preds ? I : E - I - 1);
    auto *Op2 = CI2->getOperand(i_nocapture: CI2Preds ? I : E - I - 1);
    if (Op1 == Op2)
      continue;
    if (Op1->getValueID() < Op2->getValueID())
      return !IsCompatibility;
    if (Op1->getValueID() > Op2->getValueID())
      return false;
    if (auto *I1 = dyn_cast<Instruction>(Val: Op1))
      if (auto *I2 = dyn_cast<Instruction>(Val: Op2)) {
        if (IsCompatibility) {
          if (I1->getParent() != I2->getParent())
            return false;
        } else {
          // Try to compare nodes with same parent.
          DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(BB: I1->getParent());
          DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(BB: I2->getParent());
          if (!NodeI1)
            return NodeI2 != nullptr;
          if (!NodeI2)
            return false;
          assert((NodeI1 == NodeI2) ==
                     (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
                 "Different nodes should have different DFS numbers");
          if (NodeI1 != NodeI2)
            return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
        }
        InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI);
        if (S && (IsCompatibility || !S.isAltShuffle()))
          continue;
        if (IsCompatibility)
          return false;
        if (I1->getOpcode() != I2->getOpcode())
          return I1->getOpcode() < I2->getOpcode();
      }
  }
  return IsCompatibility;
}
27818
/// Tries to vectorize the compare instructions in \p CmpInsts of block \p BB:
/// first as reduction roots (via their operands), then as individual
/// two-element bundles, and finally as sorted lists of compatible compares.
/// \returns true if anything was vectorized.
template <typename ItT>
bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
                                          BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  // Try to find reductions first.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    for (Value *Op : I->operands())
      if (auto *RootOp = dyn_cast<Instruction>(Val: Op)) {
        Changed |= vectorizeRootInstruction(P: nullptr, Root: RootOp, BB, R);
        // Stop looking at operands once the cmp itself is gone.
        if (R.isDeleted(I))
          break;
      }
  }
  // Try to vectorize operands as vector bundles.
  for (CmpInst *I : CmpInsts) {
    if (R.isDeleted(I))
      continue;
    Changed |= tryToVectorize(I, R);
  }
  // Try to vectorize list of compares.
  // Sort by type, compare predicate, etc.
  auto CompareSorter = [&](Value *V, Value *V2) {
    if (V == V2)
      return false;
    return compareCmp<false>(V, V2, TLI&: *TLI, DT: *DT);
  };

  // Compatibility is checked against the last element collected so far.
  auto AreCompatibleCompares = [&](ArrayRef<Value *> VL, Value *V1) {
    if (VL.empty() || VL.back() == V1)
      return true;
    return compareCmp<true>(V: V1, V2: VL.back(), TLI&: *TLI, DT: *DT);
  };

  SmallVector<Value *> Vals;
  for (Instruction *V : CmpInsts)
    if (!R.isDeleted(I: V) && isValidElementType(Ty: getValueType(V)))
      Vals.push_back(Elt: V);
  if (Vals.size() <= 1)
    return Changed;
  Changed |= tryToVectorizeSequence<Value>(
      Vals, CompareSorter, AreCompatibleCompares,
      [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
        // Exclude possible reductions from other blocks.
        bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
          return any_of(V->users(), [V](User *U) {
            auto *Select = dyn_cast<SelectInst>(Val: U);
            return Select &&
                   Select->getParent() != cast<Instruction>(Val: V)->getParent();
          });
        });
        if (ArePossiblyReducedInOtherBlock)
          return false;
        return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
      },
      /*MaxVFOnly=*/true, R);
  return Changed;
}
27878
/// Tries to vectorize the postponed insertelement/insertvalue instructions in
/// \p Instructions (processed in reverse order) using three passes per
/// instruction: buildvector matching at max VF only, reduction matching, and
/// finally unrestricted buildvector matching. Clears \p Instructions.
/// \returns true if anything was vectorized.
bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
                                         BasicBlock *BB, BoUpSLP &R) {
  assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
         "This function only accepts Insert instructions");
  bool OpsChanged = false;
  SmallVector<WeakTrackingVH> PostponedInsts;
  for (auto *I : reverse(C&: Instructions)) {
    // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
    if (R.isDeleted(I) || isa<CmpInst>(Val: I))
      continue;
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
      OpsChanged |=
          vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/true);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
      OpsChanged |=
          vectorizeInsertElementInst(IEI: LastInsertElem, BB, R, /*MaxVFOnly=*/true);
    }
    // pass2 - try to vectorize reductions only
    if (R.isDeleted(I))
      continue;
    OpsChanged |= vectorizeHorReduction(P: nullptr, Root: I, BB, R, PostponedInsts);
    if (R.isDeleted(I) || isa<CmpInst>(Val: I))
      continue;
    // pass3 - try to match and vectorize a buildvector sequence.
    if (auto *LastInsertValue = dyn_cast<InsertValueInst>(Val: I)) {
      OpsChanged |=
          vectorizeInsertValueInst(IVI: LastInsertValue, BB, R, /*MaxVFOnly=*/false);
    } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(Val: I)) {
      OpsChanged |= vectorizeInsertElementInst(IEI: LastInsertElem, BB, R,
                                               /*MaxVFOnly=*/false);
    }
  }
  // Now try to vectorize postponed instructions.
  OpsChanged |= tryToVectorize(Insts: PostponedInsts, R);

  Instructions.clear();
  return OpsChanged;
}
27917
27918bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
27919 bool Changed = false;
27920 SmallVector<Value *, 4> Incoming;
27921 SmallPtrSet<Value *, 16> VisitedInstrs;
27922 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
27923 // node. Allows better to identify the chains that can be vectorized in the
27924 // better way.
27925 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
27926 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
27927 assert(isValidElementType(V1->getType()) &&
27928 isValidElementType(V2->getType()) &&
27929 "Expected vectorizable types only.");
27930 if (V1 == V2)
27931 return false;
27932 // It is fine to compare type IDs here, since we expect only vectorizable
27933 // types, like ints, floats and pointers, we don't care about other type.
27934 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
27935 return true;
27936 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
27937 return false;
27938 if (V1->getType()->getScalarSizeInBits() <
27939 V2->getType()->getScalarSizeInBits())
27940 return true;
27941 if (V1->getType()->getScalarSizeInBits() >
27942 V2->getType()->getScalarSizeInBits())
27943 return false;
27944 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
27945 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
27946 if (Opcodes1.size() < Opcodes2.size())
27947 return true;
27948 if (Opcodes1.size() > Opcodes2.size())
27949 return false;
27950 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
27951 {
27952 // Instructions come first.
27953 auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]);
27954 auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I]);
27955 if (I1 && I2) {
27956 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
27957 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
27958 if (!NodeI1)
27959 return NodeI2 != nullptr;
27960 if (!NodeI2)
27961 return false;
27962 assert((NodeI1 == NodeI2) ==
27963 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27964 "Different nodes should have different DFS numbers");
27965 if (NodeI1 != NodeI2)
27966 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27967 InstructionsState S = getSameOpcode(VL: {I1, I2}, TLI: *TLI);
27968 if (S && !S.isAltShuffle() && I1->getOpcode() == I2->getOpcode()) {
27969 const auto *E1 = dyn_cast<ExtractElementInst>(Val: I1);
27970 const auto *E2 = dyn_cast<ExtractElementInst>(Val: I2);
27971 if (!E1 || !E2)
27972 continue;
27973
27974 // Sort on ExtractElementInsts primarily by vector operands. Prefer
27975 // program order of the vector operands.
27976 const auto *V1 = dyn_cast<Instruction>(Val: E1->getVectorOperand());
27977 const auto *V2 = dyn_cast<Instruction>(Val: E2->getVectorOperand());
27978 if (V1 != V2) {
27979 if (V1 && !V2)
27980 return true;
27981 if (!V1 && V2)
27982 return false;
27983 DomTreeNodeBase<BasicBlock> *NodeI1 =
27984 DT->getNode(BB: V1->getParent());
27985 DomTreeNodeBase<BasicBlock> *NodeI2 =
27986 DT->getNode(BB: V2->getParent());
27987 if (!NodeI1)
27988 return NodeI2 != nullptr;
27989 if (!NodeI2)
27990 return false;
27991 assert((NodeI1 == NodeI2) ==
27992 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
27993 "Different nodes should have different DFS numbers");
27994 if (NodeI1 != NodeI2)
27995 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
27996 return V1->comesBefore(Other: V2);
27997 }
27998 // If we have the same vector operand, try to sort by constant
27999 // index.
28000 std::optional<unsigned> Id1 = getExtractIndex(E: E1);
28001 std::optional<unsigned> Id2 = getExtractIndex(E: E2);
28002 // Bring constants to the top
28003 if (Id1 && !Id2)
28004 return true;
28005 if (!Id1 && Id2)
28006 return false;
28007 // First elements come first.
28008 if (Id1 && Id2)
28009 return *Id1 < *Id2;
28010
28011 continue;
28012 }
28013 if (I1->getOpcode() == I2->getOpcode())
28014 continue;
28015 return I1->getOpcode() < I2->getOpcode();
28016 }
28017 if (I1)
28018 return true;
28019 if (I2)
28020 return false;
28021 }
28022 {
28023 // Non-undef constants come next.
28024 bool C1 = isa<Constant>(Val: Opcodes1[I]) && !isa<UndefValue>(Val: Opcodes1[I]);
28025 bool C2 = isa<Constant>(Val: Opcodes2[I]) && !isa<UndefValue>(Val: Opcodes2[I]);
28026 if (C1 && C2)
28027 continue;
28028 if (C1)
28029 return true;
28030 if (C2)
28031 return false;
28032 }
28033 bool U1 = isa<UndefValue>(Val: Opcodes1[I]);
28034 bool U2 = isa<UndefValue>(Val: Opcodes2[I]);
28035 {
28036 // Non-constant non-instructions come next.
28037 if (!U1 && !U2) {
28038 auto ValID1 = Opcodes1[I]->getValueID();
28039 auto ValID2 = Opcodes2[I]->getValueID();
28040 if (ValID1 == ValID2)
28041 continue;
28042 if (ValID1 < ValID2)
28043 return true;
28044 if (ValID1 > ValID2)
28045 return false;
28046 }
28047 if (!U1)
28048 return true;
28049 if (!U2)
28050 return false;
28051 }
28052 // Undefs come last.
28053 assert(U1 && U2 && "The only thing left should be undef & undef.");
28054 }
28055 return false;
28056 };
28057 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef<Value *> VL,
28058 Value *V1) {
28059 if (VL.empty() || V1 == VL.back())
28060 return true;
28061 Value *V2 = VL.back();
28062 if (V1->getType() != V2->getType())
28063 return false;
28064 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
28065 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
28066 if (Opcodes1.size() != Opcodes2.size())
28067 return false;
28068 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
28069 // Undefs are compatible with any other value.
28070 if (isa<UndefValue>(Val: Opcodes1[I]) || isa<UndefValue>(Val: Opcodes2[I]))
28071 continue;
28072 if (auto *I1 = dyn_cast<Instruction>(Val: Opcodes1[I]))
28073 if (auto *I2 = dyn_cast<Instruction>(Val: Opcodes2[I])) {
28074 if (R.isDeleted(I: I1) || R.isDeleted(I: I2))
28075 return false;
28076 if (I1->getParent() != I2->getParent())
28077 return false;
28078 if (getSameOpcode(VL: {I1, I2}, TLI: *TLI))
28079 continue;
28080 return false;
28081 }
28082 if (isa<Constant>(Val: Opcodes1[I]) && isa<Constant>(Val: Opcodes2[I]))
28083 continue;
28084 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
28085 return false;
28086 }
28087 return true;
28088 };
28089
28090 bool HaveVectorizedPhiNodes = false;
28091 do {
28092 // Collect the incoming values from the PHIs.
28093 Incoming.clear();
28094 for (Instruction &I : *BB) {
28095 auto *P = dyn_cast<PHINode>(Val: &I);
28096 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
28097 break;
28098
28099 // No need to analyze deleted, vectorized and non-vectorizable
28100 // instructions.
28101 if (!VisitedInstrs.count(Ptr: P) && !R.isDeleted(I: P) &&
28102 isValidElementType(Ty: P->getType()))
28103 Incoming.push_back(Elt: P);
28104 }
28105
28106 if (Incoming.size() <= 1)
28107 break;
28108
28109 // Find the corresponding non-phi nodes for better matching when trying to
28110 // build the tree.
28111 for (Value *V : Incoming) {
28112 SmallVectorImpl<Value *> &Opcodes =
28113 PHIToOpcodes.try_emplace(Key: V).first->getSecond();
28114 if (!Opcodes.empty())
28115 continue;
28116 SmallVector<Value *, 4> Nodes(1, V);
28117 SmallPtrSet<Value *, 4> Visited;
28118 while (!Nodes.empty()) {
28119 auto *PHI = cast<PHINode>(Val: Nodes.pop_back_val());
28120 if (!Visited.insert(Ptr: PHI).second)
28121 continue;
28122 for (Value *V : PHI->incoming_values()) {
28123 if (auto *PHI1 = dyn_cast<PHINode>(Val: (V))) {
28124 Nodes.push_back(Elt: PHI1);
28125 continue;
28126 }
28127 Opcodes.emplace_back(Args&: V);
28128 }
28129 }
28130 }
28131
28132 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
28133 Incoming, Comparator: PHICompare, AreCompatible: AreCompatiblePHIs,
28134 TryToVectorizeHelper: [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
28135 return tryToVectorizeList(VL: Candidates, R, MaxVFOnly);
28136 },
28137 /*MaxVFOnly=*/true, R);
28138 Changed |= HaveVectorizedPhiNodes;
28139 if (HaveVectorizedPhiNodes && any_of(Range&: PHIToOpcodes, P: [&](const auto &P) {
28140 auto *PHI = dyn_cast<PHINode>(P.first);
28141 return !PHI || R.isDeleted(I: PHI);
28142 }))
28143 PHIToOpcodes.clear();
28144 VisitedInstrs.insert_range(R&: Incoming);
28145 } while (HaveVectorizedPhiNodes);
28146
28147 VisitedInstrs.clear();
28148
28149 InstSetVector PostProcessInserts;
28150 SmallSetVector<CmpInst *, 8> PostProcessCmps;
28151 // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
28152 // also vectorizes `PostProcessCmps`.
28153 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
28154 bool Changed = vectorizeInserts(Instructions&: PostProcessInserts, BB, R);
28155 if (VectorizeCmps) {
28156 Changed |= vectorizeCmpInsts(CmpInsts: reverse(C&: PostProcessCmps), BB, R);
28157 PostProcessCmps.clear();
28158 }
28159 PostProcessInserts.clear();
28160 return Changed;
28161 };
28162 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
28163 auto IsInPostProcessInstrs = [&](Instruction *I) {
28164 if (auto *Cmp = dyn_cast<CmpInst>(Val: I))
28165 return PostProcessCmps.contains(key: Cmp);
28166 return isa<InsertElementInst, InsertValueInst>(Val: I) &&
28167 PostProcessInserts.contains(key: I);
28168 };
28169 // Returns true if `I` is an instruction without users, like terminator, or
28170 // function call with ignored return value, store. Ignore unused instructions
28171 // (basing on instruction type, except for CallInst and InvokeInst).
28172 auto HasNoUsers = [](Instruction *I) {
28173 return I->use_empty() &&
28174 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(Val: I));
28175 };
28176 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
28177 // Skip instructions with scalable type. The num of elements is unknown at
28178 // compile-time for scalable type.
28179 if (isa<ScalableVectorType>(Val: It->getType()))
28180 continue;
28181
28182 // Skip instructions marked for the deletion.
28183 if (R.isDeleted(I: &*It))
28184 continue;
28185 // We may go through BB multiple times so skip the one we have checked.
28186 if (!VisitedInstrs.insert(Ptr: &*It).second) {
28187 if (HasNoUsers(&*It) &&
28188 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
28189 // We would like to start over since some instructions are deleted
28190 // and the iterator may become invalid value.
28191 Changed = true;
28192 It = BB->begin();
28193 E = BB->end();
28194 }
28195 continue;
28196 }
28197
28198 // Try to vectorize reductions that use PHINodes.
28199 if (PHINode *P = dyn_cast<PHINode>(Val&: It)) {
28200 // Check that the PHI is a reduction PHI.
28201 if (P->getNumIncomingValues() == 2) {
28202 // Try to match and vectorize a horizontal reduction.
28203 Instruction *Root = getReductionInstr(DT, P, ParentBB: BB, LI);
28204 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
28205 Changed = true;
28206 It = BB->begin();
28207 E = BB->end();
28208 continue;
28209 }
28210 }
28211 // Try to vectorize the incoming values of the PHI, to catch reductions
28212 // that feed into PHIs.
28213 for (unsigned I : seq<unsigned>(Size: P->getNumIncomingValues())) {
28214 // Skip if the incoming block is the current BB for now. Also, bypass
28215 // unreachable IR for efficiency and to avoid crashing.
28216 // TODO: Collect the skipped incoming values and try to vectorize them
28217 // after processing BB.
28218 if (BB == P->getIncomingBlock(i: I) ||
28219 !DT->isReachableFromEntry(A: P->getIncomingBlock(i: I)))
28220 continue;
28221
28222 // Postponed instructions should not be vectorized here, delay their
28223 // vectorization.
28224 if (auto *PI = dyn_cast<Instruction>(Val: P->getIncomingValue(i: I));
28225 PI && !IsInPostProcessInstrs(PI)) {
28226 bool Res =
28227 vectorizeRootInstruction(P: nullptr, Root: PI, BB: P->getIncomingBlock(i: I), R);
28228 Changed |= Res;
28229 if (Res && R.isDeleted(I: P)) {
28230 It = BB->begin();
28231 E = BB->end();
28232 break;
28233 }
28234 }
28235 }
28236 continue;
28237 }
28238
28239 if (HasNoUsers(&*It)) {
28240 bool OpsChanged = false;
28241 auto *SI = dyn_cast<StoreInst>(Val&: It);
28242 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
28243 if (SI) {
28244 auto *I = Stores.find(Key: getUnderlyingObject(V: SI->getPointerOperand()));
28245 // Try to vectorize chain in store, if this is the only store to the
28246 // address in the block.
28247 // TODO: This is just a temporarily solution to save compile time. Need
28248 // to investigate if we can safely turn on slp-vectorize-hor-store
28249 // instead to allow lookup for reduction chains in all non-vectorized
28250 // stores (need to check side effects and compile time).
28251 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
28252 SI->getValueOperand()->hasOneUse();
28253 }
28254 if (TryToVectorizeRoot) {
28255 for (auto *V : It->operand_values()) {
28256 // Postponed instructions should not be vectorized here, delay their
28257 // vectorization.
28258 if (auto *VI = dyn_cast<Instruction>(Val: V);
28259 VI && !IsInPostProcessInstrs(VI))
28260 // Try to match and vectorize a horizontal reduction.
28261 OpsChanged |= vectorizeRootInstruction(P: nullptr, Root: VI, BB, R);
28262 }
28263 }
28264 // Start vectorization of post-process list of instructions from the
28265 // top-tree instructions to try to vectorize as many instructions as
28266 // possible.
28267 OpsChanged |=
28268 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
28269 if (OpsChanged) {
28270 // We would like to start over since some instructions are deleted
28271 // and the iterator may become invalid value.
28272 Changed = true;
28273 It = BB->begin();
28274 E = BB->end();
28275 continue;
28276 }
28277 }
28278
28279 if (isa<InsertElementInst, InsertValueInst>(Val: It))
28280 PostProcessInserts.insert(X: &*It);
28281 else if (isa<CmpInst>(Val: It))
28282 PostProcessCmps.insert(X: cast<CmpInst>(Val: &*It));
28283 }
28284
28285 return Changed;
28286}
28287
/// Tries to vectorize the *index* computations of the previously collected
/// getelementptr lists (one list per map entry in the `GEPs` member). The
/// GEPs themselves are not vectorized here; only their single, non-constant
/// index operands are bundled and handed to tryToVectorizeList.
/// \param BB currently unreferenced in this body; kept for interface
///        symmetry with the other per-block vectorization entry points.
/// \param R the SLP tree builder/vectorizer used for queries and codegen.
/// \returns true if any bundle of GEP indices was vectorized.
bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {
    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                      << Entry.second.size() << ".\n");

    // Process the GEP list in chunks suitable for the target's supported
    // vector size. If a vector register can't hold 1 element, we are done. We
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
    // Find the first not-yet-deleted GEP; its first index supplies the element
    // type whose size determines the chunk width below.
    auto *It = find_if(Range&: Entry.second, P: [&](GetElementPtrInst *GEP) {
      return !R.isDeleted(I: GEP);
    });
    if (It == Entry.second.end())
      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
    unsigned EltSize = R.getVectorElementSize(V: *(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;

    unsigned MaxElts = MaxVecRegSize / EltSize;
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
      auto Len = std::min<unsigned>(a: BE - BI, b: MaxElts);
      ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);

      // Initialize a set a candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(llvm::from_range, GEPList);

      // Some of the candidates may have already been vectorized after we
      // initially collected them or their index is optimized to constant value.
      // If so, they are marked as deleted, so remove them from the set of
      // candidates.
      Candidates.remove_if(P: [&R](Value *I) {
        return R.isDeleted(I: cast<Instruction>(Val: I)) ||
               isa<Constant>(Val: cast<GetElementPtrInst>(Val: I)->idx_begin()->get());
      });

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
      // Both loops bail out early once fewer than two candidates remain,
      // since a bundle needs at least two elements.
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = GEPList[I];
        if (!Candidates.count(key: GEPI))
          continue;
        const SCEV *SCEVI = SE->getSCEV(V: GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = GEPList[J];
          if (!Candidates.count(key: GEPJ))
            continue;
          const SCEV *SCEVJ = SE->getSCEV(V: GEPList[J]);
          if (isa<SCEVConstant>(Val: SE->getMinusSCEV(LHS: SCEVI, RHS: SCEVJ))) {
            // GEPI and GEPJ differ by a compile-time constant: drop both, as
            // one address is trivially derivable from the other.
            Candidates.remove(X: GEPI);
            Candidates.remove(X: GEPJ);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            // Identical index value: keep only the first occurrence so the
            // bundle contains unique index computations.
            Candidates.remove(X: GEPJ);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(Val: V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(VL: Bundle, R);
    }
  }
  return Changed;
}
28389
/// Tries to vectorize the previously collected store groups (the `Stores`
/// member, one group per map entry). Each group is sorted so that compatible
/// stores become adjacent, partitioned into compatible runs, and each run is
/// handed to vectorizeStores.
/// \param R the SLP tree builder/vectorizer used for queries and codegen.
/// \returns true if any store chain was vectorized.
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Sort by type, base pointers and values operand. Value operands must be
  // compatible (have the same opcode, same parent), otherwise it is
  // definitely not profitable to try to vectorize them.
  // This is a strict weak ordering: each criterion returns only on a strict
  // inequality and falls through to the next tie-breaker otherwise.
  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
    // 1) Value operand type ID.
    if (V->getValueOperand()->getType()->getTypeID() <
        V2->getValueOperand()->getType()->getTypeID())
      return true;
    if (V->getValueOperand()->getType()->getTypeID() >
        V2->getValueOperand()->getType()->getTypeID())
      return false;
    // 2) Pointer operand type ID.
    if (V->getPointerOperandType()->getTypeID() <
        V2->getPointerOperandType()->getTypeID())
      return true;
    if (V->getPointerOperandType()->getTypeID() >
        V2->getPointerOperandType()->getTypeID())
      return false;
    // 3) Scalar bit width of the stored value.
    if (V->getValueOperand()->getType()->getScalarSizeInBits() <
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return true;
    if (V->getValueOperand()->getType()->getScalarSizeInBits() >
        V2->getValueOperand()->getType()->getScalarSizeInBits())
      return false;
    // UndefValues are compatible with all other values.
    // 4) For instruction value operands: order by the dominator-tree DFS-in
    //    number of the defining block, then by opcode, so stores of values
    //    defined in the same/nearby blocks with matching opcodes cluster.
    auto *I1 = dyn_cast<Instruction>(Val: V->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand());
    if (I1 && I2) {
      DomTreeNodeBase<llvm::BasicBlock> *NodeI1 = DT->getNode(BB: I1->getParent());
      DomTreeNodeBase<llvm::BasicBlock> *NodeI2 = DT->getNode(BB: I2->getParent());
      assert(NodeI1 && "Should only process reachable instructions");
      assert(NodeI2 && "Should only process reachable instructions");
      assert((NodeI1 == NodeI2) ==
                 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeI1 != NodeI2)
        return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
      return I1->getOpcode() < I2->getOpcode();
    }
    // Instructions sort before non-instruction values.
    if (I1 && !I2)
      return true;
    if (!I1 && I2)
      return false;
    // 5) Final tie-breaker: the ValueID of the stored value.
    return V->getValueOperand()->getValueID() <
           V2->getValueOperand()->getValueID();
  };

  // Tracks whether all instruction value operands seen so far in the current
  // candidate run share one parent block; reset when a new run starts (i.e.
  // when AreCompatibleStores is called with an empty list). State is carried
  // across successive AreCompatibleStores invocations on purpose.
  bool SameParent = true;
  // Predicate deciding whether store V1 can extend the candidate run VL.
  auto AreCompatibleStores = [&](ArrayRef<StoreInst *> VL, StoreInst *V1) {
    if (VL.empty()) {
      SameParent = true;
      return true;
    }
    // Only compare against the most recently accepted store.
    StoreInst *V2 = VL.back();
    if (V1 == V2)
      return true;
    if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
      return false;
    if (V1->getPointerOperandType() != V2->getPointerOperandType())
      return false;
    // Undefs are compatible with any other value.
    if (isa<UndefValue>(Val: V1->getValueOperand()) ||
        isa<UndefValue>(Val: V2->getValueOperand()))
      return true;
    if (isa<Constant>(Val: V1->getValueOperand()) &&
        isa<Constant>(Val: V2->getValueOperand()))
      return true;
    // Check if the operands of the stores can be vectorized. They can be
    // vectorized, if they have compatible operands or have operands, which can
    // be vectorized as copyables.
    auto *I1 = dyn_cast<Instruction>(Val: V1->getValueOperand());
    auto *I2 = dyn_cast<Instruction>(Val: V2->getValueOperand());
    if (I1 || I2) {
      // Accept only tail-following non-compatible values for now.
      // TODO: investigate if it is possible to vectorize incompatible values,
      // if the copyables are first in the list.
      if (I1 && !I2)
        return false;
      SameParent &= I1 && I2 && I1->getParent() == I2->getParent();
      // Build the would-be value list (existing run + V1) and ask the
      // compatibility analysis whether it forms a vectorizable state.
      SmallVector<Value *> NewVL(VL.size() + 1);
      for (auto [SI, V] : zip(t&: VL, u&: NewVL))
        V = SI->getValueOperand();
      NewVL.back() = V1->getValueOperand();
      InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI);
      InstructionsState S = Analysis.buildInstructionsState(
          VL: NewVL, R, TryCopyableElementsVectorization: VectorizeCopyableElements, /*WithProfitabilityCheck=*/true,
          /*SkipSameCodeCheck=*/!SameParent);
      if (S)
        return true;
      if (!SameParent)
        return false;
    }
    // Fall back to comparing the ValueID of the stored values.
    return V1->getValueOperand()->getValueID() ==
           V2->getValueOperand()->getValueID();
  };

  // Attempt to sort and vectorize each of the store-groups.
  // `Attempted` is threaded through vectorizeStores as its `Visited` set,
  // presumably to avoid re-analyzing identical candidate bundles across
  // groups — confirm against vectorizeStores' definition.
  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
  for (auto &Pair : Stores) {
    // A chain needs at least two stores.
    if (Pair.second.size() < 2)
      continue;

    LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                      << Pair.second.size() << ".\n");

    if (!isValidElementType(Ty: Pair.second.front()->getValueOperand()->getType()))
      continue;

    // Reverse stores to do bottom-to-top analysis. This is important if the
    // values are stores to the same addresses several times, in this case need
    // to follow the stores order (reversed to meet the memory dependecies).
    SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
                                            Pair.second.rend());
    Changed |= tryToVectorizeSequence<StoreInst>(
        Incoming&: ReversedStores, Comparator: StoreSorter, AreCompatible: AreCompatibleStores,
        TryToVectorizeHelper: [&](ArrayRef<StoreInst *> Candidates, bool) {
          return vectorizeStores(Stores: Candidates, R, Visited&: Attempted);
        },
        /*MaxVFOnly=*/false, R);
  }
  return Changed;
}
28512