1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
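//
// For example, conceptually (with VF = 4; the remainder iterations are handled
// separately):
//   for (i = 0; i < n; ++i)            for (i = 0; i < n; i += 4)
//     A[i] = B[i] + 1;          ==>      A[i:i+3] = B[i:i+3] + 1;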
17//
// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
// There is an ongoing development effort to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/TargetLibraryInfo.h"
97#include "llvm/Analysis/TargetTransformInfo.h"
98#include "llvm/Analysis/ValueTracking.h"
99#include "llvm/Analysis/VectorUtils.h"
100#include "llvm/IR/Attributes.h"
101#include "llvm/IR/BasicBlock.h"
102#include "llvm/IR/CFG.h"
103#include "llvm/IR/Constant.h"
104#include "llvm/IR/Constants.h"
105#include "llvm/IR/DataLayout.h"
106#include "llvm/IR/DebugInfo.h"
107#include "llvm/IR/DebugLoc.h"
108#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/DiagnosticInfo.h"
110#include "llvm/IR/Dominators.h"
111#include "llvm/IR/Function.h"
112#include "llvm/IR/IRBuilder.h"
113#include "llvm/IR/InstrTypes.h"
114#include "llvm/IR/Instruction.h"
115#include "llvm/IR/Instructions.h"
116#include "llvm/IR/IntrinsicInst.h"
117#include "llvm/IR/Intrinsics.h"
118#include "llvm/IR/MDBuilder.h"
119#include "llvm/IR/Metadata.h"
120#include "llvm/IR/Module.h"
121#include "llvm/IR/Operator.h"
122#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/ProfDataUtils.h"
124#include "llvm/IR/Type.h"
125#include "llvm/IR/Use.h"
126#include "llvm/IR/User.h"
127#include "llvm/IR/Value.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/InstructionCost.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/NativeFormatting.h"
136#include "llvm/Support/raw_ostream.h"
137#include "llvm/Transforms/Utils/BasicBlockUtils.h"
138#include "llvm/Transforms/Utils/InjectTLIMappings.h"
139#include "llvm/Transforms/Utils/Local.h"
140#include "llvm/Transforms/Utils/LoopSimplify.h"
141#include "llvm/Transforms/Utils/LoopUtils.h"
142#include "llvm/Transforms/Utils/LoopVersioning.h"
143#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
144#include "llvm/Transforms/Utils/SizeOpts.h"
145#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
146#include <algorithm>
147#include <cassert>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169const char LLVMLoopVectorizeFollowupVectorized[] =
170 "llvm.loop.vectorize.followup_vectorized";
171const char LLVMLoopVectorizeFollowupEpilogue[] =
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178
179static cl::opt<bool> EnableEpilogueVectorization(
180 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
181 cl::desc("Enable vectorization of epilogue loops."));
182
183static cl::opt<unsigned> EpilogueVectorizationForceVF(
184 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
185 cl::desc("When epilogue vectorization is enabled, and a value greater than "
186 "1 is specified, forces the given VF for all applicable epilogue "
187 "loops."));
188
189static cl::opt<unsigned> EpilogueVectorizationMinVF(
190 "epilogue-vectorization-minimum-VF", cl::Hidden,
191 cl::desc("Only loops with vectorization factor equal to or larger than "
192 "the specified value are considered for epilogue vectorization."));
193
194/// Loops with a known constant trip count below this number are vectorized only
195/// if no scalar iteration overheads are incurred.
196static cl::opt<unsigned> TinyTripCountVectorThreshold(
197 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
198 cl::desc("Loops with a constant trip count that is smaller than this "
199 "value are vectorized only if no scalar iteration overheads "
200 "are incurred."));
201
202static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
203 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
204 cl::desc("The maximum allowed number of runtime memory checks"));
205
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the values below list the
// available strategies. I.e., the vectorizer will try to fold the tail loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, the fallback strategy depends on these values:
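//
// For illustration only: with a trip count of 10 and VF = 4, tail folding runs
// all 3 iterations in the masked vector body, with a lane mask of <1,1,0,0>
// disabling the two excess lanes in the last iteration, instead of running the
// 2 leftover iterations in a scalar epilogue loop. The exact mask generation
// depends on the selected tail-folding style.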
211namespace PreferPredicateTy {
212 enum Option {
213 ScalarEpilogue = 0,
214 PredicateElseScalarEpilogue,
215 PredicateOrDontVectorize
216 };
217} // namespace PreferPredicateTy
218
219static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220 "prefer-predicate-over-epilogue",
221 cl::init(Val: PreferPredicateTy::ScalarEpilogue),
222 cl::Hidden,
223 cl::desc("Tail-folding and predication preferences over creating a scalar "
224 "epilogue loop."),
225 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226 "scalar-epilogue",
227 "Don't tail-predicate loops, create scalar epilogue"),
228 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229 "predicate-else-scalar-epilogue",
230 "prefer tail-folding, create scalar epilogue if tail "
231 "folding fails."),
232 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233 "predicate-dont-vectorize",
234 "prefers tail-folding, don't attempt vectorization if "
235 "tail-folding fails.")));
236
237static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
238 "force-tail-folding-style", cl::desc("Force the tail folding style"),
239 cl::init(Val: TailFoldingStyle::None),
240 cl::values(
241 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
242 clEnumValN(
243 TailFoldingStyle::Data, "data",
244 "Create lane mask for data only, using active.lane.mask intrinsic"),
245 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
246 "data-without-lane-mask",
247 "Create lane mask with compare/stepvector"),
248 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
249 "Create lane mask using active.lane.mask intrinsic, and use "
250 "it for both data and control flow"),
251 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
252 "data-and-control-without-rt-check",
253 "Similar to data-and-control, but remove the runtime check"),
254 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
255 "Use predicated EVL instructions for tail folding. If EVL "
256 "is unsupported, fallback to data-without-lane-mask.")));
257
258static cl::opt<bool> MaximizeBandwidth(
259 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
260 cl::desc("Maximize bandwidth when selecting vectorization factor which "
261 "will be determined by the smallest type in loop."));
262
263static cl::opt<bool> EnableInterleavedMemAccesses(
264 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
265 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
266
267/// An interleave-group may need masking if it resides in a block that needs
268/// predication, or in order to mask away gaps.
269static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
270 "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
271 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
272
273static cl::opt<unsigned> ForceTargetNumScalarRegs(
274 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
275 cl::desc("A flag that overrides the target's number of scalar registers."));
276
277static cl::opt<unsigned> ForceTargetNumVectorRegs(
278 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of vector registers."));
280
281static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
282 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
283 cl::desc("A flag that overrides the target's max interleave factor for "
284 "scalar loops."));
285
286static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
287 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
288 cl::desc("A flag that overrides the target's max interleave factor for "
289 "vectorized loops."));
290
291cl::opt<unsigned> llvm::ForceTargetInstructionCost(
292 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
293 cl::desc("A flag that overrides the target's expected cost for "
294 "an instruction to a single constant value. Mostly "
295 "useful for getting consistent testing."));
296
297static cl::opt<bool> ForceTargetSupportsScalableVectors(
298 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
299 cl::desc(
300 "Pretend that scalable vectors are supported, even if the target does "
301 "not support them. This flag should only be used for testing."));
302
303static cl::opt<unsigned> SmallLoopCost(
304 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
305 cl::desc(
306 "The cost of a loop that is considered 'small' by the interleaver."));
307
308static cl::opt<bool> LoopVectorizeWithBlockFrequency(
309 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
310 cl::desc("Enable the use of the block frequency analysis to access PGO "
311 "heuristics minimizing code growth in cold regions and being more "
312 "aggressive in hot regions."));
313
314// Runtime interleave loops for load/store throughput.
315static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
316 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
317 cl::desc(
318 "Enable runtime interleaving until load/store ports are saturated"));
319
320/// The number of stores in a loop that are allowed to need predication.
321static cl::opt<unsigned> NumberOfStoresToPredicate(
322 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
323 cl::desc("Max number of stores to be predicated behind an if."));
324
325static cl::opt<bool> EnableIndVarRegisterHeur(
326 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
327 cl::desc("Count the induction variable only once when interleaving"));
328
329static cl::opt<bool> EnableCondStoresVectorization(
330 "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
331 cl::desc("Enable if predication of stores during vectorization."));
332
333static cl::opt<unsigned> MaxNestedScalarReductionIC(
334 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
335 cl::desc("The maximum interleave count to use when interleaving a scalar "
336 "reduction in a nested loop."));
337
338static cl::opt<bool>
339 PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
340 cl::Hidden,
341 cl::desc("Prefer in-loop vector reductions, "
342 "overriding the targets preference."));
343
344static cl::opt<bool> ForceOrderedReductions(
345 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
346 cl::desc("Enable the vectorisation of loops with in-order (strict) "
347 "FP reductions"));
348
349static cl::opt<bool> PreferPredicatedReductionSelect(
350 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
351 cl::desc(
352 "Prefer predicating a reduction operation over an after loop select."));
353
354cl::opt<bool> llvm::EnableVPlanNativePath(
355 "enable-vplan-native-path", cl::Hidden,
356 cl::desc("Enable VPlan-native vectorization path with "
357 "support for outer loop vectorization."));
358
359cl::opt<bool>
360 llvm::VerifyEachVPlan("vplan-verify-each",
361#ifdef EXPENSIVE_CHECKS
362 cl::init(true),
363#else
364 cl::init(Val: false),
365#endif
366 cl::Hidden,
367 cl::desc("Verfiy VPlans after VPlan transforms."));
368
369// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
371// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
372// verification of the H-CFGs built.
373static cl::opt<bool> VPlanBuildStressTest(
374 "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
375 cl::desc(
376 "Build VPlan for every supported loop nest in the function and bail "
377 "out right after the build (stress test the VPlan H-CFG construction "
378 "in the VPlan-native vectorization path)."));
379
380cl::opt<bool> llvm::EnableLoopInterleaving(
381 "interleave-loops", cl::init(Val: true), cl::Hidden,
382 cl::desc("Enable loop interleaving in Loop vectorization passes"));
383cl::opt<bool> llvm::EnableLoopVectorization(
384 "vectorize-loops", cl::init(Val: true), cl::Hidden,
385 cl::desc("Run the Loop vectorization passes"));
386
387static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
388 "force-widen-divrem-via-safe-divisor", cl::Hidden,
389 cl::desc(
390 "Override cost based safe divisor widening for div/rem instructions"));
391
392static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
393 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
394 cl::Hidden,
395 cl::desc("Try wider VFs if they enable the use of vector variants"));
396
397static cl::opt<bool> EnableEarlyExitVectorization(
398 "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
399 cl::desc(
400 "Enable vectorization of early exit loops with uncountable exits."));
401
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
403// variables not overflowing do not hold. See `emitSCEVChecks`.
404static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
406// `emitMemRuntimeChecks`.
407static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
409// after prolog. See `emitIterationCountCheck`.
410static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
411
412/// A helper function that returns true if the given type is irregular. The
413/// type is irregular if its allocated size doesn't equal the store size of an
414/// element of the corresponding vector type.
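///
/// For example, with a typical data layout, i1 has a type size of 1 bit but an
/// alloc size of 8 bits, and x86_fp80 has a type size of 80 bits but an alloc
/// size of 96 or 128 bits, so both are irregular; i32 (32 bits either way) is
/// not. These numbers are illustrative; the actual answer comes from \p DL.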
415static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
416 // Determine if an array of N elements of type Ty is "bitcast compatible"
417 // with a <N x Ty> vector.
418 // This is only true if there is no padding between the array elements.
419 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
420}
421
422/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
423/// ElementCount to include loops whose trip count is a function of vscale.
424static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
425 const Loop *L) {
426 return ElementCount::getFixed(MinVal: SE->getSmallConstantTripCount(L));
427}
428
429/// Returns "best known" trip count, which is either a valid positive trip count
430/// or std::nullopt when an estimate cannot be made (including when the trip
431/// count would overflow), for the specified loop \p L as defined by the
432/// following procedure:
433/// 1) Returns exact trip count if it is known.
434/// 2) Returns expected trip count according to profile data if any.
435/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
436/// 4) Returns std::nullopt if all of the above failed.
437static std::optional<ElementCount>
438getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
439 bool CanUseConstantMax = true) {
440 // Check if exact trip count is known.
441 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
442 return ExpectedTC;
443
444 // Check if there is an expected trip count available from profile data.
445 if (LoopVectorizeWithBlockFrequency)
446 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
447 return ElementCount::getFixed(MinVal: *EstimatedTC);
448
449 if (!CanUseConstantMax)
450 return std::nullopt;
451
452 // Check if upper bound estimate is known.
453 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
454 return ElementCount::getFixed(MinVal: ExpectedTC);
455
456 return std::nullopt;
457}
458
459namespace {
460// Forward declare GeneratedRTChecks.
461class GeneratedRTChecks;
462
463using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
464} // namespace
465
466namespace llvm {
467
468AnalysisKey ShouldRunExtraVectorPasses::Key;
469
470/// InnerLoopVectorizer vectorizes loops which contain only one basic
471/// block to a specified vectorization factor (VF).
472/// This class performs the widening of scalars into vectors, or multiple
473/// scalars. This class also implements the following features:
474/// * It inserts an epilogue loop for handling loops that don't have iteration
475/// counts that are known to be a multiple of the vectorization factor.
476/// * It handles the code generation for reduction variables.
477/// * Scalarization (implementation using scalars) of un-vectorizable
478/// instructions.
479/// InnerLoopVectorizer does not perform any vectorization-legality
480/// checks, and relies on the caller to check for the different legality
481/// aspects. The InnerLoopVectorizer relies on the
482/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
484class InnerLoopVectorizer {
485public:
486 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
487 LoopInfo *LI, DominatorTree *DT,
488 const TargetLibraryInfo *TLI,
489 const TargetTransformInfo *TTI, AssumptionCache *AC,
490 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
491 ElementCount MinProfitableTripCount,
492 unsigned UnrollFactor, LoopVectorizationCostModel *CM,
493 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
494 GeneratedRTChecks &RTChecks, VPlan &Plan)
495 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
496 AC(AC), ORE(ORE), VF(VecWidth),
497 MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
498 Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
499 RTChecks(RTChecks), Plan(Plan),
500 VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {}
501
502 virtual ~InnerLoopVectorizer() = default;
503
504 /// Create a new empty loop that will contain vectorized instructions later
505 /// on, while the old loop will be used as the scalar remainder. Control flow
506 /// is generated around the vectorized (and scalar epilogue) loops consisting
507 /// of various checks and bypasses. Return the pre-header block of the new
  /// loop. In the case of epilogue vectorization, this function is overridden
  /// to
509 /// handle the more complex control flow around the loops.
510 virtual BasicBlock *createVectorizedLoopSkeleton();
511
512 /// Fix the vectorized code, taking care of header phi's, and more.
513 void fixVectorizedLoop(VPTransformState &State);
514
515 /// Fix the non-induction PHIs in \p Plan.
516 void fixNonInductionPHIs(VPTransformState &State);
517
518 /// Returns the original loop trip count.
519 Value *getTripCount() const { return TripCount; }
520
521 /// Used to set the trip count after ILV's construction and after the
522 /// preheader block has been executed. Note that this always holds the trip
523 /// count of the original loop for both main loop and epilogue vectorization.
524 void setTripCount(Value *TC) { TripCount = TC; }
525
526 /// Return the additional bypass block which targets the scalar loop by
527 /// skipping the epilogue loop after completing the main loop.
528 BasicBlock *getAdditionalBypassBlock() const {
529 assert(AdditionalBypassBlock &&
530 "Trying to access AdditionalBypassBlock but it has not been set");
531 return AdditionalBypassBlock;
532 }
533
534protected:
535 friend class LoopVectorizationPlanner;
536
537 /// Returns (and creates if needed) the trip count of the widened loop.
538 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
539
540 // Create a check to see if the vector loop should be executed
541 Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
542
543 /// Emit a bypass check to see if the vector trip count is zero, including if
544 /// it overflows.
545 void emitIterationCountCheck(BasicBlock *Bypass);
546
547 /// Emit a bypass check to see if all of the SCEV assumptions we've
548 /// had to make are correct. Returns the block containing the checks or
549 /// nullptr if no checks have been added.
550 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
551
552 /// Emit bypass checks to check any memory assumptions we may have made.
553 /// Returns the block containing the checks or nullptr if no checks have been
554 /// added.
555 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
556
557 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
558 /// vector loop preheader, middle block and scalar preheader.
559 void createVectorLoopSkeleton(StringRef Prefix);
560
561 /// Allow subclasses to override and print debug traces before/after vplan
562 /// execution, when trace information is requested.
563 virtual void printDebugTracesAtStart() {}
564 virtual void printDebugTracesAtEnd() {}
565
566 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
567 /// vector preheader and its predecessor, also connecting the new block to the
568 /// scalar preheader.
569 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
570
571 /// The original loop.
572 Loop *OrigLoop;
573
574 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
575 /// dynamic knowledge to simplify SCEV expressions and converts them to a
576 /// more usable form.
577 PredicatedScalarEvolution &PSE;
578
579 /// Loop Info.
580 LoopInfo *LI;
581
582 /// Dominator Tree.
583 DominatorTree *DT;
584
585 /// Target Library Info.
586 const TargetLibraryInfo *TLI;
587
588 /// Target Transform Info.
589 const TargetTransformInfo *TTI;
590
591 /// Assumption Cache.
592 AssumptionCache *AC;
593
594 /// Interface to emit optimization remarks.
595 OptimizationRemarkEmitter *ORE;
596
597 /// The vectorization SIMD factor to use. Each vector will have this many
598 /// vector elements.
599 ElementCount VF;
600
601 ElementCount MinProfitableTripCount;
602
603 /// The vectorization unroll factor to use. Each scalar is vectorized to this
604 /// many different vector instructions.
605 unsigned UF;
606
607 /// The builder that we use
608 IRBuilder<> Builder;
609
610 // --- Vectorization state ---
611
612 /// The vector-loop preheader.
613 BasicBlock *LoopVectorPreHeader = nullptr;
614
615 /// The scalar-loop preheader.
616 BasicBlock *LoopScalarPreHeader = nullptr;
617
618 /// Middle Block between the vector and the scalar.
619 BasicBlock *LoopMiddleBlock = nullptr;
620
621 /// Trip count of the original loop.
622 Value *TripCount = nullptr;
623
624 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
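  /// For example, with an original trip count of 1003, VF = 4 and UF = 2, this
  /// is 1003 - (1003 % 8) = 1000 (illustrative; when the tail is folded by
  /// masking, the count is rounded up instead and no remainder is left).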
625 Value *VectorTripCount = nullptr;
626
  /// The profitability analysis.
628 LoopVectorizationCostModel *Cost;
629
630 /// BFI and PSI are used to check for profile guided size optimizations.
631 BlockFrequencyInfo *BFI;
632 ProfileSummaryInfo *PSI;
633
634 /// Structure to hold information about generated runtime checks, responsible
635 /// for cleaning the checks, if vectorization turns out unprofitable.
636 GeneratedRTChecks &RTChecks;
637
638 /// The additional bypass block which conditionally skips over the epilogue
639 /// loop after executing the main loop. Needed to resume inductions and
640 /// reductions during epilogue vectorization.
641 BasicBlock *AdditionalBypassBlock = nullptr;
642
643 VPlan &Plan;
644
645 /// The vector preheader block of \p Plan, used as target for check blocks
646 /// introduced during skeleton creation.
647 VPBlockBase *VectorPHVPB;
648};
649
650/// Encapsulate information regarding vectorization of a loop and its epilogue.
651/// This information is meant to be updated and used across two stages of
652/// epilogue vectorization.
653struct EpilogueLoopVectorizationInfo {
654 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
655 unsigned MainLoopUF = 0;
656 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
657 unsigned EpilogueUF = 0;
658 BasicBlock *MainLoopIterationCountCheck = nullptr;
659 BasicBlock *EpilogueIterationCountCheck = nullptr;
660 BasicBlock *SCEVSafetyCheck = nullptr;
661 BasicBlock *MemSafetyCheck = nullptr;
662 Value *TripCount = nullptr;
663 Value *VectorTripCount = nullptr;
664 VPlan &EpiloguePlan;
665
666 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
667 ElementCount EVF, unsigned EUF,
668 VPlan &EpiloguePlan)
669 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
670 EpiloguePlan(EpiloguePlan) {
671 assert(EUF == 1 &&
672 "A high UF for the epilogue loop is likely not beneficial.");
673 }
674};
675
676/// An extension of the inner loop vectorizer that creates a skeleton for a
677/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the VPlan on a given loop twice: first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
681/// deriving two concrete strategy classes from this base class and invoking
682/// them in succession from the loop vectorizer planner.
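///
/// Roughly, the resulting control flow is (sketch only):
///
///   [main iteration count check] -> [main vector loop, large VF]
///     -> [epilogue iteration count check] -> [epilogue vector loop, small VF]
///     -> [scalar remainder loop]
///
/// with bypass edges from each check that skip the corresponding vector loop.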
683class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
684public:
685 InnerLoopAndEpilogueVectorizer(
686 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
687 DominatorTree *DT, const TargetLibraryInfo *TLI,
688 const TargetTransformInfo *TTI, AssumptionCache *AC,
689 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
690 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
691 ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
692 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
693 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
694 BFI, PSI, Checks, Plan),
695 EPI(EPI) {}
696
697 // Override this function to handle the more complex control flow around the
698 // three loops.
699 BasicBlock *createVectorizedLoopSkeleton() final {
700 return createEpilogueVectorizedLoopSkeleton();
701 }
702
703 /// The interface for creating a vectorized skeleton using one of two
704 /// different strategies, each corresponding to one execution of the vplan
705 /// as described above.
706 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
707
708 /// Holds and updates state information required to vectorize the main loop
709 /// and its epilogue in two separate passes. This setup helps us avoid
710 /// regenerating and recomputing runtime safety checks. It also helps us to
711 /// shorten the iteration-count-check path length for the cases where the
712 /// iteration count of the loop is so small that the main vector loop is
713 /// completely skipped.
714 EpilogueLoopVectorizationInfo &EPI;
715};
716
717/// A specialized derived class of inner loop vectorizer that performs
718/// vectorization of *main* loops in the process of vectorizing loops and their
719/// epilogues.
720class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
721public:
722 EpilogueVectorizerMainLoop(
723 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
724 DominatorTree *DT, const TargetLibraryInfo *TLI,
725 const TargetTransformInfo *TTI, AssumptionCache *AC,
726 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
727 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
728 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
729 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
730 EPI, CM, BFI, PSI, Check, Plan) {}
731 /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of VPlan execution).
733 BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
734
735protected:
736 /// Emits an iteration count bypass check once for the main loop (when \p
737 /// ForEpilogue is false) and once for the epilogue loop (when \p
738 /// ForEpilogue is true).
739 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
740 void printDebugTracesAtStart() override;
741 void printDebugTracesAtEnd() override;
742};
743
744// A specialized derived class of inner loop vectorizer that performs
745// vectorization of *epilogue* loops in the process of vectorizing loops and
746// their epilogues.
747class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
748public:
749 EpilogueVectorizerEpilogueLoop(
750 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
751 DominatorTree *DT, const TargetLibraryInfo *TLI,
752 const TargetTransformInfo *TTI, AssumptionCache *AC,
753 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
754 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
755 ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
756 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
757 EPI, CM, BFI, PSI, Checks, Plan) {
758 TripCount = EPI.TripCount;
759 }
760 /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of VPlan execution).
762 BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
763
764protected:
765 /// Emits an iteration count bypass check after the main vector loop has
766 /// finished to see if there are any iterations left to execute by either
767 /// the vector epilogue or the scalar epilogue.
768 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
769 BasicBlock *Bypass,
770 BasicBlock *Insert);
771 void printDebugTracesAtStart() override;
772 void printDebugTracesAtEnd() override;
773};
774} // end namespace llvm
775
776/// Look for a meaningful debug location on the instruction or its operands.
777static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
778 if (!I)
779 return DebugLoc::getUnknown();
780
781 DebugLoc Empty;
782 if (I->getDebugLoc() != Empty)
783 return I->getDebugLoc();
784
785 for (Use &Op : I->operands()) {
786 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
787 if (OpInst->getDebugLoc() != Empty)
788 return OpInst->getDebugLoc();
789 }
790
791 return I->getDebugLoc();
792}
793
794/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
795/// is passed, the message relates to that particular instruction.
796#ifndef NDEBUG
797static void debugVectorizationMessage(const StringRef Prefix,
798 const StringRef DebugMsg,
799 Instruction *I) {
800 dbgs() << "LV: " << Prefix << DebugMsg;
801 if (I != nullptr)
802 dbgs() << " " << *I;
803 else
804 dbgs() << '.';
805 dbgs() << '\n';
806}
807#endif
808
809/// Create an analysis remark that explains why vectorization failed
810///
811/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
812/// RemarkName is the identifier for the remark. If \p I is passed it is an
813/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
814/// the location of the remark. If \p DL is passed, use it as debug location for
815/// the remark. \return the remark object that can be streamed to.
816static OptimizationRemarkAnalysis
817createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
818 Instruction *I, DebugLoc DL = {}) {
819 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
820 // If debug location is attached to the instruction, use it. Otherwise if DL
821 // was not provided, use the loop's.
822 if (I && I->getDebugLoc())
823 DL = I->getDebugLoc();
824 else if (!DL)
825 DL = TheLoop->getStartLoc();
826
827 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
828}
829
830namespace llvm {
831
832/// Return a value for Step multiplied by VF.
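/// For example (illustrative): for a fixed VF of 4 and a Step of 2 this is the
/// constant 8; for a scalable VF of vscale x 4 and a Step of 2 it is the
/// runtime value vscale * 8.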
833Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
834 int64_t Step) {
835 assert(Ty->isIntegerTy() && "Expected an integer step");
836 return B.CreateElementCount(Ty, EC: VF.multiplyCoefficientBy(RHS: Step));
837}
838
839/// Return the runtime value for VF.
840Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
841 return B.CreateElementCount(Ty, EC: VF);
842}
843
844void reportVectorizationFailure(const StringRef DebugMsg,
845 const StringRef OREMsg, const StringRef ORETag,
846 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
847 Instruction *I) {
848 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
849 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
850 ORE->emit(
851 OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
852 << "loop not vectorized: " << OREMsg);
853}
854
855/// Reports an informative message: print \p Msg for debugging purposes as well
856/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
859static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
860 OptimizationRemarkEmitter *ORE,
861 Loop *TheLoop, Instruction *I = nullptr,
862 DebugLoc DL = {}) {
863 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
864 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
865 ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
866 I, DL)
867 << Msg);
868}
869
870/// Report successful vectorization of the loop. In case an outer loop is
871/// vectorized, prepend "outer" to the vectorization remark.
872static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
873 VectorizationFactor VF, unsigned IC) {
874 LLVM_DEBUG(debugVectorizationMessage(
875 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
876 nullptr));
877 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
878 ORE->emit(RemarkBuilder: [&]() {
879 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
880 TheLoop->getHeader())
881 << "vectorized " << LoopType << "loop (vectorization width: "
882 << ore::NV("VectorizationFactor", VF.Width)
883 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
884 });
885}
886
887} // end namespace llvm
888
889namespace llvm {
890
// Hints to the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
893enum ScalarEpilogueLowering {
894
895 // The default: allowing scalar epilogues.
896 CM_ScalarEpilogueAllowed,
897
898 // Vectorization with OptForSize: don't allow epilogues.
899 CM_ScalarEpilogueNotAllowedOptSize,
900
  // A special case of vectorization with OptForSize: loops with a very small
902 // trip count are considered for vectorization under OptForSize, thereby
903 // making sure the cost of their loop body is dominant, free of runtime
904 // guards and scalar iteration overheads.
905 CM_ScalarEpilogueNotAllowedLowTripLoop,
906
907 // Loop hint predicate indicating an epilogue is undesired.
908 CM_ScalarEpilogueNotNeededUsePredicate,
909
910 // Directive indicating we must either tail fold or not vectorize
911 CM_ScalarEpilogueNotAllowedUsePredicate
912};
913
914/// LoopVectorizationCostModel - estimates the expected speedups due to
915/// vectorization.
916/// In many cases vectorization is not profitable. This can happen because of
917/// a number of reasons. In this class we mainly attempt to predict the
918/// expected speedup/slowdowns due to the supported instruction set. We use the
919/// TargetTransformInfo to query the different backends for the cost of
920/// different operations.
921class LoopVectorizationCostModel {
922 friend class LoopVectorizationPlanner;
923
924public:
925 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
926 PredicatedScalarEvolution &PSE, LoopInfo *LI,
927 LoopVectorizationLegality *Legal,
928 const TargetTransformInfo &TTI,
929 const TargetLibraryInfo *TLI, DemandedBits *DB,
930 AssumptionCache *AC,
931 OptimizationRemarkEmitter *ORE, const Function *F,
932 const LoopVectorizeHints *Hints,
933 InterleavedAccessInfo &IAI,
934 ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
935 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
936 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
937 Hints(Hints), InterleaveInfo(IAI) {
938 if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
939 initializeVScaleForTuning();
940 CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
941 // Query this against the original loop and save it here because the profile
942 // of the original loop header may change as the transformation happens.
943 OptForSize = llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
944 QueryType: PGSOQueryType::IRPass);
945 }
946
947 /// \return An upper bound for the vectorization factors (both fixed and
948 /// scalable). If the factors are 0, vectorization and interleaving should be
949 /// avoided up front.
950 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
951
952 /// \return True if runtime checks are required for vectorization, and false
953 /// otherwise.
954 bool runtimeChecksRequired();
955
956 /// Setup cost-based decisions for user vectorization factor.
957 /// \return true if the UserVF is a feasible VF to be chosen.
958 bool selectUserVectorizationFactor(ElementCount UserVF) {
959 collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
960 return expectedCost(VF: UserVF).isValid();
961 }
962
963 /// \return True if maximizing vector bandwidth is enabled by the target or
964 /// user options, for the given register kind.
965 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
966
967 /// \return True if maximizing vector bandwidth is enabled by the target or
968 /// user options, for the given vector factor.
969 bool useMaxBandwidth(ElementCount VF);
970
971 /// \return The size (in bits) of the smallest and widest types in the code
972 /// that needs to be vectorized. We ignore values that remain scalar such as
973 /// 64 bit loop indices.
974 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
975
976 /// \return The desired interleave count.
977 /// If interleave count has been specified by metadata it will be returned.
978 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
979 /// are the selected vectorization factor and the cost of the selected VF.
980 unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
981 InstructionCost LoopCost);
982
983 /// Memory access instruction may be vectorized in more than one way.
984 /// Form of instruction after vectorization depends on cost.
985 /// This function takes cost-based decisions for Load/Store instructions
986 /// and collects them in a map. This decisions map is used for building
987 /// the lists of loop-uniform and loop-scalar instructions.
988 /// The calculated cost is saved with widening decision in order to
989 /// avoid redundant calculations.
990 void setCostBasedWideningDecision(ElementCount VF);
991
992 /// A call may be vectorized in different ways depending on whether we have
993 /// vectorized variants available and whether the target supports masking.
994 /// This function analyzes all calls in the function at the supplied VF,
995 /// makes a decision based on the costs of available options, and stores that
996 /// decision in a map for use in planning and plan execution.
997 void setVectorizedCallDecision(ElementCount VF);
998
999 /// Collect values we want to ignore in the cost model.
1000 void collectValuesToIgnore();
1001
1002 /// Collect all element types in the loop for which widening is needed.
1003 void collectElementTypesForWidening();
1004
1005 /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
1007 void collectInLoopReductions();
1008
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and the
  /// loop hints do not allow reordering of FP operations.
1013 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1014 return !Hints->allowReordering() && RdxDesc.isOrdered();
1015 }
1016
1017 /// \returns The smallest bitwidth each instruction can be represented with.
1018 /// The vector equivalents of these instructions should be truncated to this
1019 /// type.
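  ///
  /// For example (illustrative): if i8 values are loaded, zero-extended to
  /// i32, added, and the result truncated back to i8 before being stored, the
  /// vector add only needs i8 lanes, so MinBWs would record 8 for it.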
1020 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1021 return MinBWs;
1022 }
1023
1024 /// \returns True if it is more profitable to scalarize instruction \p I for
1025 /// vectorization factor \p VF.
1026 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1027 assert(VF.isVector() &&
1028 "Profitable to scalarize relevant only for VF > 1.");
1029 assert(
1030 TheLoop->isInnermost() &&
1031 "cost-model should not be used for outer loops (in VPlan-native path)");
1032
1033 auto Scalars = InstsToScalarize.find(Val: VF);
1034 assert(Scalars != InstsToScalarize.end() &&
1035 "VF not yet analyzed for scalarization profitability");
1036 return Scalars->second.contains(Val: I);
1037 }
1038
1039 /// Returns true if \p I is known to be uniform after vectorization.
1040 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1041 assert(
1042 TheLoop->isInnermost() &&
1043 "cost-model should not be used for outer loops (in VPlan-native path)");
1044 // Pseudo probe needs to be duplicated for each unrolled iteration and
1045 // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
1047 if (isa<PseudoProbeInst>(Val: I))
1048 return false;
1049
1050 if (VF.isScalar())
1051 return true;
1052
1053 auto UniformsPerVF = Uniforms.find(Val: VF);
1054 assert(UniformsPerVF != Uniforms.end() &&
1055 "VF not yet analyzed for uniformity");
1056 return UniformsPerVF->second.count(Ptr: I);
1057 }
1058
1059 /// Returns true if \p I is known to be scalar after vectorization.
1060 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1061 assert(
1062 TheLoop->isInnermost() &&
1063 "cost-model should not be used for outer loops (in VPlan-native path)");
1064 if (VF.isScalar())
1065 return true;
1066
1067 auto ScalarsPerVF = Scalars.find(Val: VF);
1068 assert(ScalarsPerVF != Scalars.end() &&
1069 "Scalar values are not calculated for VF");
1070 return ScalarsPerVF->second.count(Ptr: I);
1071 }
1072
1073 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1074 /// for vectorization factor \p VF.
1075 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1076 return VF.isVector() && MinBWs.contains(Key: I) &&
1077 !isProfitableToScalarize(I, VF) &&
1078 !isScalarAfterVectorization(I, VF);
1079 }
1080
1081 /// Decision that was taken during cost calculation for memory instruction.
1082 enum InstWidening {
1083 CM_Unknown,
1084 CM_Widen, // For consecutive accesses with stride +1.
1085 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1086 CM_Interleave,
1087 CM_GatherScatter,
1088 CM_Scalarize,
1089 CM_VectorCall,
1090 CM_IntrinsicCall
1091 };
1092
1093 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1094 /// instruction \p I and vector width \p VF.
1095 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1096 InstructionCost Cost) {
1097 assert(VF.isVector() && "Expected VF >=2");
1098 WideningDecisions[{I, VF}] = {W, Cost};
1099 }
1100
1101 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1102 /// interleaving group \p Grp and vector width \p VF.
1103 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1104 ElementCount VF, InstWidening W,
1105 InstructionCost Cost) {
1106 assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// When interleaving, the cost will only be assigned to one instruction,
    /// the insert position. For other cases, add the appropriate fraction of
    /// the total cost to each instruction. This ensures accurate costs are
    /// used, even if the insert position instruction is not used.
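    /// For example (illustrative): for a fully populated group with factor 4
    /// and a total cost of 8, CM_Interleave assigns 8 to the insert position
    /// and 0 to the other members, while any other decision assigns 2 to each.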
1112 InstructionCost InsertPosCost = Cost;
1113 InstructionCost OtherMemberCost = 0;
1114 if (W != CM_Interleave)
1115 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1117 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1118 if (auto *I = Grp->getMember(Index: Idx)) {
1119 if (Grp->getInsertPos() == I)
1120 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1121 else
1122 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1123 }
1124 }
1125 }
1126
1127 /// Return the cost model decision for the given instruction \p I and vector
1128 /// width \p VF. Return CM_Unknown if this instruction did not pass
1129 /// through the cost modeling.
1130 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1131 assert(VF.isVector() && "Expected VF to be a vector VF");
1132 assert(
1133 TheLoop->isInnermost() &&
1134 "cost-model should not be used for outer loops (in VPlan-native path)");
1135
1136 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1137 auto Itr = WideningDecisions.find(Val: InstOnVF);
1138 if (Itr == WideningDecisions.end())
1139 return CM_Unknown;
1140 return Itr->second.first;
1141 }
1142
1143 /// Return the vectorization cost for the given instruction \p I and vector
1144 /// width \p VF.
1145 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1146 assert(VF.isVector() && "Expected VF >=2");
1147 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1148 assert(WideningDecisions.contains(InstOnVF) &&
1149 "The cost is not calculated");
1150 return WideningDecisions[InstOnVF].second;
1151 }
1152
1153 struct CallWideningDecision {
1154 InstWidening Kind;
1155 Function *Variant;
1156 Intrinsic::ID IID;
1157 std::optional<unsigned> MaskPos;
1158 InstructionCost Cost;
1159 };
1160
1161 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1162 Function *Variant, Intrinsic::ID IID,
1163 std::optional<unsigned> MaskPos,
1164 InstructionCost Cost) {
1165 assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {Kind, Variant, IID, MaskPos, Cost};
1167 }
1168
1169 CallWideningDecision getCallWideningDecision(CallInst *CI,
1170 ElementCount VF) const {
1171 assert(!VF.isScalar() && "Expected vector VF");
1172 return CallWideningDecisions.at(Val: {CI, VF});
1173 }
1174
1175 /// Return True if instruction \p I is an optimizable truncate whose operand
1176 /// is an induction variable. Such a truncate will be removed by adding a new
1177 /// induction variable with the destination type.
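  ///
  /// For example (illustrative): for an i64 induction %iv whose relevant use
  /// is "%t = trunc i64 %iv to i32", the truncate can be removed by creating a
  /// separate i32 induction that produces %t directly.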
1178 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1179 // If the instruction is not a truncate, return false.
1180 auto *Trunc = dyn_cast<TruncInst>(Val: I);
1181 if (!Trunc)
1182 return false;
1183
1184 // Get the source and destination types of the truncate.
1185 Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
1186 Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);
1187
1188 // If the truncate is free for the given types, return false. Replacing a
1189 // free truncate with an induction variable would add an induction variable
1190 // update instruction to each iteration of the loop. We exclude from this
1191 // check the primary induction variable since it will need an update
1192 // instruction regardless.
1193 Value *Op = Trunc->getOperand(i_nocapture: 0);
1194 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
1195 return false;
1196
1197 // If the truncated value is not an induction variable, return false.
1198 return Legal->isInductionPhi(V: Op);
1199 }
1200
1201 /// Collects the instructions to scalarize for each predicated instruction in
1202 /// the loop.
1203 void collectInstsToScalarize(ElementCount VF);
1204
1205 /// Collect values that will not be widened, including Uniforms, Scalars, and
1206 /// Instructions to Scalarize for the given \p VF.
1207 /// The sets depend on CM decision for Load/Store instructions
1208 /// that may be vectorized as interleave, gather-scatter or scalarized.
1209 /// Also make a decision on what to do about call instructions in the loop
1210 /// at that VF -- scalarize, call a known vector routine, or call a
1211 /// vector intrinsic.
1212 void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
1213 // Do the analysis once.
1214 if (VF.isScalar() || Uniforms.contains(Val: VF))
1215 return;
1216 setCostBasedWideningDecision(VF);
1217 collectLoopUniforms(VF);
1218 setVectorizedCallDecision(VF);
1219 collectLoopScalars(VF);
1220 collectInstsToScalarize(VF);
1221 }
1222
1223 /// Returns true if the target machine supports masked store operation
1224 /// for the given \p DataType and kind of access to \p Ptr.
1225 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1226 unsigned AddressSpace) const {
1227 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1228 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1229 }
1230
1231 /// Returns true if the target machine supports masked load operation
1232 /// for the given \p DataType and kind of access to \p Ptr.
1233 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1234 unsigned AddressSpace) const {
1235 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1236 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1237 }
1238
1239 /// Returns true if the target machine can represent \p V as a masked gather
1240 /// or scatter operation.
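  ///
  /// For example (illustrative): a load of A[B[i]] has no consecutive stride
  /// and can be widened as an llvm.masked.gather of the per-lane addresses
  /// &A[B[i]] only if the target reports such gathers as legal; otherwise it
  /// has to be scalarized.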
1241 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1242 bool LI = isa<LoadInst>(Val: V);
1243 bool SI = isa<StoreInst>(Val: V);
1244 if (!LI && !SI)
1245 return false;
1246 auto *Ty = getLoadStoreType(I: V);
1247 Align Align = getLoadStoreAlignment(I: V);
1248 if (VF.isVector())
1249 Ty = VectorType::get(ElementType: Ty, EC: VF);
1250 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1251 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1252 }
1253
1254 /// Returns true if the target machine supports all of the reduction
1255 /// variables found for the given VF.
1256 bool canVectorizeReductions(ElementCount VF) const {
1257 return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
1258 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1259 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1260 }));
1261 }
1262
1263 /// Given costs for both strategies, return true if the scalar predication
1264 /// lowering should be used for div/rem. This incorporates an override
1265 /// option so it is not simply a cost comparison.
1266 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1267 InstructionCost SafeDivisorCost) const {
1268 switch (ForceSafeDivisor) {
1269 case cl::BOU_UNSET:
1270 return ScalarCost < SafeDivisorCost;
1271 case cl::BOU_TRUE:
1272 return false;
1273 case cl::BOU_FALSE:
1274 return true;
1275 }
1276 llvm_unreachable("impossible case value");
1277 }
1278
1279 /// Returns true if \p I is an instruction which requires predication and
1280 /// for which our chosen predication strategy is scalarization (i.e. we
1281 /// don't have an alternate strategy such as masking available).
1282 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1283 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1284
1285 /// Returns true if \p I is an instruction that needs to be predicated
1286 /// at runtime. The result is independent of the predication mechanism.
1287 /// Superset of instructions that return true for isScalarWithPredication.
1288 bool isPredicatedInst(Instruction *I) const;
1289
1290 /// Return the costs for our two available strategies for lowering a
1291 /// div/rem operation which requires speculating at least one lane.
1292 /// First result is for scalarization (will be invalid for scalable
1293 /// vectors); second is for the safe-divisor strategy.
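  ///
  /// For example (illustrative), for a udiv whose divisor may be zero in
  /// masked-off lanes, the safe-divisor strategy widens it roughly as
  ///   %d = select <mask>, %divisor, <splat of 1>
  ///   %q = udiv %numerator, %d
  /// while scalarization extracts each active lane, performs a scalar divide
  /// in a predicated block, and re-inserts the result.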
1294 std::pair<InstructionCost, InstructionCost>
1295 getDivRemSpeculationCost(Instruction *I,
1296 ElementCount VF) const;
1297
1298 /// Returns true if \p I is a memory instruction with consecutive memory
1299 /// access that can be widened.
1300 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1301
1302 /// Returns true if \p I is a memory instruction in an interleaved-group
1303 /// of memory accesses that can be vectorized with wide vector loads/stores
1304 /// and shuffles.
1305 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1306
1307 /// Check if \p Instr belongs to any interleaved access group.
1308 bool isAccessInterleaved(Instruction *Instr) const {
1309 return InterleaveInfo.isInterleaved(Instr);
1310 }
1311
1312 /// Get the interleaved access group that \p Instr belongs to.
1313 const InterleaveGroup<Instruction> *
1314 getInterleavedAccessGroup(Instruction *Instr) const {
1315 return InterleaveInfo.getInterleaveGroup(Instr);
1316 }
1317
1318 /// Returns true if we're required to use a scalar epilogue for at least
1319 /// the final iteration of the original loop.
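/// For example (illustrative): if the loop's only exit branch sits in the
/// middle of the body rather than in the latch, and early-exit vectorization
/// is not enabled, the exiting iteration must be executed in the scalar loop.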
1320 bool requiresScalarEpilogue(bool IsVectorizing) const {
1321 if (!isScalarEpilogueAllowed()) {
1322 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1323 return false;
1324 }
1325 // If we might exit from anywhere but the latch and early exit vectorization
1326 // is disabled, we must run the exiting iteration in scalar form.
1327 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1328 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1329 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1330 "from latch block\n");
1331 return true;
1332 }
1333 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1334 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1335 "interleaved group requires scalar epilogue\n");
1336 return true;
1337 }
1338 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1339 return false;
1340 }
1341
1342 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1343 /// loop hint annotation.
1344 bool isScalarEpilogueAllowed() const {
1345 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1346 }
1347
1348 /// Returns the TailFoldingStyle that is best for the current loop.
1349 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1350 if (!ChosenTailFoldingStyle)
1351 return TailFoldingStyle::None;
1352 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1353 : ChosenTailFoldingStyle->second;
1354 }
1355
/// Selects and saves the TailFoldingStyle for both cases: when the IV update
/// may overflow and when it may not.
/// \param IsScalableVF True if scalable vector factors are enabled.
/// \param UserIC User-specified interleave count.
1360 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1361 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1362 if (!Legal->canFoldTailByMasking()) {
1363 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1364 return;
1365 }
1366
1367 // Default to TTI preference, but allow command line override.
1368 ChosenTailFoldingStyle = {
1369 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1370 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1371 if (ForceTailFoldingStyle.getNumOccurrences())
1372 ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1373 ForceTailFoldingStyle.getValue()};
1374
1375 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1376 return;
1377 // Override forced styles if needed.
1378 // FIXME: Investigate opportunity for fixed vector factor.
1379 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1380 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1381 if (EVLIsLegal)
1382 return;
1383 // If for some reason EVL mode is unsupported, fallback to
1384 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1385 // in a generic way.
1386 ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1387 TailFoldingStyle::DataWithoutLaneMask};
1388 LLVM_DEBUG(
1389 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1390 "not try to generate VP Intrinsics "
1391 << (UserIC > 1
1392 ? "since interleave count specified is greater than 1.\n"
1393 : "due to non-interleaving reasons.\n"));
1394 }
1395
1396 /// Returns true if all loop blocks should be masked to fold tail loop.
1397 bool foldTailByMasking() const {
1398 // TODO: check if it is possible to check for None style independent of
1399 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1400 return getTailFoldingStyle() != TailFoldingStyle::None;
1401 }
1402
/// Return the maximum safe number of elements to be processed per vector
/// iteration, i.e. a count that does not prevent store-load forwarding and is
/// safe with regard to the memory dependencies. Required for EVL-based VPlans
/// to correctly calculate AVL (application vector length) as min(remaining
/// AVL, MaxSafeElements).
/// TODO: need to consider adjusting the cost model to use this value as the
/// vectorization factor for EVL-based vectorization.
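/// Illustrative example (assumed values, not from the source): with
/// MaxSafeElements = 8 and 13 elements of the trip count remaining, an
/// EVL-based plan would clamp the next AVL to min(13, 8) = 8 and process the
/// remaining 5 elements in the following iteration.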
1410 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1411
/// Returns true if the instructions in this block require predication
1413 /// for any reason, e.g. because tail folding now requires a predicate
1414 /// or because the block in the original loop was predicated.
1415 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1416 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1417 }
1418
1419 /// Returns true if VP intrinsics with explicit vector length support should
1420 /// be generated in the tail folded loop.
1421 bool foldTailWithEVL() const {
1422 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1423 }
1424
1425 /// Returns true if the Phi is part of an inloop reduction.
1426 bool isInLoopReduction(PHINode *Phi) const {
1427 return InLoopReductions.contains(Ptr: Phi);
1428 }
1429
1430 /// Returns true if the predicated reduction select should be used to set the
1431 /// incoming value for the reduction phi.
1432 bool usePredicatedReductionSelect() const {
1433 // Force to use predicated reduction select since the EVL of the
1434 // second-to-last iteration might not be VF*UF.
1435 if (foldTailWithEVL())
1436 return true;
1437 return PreferPredicatedReductionSelect ||
1438 TTI.preferPredicatedReductionSelect();
1439 }
1440
1441 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1442 /// with factor VF. Return the cost of the instruction, including
1443 /// scalarization overhead if it's needed.
1444 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1445
1446 /// Estimate cost of a call instruction CI if it were vectorized with factor
1447 /// VF. Return the cost of the instruction, including scalarization overhead
1448 /// if it's needed.
1449 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1450
1451 /// Invalidates decisions already taken by the cost model.
1452 void invalidateCostModelingDecisions() {
1453 WideningDecisions.clear();
1454 CallWideningDecisions.clear();
1455 Uniforms.clear();
1456 Scalars.clear();
1457 }
1458
1459 /// Returns the expected execution cost. The unit of the cost does
1460 /// not matter because we use the 'cost' units to compare different
1461 /// vector widths. The cost that is returned is *not* normalized by
1462 /// the factor width.
1463 InstructionCost expectedCost(ElementCount VF);
1464
1465 bool hasPredStores() const { return NumPredStores > 0; }
1466
1467 /// Returns true if epilogue vectorization is considered profitable, and
1468 /// false otherwise.
1469 /// \p VF is the vectorization factor chosen for the original loop.
/// \p IC is the interleave count, used as an additional scaling factor
/// applied to VF before comparing to EpilogueVectorizationMinVF.
1472 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1473 const unsigned IC) const;
1474
1475 /// Returns the execution time cost of an instruction for a given vector
1476 /// width. Vector width of one means scalar.
1477 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1478
1479 /// Return the cost of instructions in an inloop reduction pattern, if I is
1480 /// part of that pattern.
1481 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1482 ElementCount VF,
1483 Type *VectorTy) const;
1484
1485 /// Returns true if \p Op should be considered invariant and if it is
1486 /// trivially hoistable.
1487 bool shouldConsiderInvariant(Value *Op);
1488
1489 /// Return the value of vscale used for tuning the cost model.
1490 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1491
1492private:
1493 unsigned NumPredStores = 0;
1494
1495 /// Used to store the value of vscale used for tuning the cost model. It is
1496 /// initialized during object construction.
1497 std::optional<unsigned> VScaleForTuning;
1498
/// Initializes the value of vscale used for tuning the cost model. If
/// vscale_range.min == vscale_range.max then use vscale_range.max, else use
/// the value returned by the corresponding TTI method.
1502 void initializeVScaleForTuning() {
1503 const Function *Fn = TheLoop->getHeader()->getParent();
1504 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1505 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1506 auto Min = Attr.getVScaleRangeMin();
1507 auto Max = Attr.getVScaleRangeMax();
1508 if (Max && Min == Max) {
1509 VScaleForTuning = Max;
1510 return;
1511 }
1512 }
1513
1514 VScaleForTuning = TTI.getVScaleForTuning();
1515 }
1516
1517 /// \return An upper bound for the vectorization factors for both
1518 /// fixed and scalable vectorization, where the minimum-known number of
1519 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1520 /// disabled or unsupported, then the scalable part will be equal to
1521 /// ElementCount::getScalable(0).
1522 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1523 ElementCount UserVF,
1524 bool FoldTailByMasking);
1525
/// \return the maximized element count based on the target's vector
1527 /// registers and the loop trip-count, but limited to a maximum safe VF.
1528 /// This is a helper function of computeFeasibleMaxVF.
1529 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1530 unsigned SmallestType,
1531 unsigned WidestType,
1532 ElementCount MaxSafeVF,
1533 bool FoldTailByMasking);
1534
1535 /// Checks if scalable vectorization is supported and enabled. Caches the
1536 /// result to avoid repeated debug dumps for repeated queries.
1537 bool isScalableVectorizationAllowed();
1538
1539 /// \return the maximum legal scalable VF, based on the safe max number
1540 /// of elements.
1541 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1542
1543 /// Calculate vectorization cost of memory instruction \p I.
1544 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1545
/// The cost computation for a scalarized memory instruction.
1547 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1548
/// The cost computation for an interleaving group of memory instructions.
1550 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1551
/// The cost computation for a Gather/Scatter instruction.
1553 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1554
1555 /// The cost computation for widening instruction \p I with consecutive
1556 /// memory access.
1557 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1558
1559 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1560 /// Load: scalar load + broadcast.
1561 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1562 /// element)
1563 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1564
1565 /// Estimate the overhead of scalarizing an instruction. This is a
1566 /// convenience wrapper for the type-based getScalarizationOverhead API.
1567 InstructionCost getScalarizationOverhead(Instruction *I,
1568 ElementCount VF) const;
1569
1570 /// Returns true if an artificially high cost for emulated masked memrefs
1571 /// should be used.
1572 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1573
1574 /// Map of scalar integer values to the smallest bitwidth they can be legally
1575 /// represented as. The vector equivalents of these values should be truncated
1576 /// to this type.
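/// For example (illustrative): an i32 value known to need only 8 bits maps to
/// 8 here, and its widened form is later truncated to <VF x i8>.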
1577 MapVector<Instruction *, uint64_t> MinBWs;
1578
1579 /// A type representing the costs for instructions if they were to be
1580 /// scalarized rather than vectorized. The entries are Instruction-Cost
1581 /// pairs.
1582 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1583
/// A map from VF to the set of BasicBlocks that are known to be present
/// after vectorization as predicated blocks.
1586 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1587 PredicatedBBsAfterVectorization;
1588
1589 /// Records whether it is allowed to have the original scalar loop execute at
1590 /// least once. This may be needed as a fallback loop in case runtime
1591 /// aliasing/dependence checks fail, or to handle the tail/remainder
1592 /// iterations when the trip count is unknown or doesn't divide by the VF,
1593 /// or as a peel-loop to handle gaps in interleave-groups.
1594 /// Under optsize and when the trip count is very small we don't allow any
1595 /// iterations to execute in the scalar loop.
1596 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1597
/// The finally chosen tail folding style. The first element is used if the
/// IV update may overflow; the second element is used if it cannot.
1600 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1601 ChosenTailFoldingStyle;
1602
/// True if scalable vectorization is supported and enabled.
1604 std::optional<bool> IsScalableVectorizationAllowed;
1605
/// Maximum safe number of elements to be processed per vector iteration,
/// i.e. a count that does not prevent store-load forwarding and is safe with
/// regard to the memory dependencies. Required for EVL-based vectorization,
/// where this value is used as the upper bound of the safe AVL.
1610 std::optional<unsigned> MaxSafeElements;
1611
1612 /// A map holding scalar costs for different vectorization factors. The
1613 /// presence of a cost for an instruction in the mapping indicates that the
1614 /// instruction will be scalarized when vectorizing with the associated
1615 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1616 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1617
1618 /// Holds the instructions known to be uniform after vectorization.
1619 /// The data is collected per VF.
1620 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1621
1622 /// Holds the instructions known to be scalar after vectorization.
1623 /// The data is collected per VF.
1624 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1625
1626 /// Holds the instructions (address computations) that are forced to be
1627 /// scalarized.
1628 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1629
1630 /// PHINodes of the reductions that should be expanded in-loop.
1631 SmallPtrSet<PHINode *, 4> InLoopReductions;
1632
1633 /// A Map of inloop reduction operations and their immediate chain operand.
1634 /// FIXME: This can be removed once reductions can be costed correctly in
1635 /// VPlan. This was added to allow quick lookup of the inloop operations.
1636 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1637
1638 /// Returns the expected difference in cost from scalarizing the expression
1639 /// feeding a predicated instruction \p PredInst. The instructions to
1640 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1641 /// non-negative return value implies the expression will be scalarized.
1642 /// Currently, only single-use chains are considered for scalarization.
1643 InstructionCost computePredInstDiscount(Instruction *PredInst,
1644 ScalarCostsTy &ScalarCosts,
1645 ElementCount VF);
1646
1647 /// Collect the instructions that are uniform after vectorization. An
1648 /// instruction is uniform if we represent it with a single scalar value in
1649 /// the vectorized loop corresponding to each vector iteration. Examples of
1650 /// uniform instructions include pointer operands of consecutive or
1651 /// interleaved memory accesses. Note that although uniformity implies an
1652 /// instruction will be scalar, the reverse is not true. In general, a
1653 /// scalarized instruction will be represented by VF scalar values in the
1654 /// vectorized loop, each corresponding to an iteration of the original
1655 /// scalar loop.
1656 void collectLoopUniforms(ElementCount VF);
1657
1658 /// Collect the instructions that are scalar after vectorization. An
1659 /// instruction is scalar if it is known to be uniform or will be scalarized
1660 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1661 /// to the list if they are used by a load/store instruction that is marked as
1662 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1663 /// VF values in the vectorized loop, each corresponding to an iteration of
1664 /// the original scalar loop.
1665 void collectLoopScalars(ElementCount VF);
1666
1667 /// Keeps cost model vectorization decision and cost for instructions.
1668 /// Right now it is used for memory instructions only.
1669 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1670 std::pair<InstWidening, InstructionCost>>;
1671
1672 DecisionList WideningDecisions;
1673
1674 using CallDecisionList =
1675 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1676
1677 CallDecisionList CallWideningDecisions;
1678
1679 /// Returns true if \p V is expected to be vectorized and it needs to be
1680 /// extracted.
1681 bool needsExtract(Value *V, ElementCount VF) const {
1682 Instruction *I = dyn_cast<Instruction>(Val: V);
1683 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1684 TheLoop->isLoopInvariant(V: I) ||
1685 getWideningDecision(I, VF) == CM_Scalarize)
1686 return false;
1687
1688 // Assume we can vectorize V (and hence we need extraction) if the
1689 // scalars are not computed yet. This can happen, because it is called
1690 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1691 // the scalars are collected. That should be a safe assumption in most
1692 // cases, because we check if the operands have vectorizable types
1693 // beforehand in LoopVectorizationLegality.
1694 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1695 };
1696
1697 /// Returns a range containing only operands needing to be extracted.
1698 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1699 ElementCount VF) const {
1700 return SmallVector<Value *, 4>(make_filter_range(
1701 Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1702 }
1703
1704public:
1705 /// The loop that we evaluate.
1706 Loop *TheLoop;
1707
1708 /// Predicated scalar evolution analysis.
1709 PredicatedScalarEvolution &PSE;
1710
1711 /// Loop Info analysis.
1712 LoopInfo *LI;
1713
1714 /// Vectorization legality.
1715 LoopVectorizationLegality *Legal;
1716
1717 /// Vector target information.
1718 const TargetTransformInfo &TTI;
1719
1720 /// Target Library Info.
1721 const TargetLibraryInfo *TLI;
1722
1723 /// Demanded bits analysis.
1724 DemandedBits *DB;
1725
1726 /// Assumption cache.
1727 AssumptionCache *AC;
1728
1729 /// Interface to emit optimization remarks.
1730 OptimizationRemarkEmitter *ORE;
1731
1732 const Function *TheFunction;
1733
1734 /// Loop Vectorize Hint.
1735 const LoopVectorizeHints *Hints;
1736
1737 /// The interleave access information contains groups of interleaved accesses
1738 /// with the same stride and close to each other.
1739 InterleavedAccessInfo &InterleaveInfo;
1740
1741 /// Values to ignore in the cost model.
1742 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1743
1744 /// Values to ignore in the cost model when VF > 1.
1745 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1746
1747 /// All element types found in the loop.
1748 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1749
1750 /// The kind of cost that we are calculating
1751 TTI::TargetCostKind CostKind;
1752
1753 /// Whether this loop should be optimized for size based on function attribute
1754 /// or profile information.
1755 bool OptForSize;
1756};
1757} // end namespace llvm
1758
1759namespace {
1760/// Helper struct to manage generating runtime checks for vectorization.
1761///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow more accurate cost estimation. After deciding to
/// vectorize, the checks are moved back into the IR. If deciding not to
/// vectorize, the temporary blocks are removed completely.
1766class GeneratedRTChecks {
1767 /// Basic block which contains the generated SCEV checks, if any.
1768 BasicBlock *SCEVCheckBlock = nullptr;
1769
1770 /// The value representing the result of the generated SCEV checks. If it is
1771 /// nullptr no SCEV checks have been generated.
1772 Value *SCEVCheckCond = nullptr;
1773
1774 /// Basic block which contains the generated memory runtime checks, if any.
1775 BasicBlock *MemCheckBlock = nullptr;
1776
1777 /// The value representing the result of the generated memory runtime checks.
1778 /// If it is nullptr no memory runtime checks have been generated.
1779 Value *MemRuntimeCheckCond = nullptr;
1780
1781 /// True if any checks have been added.
1782 bool AddedAnyChecks = false;
1783
1784 DominatorTree *DT;
1785 LoopInfo *LI;
1786 TargetTransformInfo *TTI;
1787
1788 SCEVExpander SCEVExp;
1789 SCEVExpander MemCheckExp;
1790
1791 bool CostTooHigh = false;
1792 const bool AddBranchWeights;
1793
1794 Loop *OuterLoop = nullptr;
1795
1796 PredicatedScalarEvolution &PSE;
1797
1798 /// The kind of cost that we are calculating
1799 TTI::TargetCostKind CostKind;
1800
1801public:
1802 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1803 LoopInfo *LI, TargetTransformInfo *TTI,
1804 const DataLayout &DL, bool AddBranchWeights,
1805 TTI::TargetCostKind CostKind)
1806 : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
1807 MemCheckExp(*PSE.getSE(), DL, "scev.check"),
1808 AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1809
1810 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1811 /// accurately estimate the cost of the runtime checks. The blocks are
1812 /// un-linked from the IR and are added back during vector code generation. If
1813 /// there is no vector code generation, the check blocks are removed
1814 /// completely.
1815 void create(Loop *L, const LoopAccessInfo &LAI,
1816 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1817
1818 // Hard cutoff to limit compile-time increase in case a very large number of
1819 // runtime checks needs to be generated.
1820 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1821 // profile info.
1822 CostTooHigh =
1823 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1824 if (CostTooHigh)
1825 return;
1826
1827 BasicBlock *LoopHeader = L->getHeader();
1828 BasicBlock *Preheader = L->getLoopPreheader();
1829
1830 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1831 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1832 // may be used by SCEVExpander. The blocks will be un-linked from their
1833 // predecessors and removed from LI & DT at the end of the function.
1834 if (!UnionPred.isAlwaysTrue()) {
1835 SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
1836 MSSAU: nullptr, BBName: "vector.scevcheck");
1837
1838 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1839 Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
1840 if (isa<Constant>(Val: SCEVCheckCond)) {
1841 // Clean up directly after expanding the predicate to a constant, to
1842 // avoid further expansions re-using anything left over from SCEVExp.
1843 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1844 SCEVCleaner.cleanup();
1845 }
1846 }
1847
1848 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1849 if (RtPtrChecking.Need) {
1850 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1851 MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
1852 BBName: "vector.memcheck");
1853
1854 auto DiffChecks = RtPtrChecking.getDiffChecks();
1855 if (DiffChecks) {
1856 Value *RuntimeVF = nullptr;
1857 MemRuntimeCheckCond = addDiffRuntimeChecks(
1858 Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
1859 GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1860 if (!RuntimeVF)
1861 RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
1862 return RuntimeVF;
1863 },
1864 IC);
1865 } else {
1866 MemRuntimeCheckCond = addRuntimeChecks(
1867 Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
1868 Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
1869 }
1870 assert(MemRuntimeCheckCond &&
1871 "no RT checks generated although RtPtrChecking "
1872 "claimed checks are required");
1873 }
1874
1875 if (!MemCheckBlock && !SCEVCheckBlock)
1876 return;
1877
1878 // Unhook the temporary block with the checks, update various places
1879 // accordingly.
1880 if (SCEVCheckBlock)
1881 SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
1882 if (MemCheckBlock)
1883 MemCheckBlock->replaceAllUsesWith(V: Preheader);
1884
1885 if (SCEVCheckBlock) {
1886 SCEVCheckBlock->getTerminator()->moveBefore(
1887 InsertPos: Preheader->getTerminator()->getIterator());
1888 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1889 UI->setDebugLoc(DebugLoc::getTemporary());
1890 Preheader->getTerminator()->eraseFromParent();
1891 }
1892 if (MemCheckBlock) {
1893 MemCheckBlock->getTerminator()->moveBefore(
1894 InsertPos: Preheader->getTerminator()->getIterator());
1895 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1896 UI->setDebugLoc(DebugLoc::getTemporary());
1897 Preheader->getTerminator()->eraseFromParent();
1898 }
1899
1900 DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
1901 if (MemCheckBlock) {
1902 DT->eraseNode(BB: MemCheckBlock);
1903 LI->removeBlock(BB: MemCheckBlock);
1904 }
1905 if (SCEVCheckBlock) {
1906 DT->eraseNode(BB: SCEVCheckBlock);
1907 LI->removeBlock(BB: SCEVCheckBlock);
1908 }
1909
1910 // Outer loop is used as part of the later cost calculations.
1911 OuterLoop = L->getParentLoop();
1912 }
1913
1914 InstructionCost getCost() {
1915 if (SCEVCheckBlock || MemCheckBlock)
1916 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1917
1918 if (CostTooHigh) {
1919 InstructionCost Cost;
1920 Cost.setInvalid();
1921 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1922 return Cost;
1923 }
1924
1925 InstructionCost RTCheckCost = 0;
1926 if (SCEVCheckBlock)
1927 for (Instruction &I : *SCEVCheckBlock) {
1928 if (SCEVCheckBlock->getTerminator() == &I)
1929 continue;
1930 InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
1931 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1932 RTCheckCost += C;
1933 }
1934 if (MemCheckBlock) {
1935 InstructionCost MemCheckCost = 0;
1936 for (Instruction &I : *MemCheckBlock) {
1937 if (MemCheckBlock->getTerminator() == &I)
1938 continue;
1939 InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
1940 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1941 MemCheckCost += C;
1942 }
1943
1944 // If the runtime memory checks are being created inside an outer loop
1945 // we should find out if these checks are outer loop invariant. If so,
// the checks will likely be hoisted out and so the effective cost will be
// reduced in proportion to the outer loop trip count.
1948 if (OuterLoop) {
1949 ScalarEvolution *SE = MemCheckExp.getSE();
1950 // TODO: If profitable, we could refine this further by analysing every
1951 // individual memory check, since there could be a mixture of loop
1952 // variant and invariant checks that mean the final condition is
1953 // variant.
1954 const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
1955 if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
1956 // It seems reasonable to assume that we can reduce the effective
1957 // cost of the checks even when we know nothing about the trip
1958 // count. Assume that the outer loop executes at least twice.
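// For example (illustrative, assumed values): a memory-check cost of 20
// combined with an estimated outer-loop trip count of 5 is accounted below
// as 20 / 5 = 4.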
1959 unsigned BestTripCount = 2;
1960
1961 // Get the best known TC estimate.
1962 if (auto EstimatedTC = getSmallBestKnownTC(
1963 PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
1964 if (EstimatedTC->isFixed())
1965 BestTripCount = EstimatedTC->getFixedValue();
1966
1967 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1968
1969 // Let's ensure the cost is always at least 1.
1970 NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
1971 b: (InstructionCost::CostType)1);
1972
1973 if (BestTripCount > 1)
1974 LLVM_DEBUG(dbgs()
1975 << "We expect runtime memory checks to be hoisted "
1976 << "out of the outer loop. Cost reduced from "
1977 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1978
1979 MemCheckCost = NewMemCheckCost;
1980 }
1981 }
1982
1983 RTCheckCost += MemCheckCost;
1984 }
1985
1986 if (SCEVCheckBlock || MemCheckBlock)
1987 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1988 << "\n");
1989
1990 return RTCheckCost;
1991 }
1992
1993 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1994 /// unused.
1995 ~GeneratedRTChecks() {
1996 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1997 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1998 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
1999 bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
2000 if (SCEVChecksUsed)
2001 SCEVCleaner.markResultUsed();
2002
2003 if (MemChecksUsed) {
2004 MemCheckCleaner.markResultUsed();
2005 } else {
2006 auto &SE = *MemCheckExp.getSE();
2007 // Memory runtime check generation creates compares that use expanded
2008 // values. Remove them before running the SCEVExpanderCleaners.
2009 for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
2010 if (MemCheckExp.isInsertedInstruction(I: &I))
2011 continue;
2012 SE.forgetValue(V: &I);
2013 I.eraseFromParent();
2014 }
2015 }
2016 MemCheckCleaner.cleanup();
2017 SCEVCleaner.cleanup();
2018
2019 if (!SCEVChecksUsed)
2020 SCEVCheckBlock->eraseFromParent();
2021 if (!MemChecksUsed)
2022 MemCheckBlock->eraseFromParent();
2023 }
2024
2025 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2026 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2027 /// depending on the generated condition.
2028 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2029 BasicBlock *LoopVectorPreHeader) {
2030 using namespace llvm::PatternMatch;
2031 if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
2032 return nullptr;
2033
2034 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2035 BranchInst::Create(IfTrue: LoopVectorPreHeader, InsertBefore: SCEVCheckBlock);
2036
2037 SCEVCheckBlock->getTerminator()->eraseFromParent();
2038 SCEVCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
2039 Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
2040 NewBB: SCEVCheckBlock);
2041
2042 BranchInst &BI =
2043 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: SCEVCheckCond);
2044 if (AddBranchWeights)
2045 setBranchWeights(I&: BI, Weights: SCEVCheckBypassWeights, /*IsExpected=*/false);
2046 ReplaceInstWithInst(From: SCEVCheckBlock->getTerminator(), To: &BI);
2047 AddedAnyChecks = true;
2048 return SCEVCheckBlock;
2049 }
2050
2051 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2052 /// the branches to branch to the vector preheader or \p Bypass, depending on
2053 /// the generated condition.
2054 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2055 BasicBlock *LoopVectorPreHeader) {
2056 // Check if we generated code that checks in runtime if arrays overlap.
2057 if (!MemRuntimeCheckCond)
2058 return nullptr;
2059
2060 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2061 Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
2062 NewBB: MemCheckBlock);
2063
2064 MemCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
2065
2066 BranchInst &BI =
2067 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: MemRuntimeCheckCond);
2068 if (AddBranchWeights) {
2069 setBranchWeights(I&: BI, Weights: MemCheckBypassWeights, /*IsExpected=*/false);
2070 }
2071 ReplaceInstWithInst(From: MemCheckBlock->getTerminator(), To: &BI);
2072 MemCheckBlock->getTerminator()->setDebugLoc(
2073 Pred->getTerminator()->getDebugLoc());
2074
2075 AddedAnyChecks = true;
2076 return MemCheckBlock;
2077 }
2078
/// Return true if any runtime checks have been added.
2080 bool hasChecks() const { return AddedAnyChecks; }
2081};
2082} // namespace
2083
2084static bool useActiveLaneMask(TailFoldingStyle Style) {
2085 return Style == TailFoldingStyle::Data ||
2086 Style == TailFoldingStyle::DataAndControlFlow ||
2087 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2088}
2089
2090static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2091 return Style == TailFoldingStyle::DataAndControlFlow ||
2092 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2093}
2094
2095// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2096// vectorization. The loop needs to be annotated with #pragma omp simd
2097// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2098// vector length information is not provided, vectorization is not considered
2099// explicit. Interleave hints are not allowed either. These limitations will be
2100// relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2102// vectorize' semantics. This pragma provides *auto-vectorization hints*
2103// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2104// provides *explicit vectorization hints* (LV can bypass legal checks and
2105// assume that vectorization is legal). However, both hints are implemented
2106// using the same metadata (llvm.loop.vectorize, processed by
2107// LoopVectorizeHints). This will be fixed in the future when the native IR
2108// representation for pragma 'omp simd' is introduced.
2109static bool isExplicitVecOuterLoop(Loop *OuterLp,
2110 OptimizationRemarkEmitter *ORE) {
2111 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2112 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2113
2114 // Only outer loops with an explicit vectorization hint are supported.
2115 // Unannotated outer loops are ignored.
2116 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2117 return false;
2118
2119 Function *Fn = OuterLp->getHeader()->getParent();
2120 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2121 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2122 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2123 return false;
2124 }
2125
2126 if (Hints.getInterleave() > 1) {
2127 // TODO: Interleave support is future work.
2128 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2129 "outer loops.\n");
2130 Hints.emitRemarkWithHints();
2131 return false;
2132 }
2133
2134 return true;
2135}
2136
2137static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2138 OptimizationRemarkEmitter *ORE,
2139 SmallVectorImpl<Loop *> &V) {
2140 // Collect inner loops and outer loops without irreducible control flow. For
2141 // now, only collect outer loops that have explicit vectorization hints. If we
2142 // are stress testing the VPlan H-CFG construction, we collect the outermost
2143 // loop of every loop nest.
2144 if (L.isInnermost() || VPlanBuildStressTest ||
2145 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2146 LoopBlocksRPO RPOT(&L);
2147 RPOT.perform(LI);
2148 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2149 V.push_back(Elt: &L);
2150 // TODO: Collect inner loops inside marked outer loops in case
2151 // vectorization fails for the outer loop. Do not invoke
2152 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2153 // already known to be reducible. We can use an inherited attribute for
2154 // that.
2155 return;
2156 }
2157 }
2158 for (Loop *InnerL : L)
2159 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2160}
2161
2162//===----------------------------------------------------------------------===//
2163// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2164// LoopVectorizationCostModel and LoopVectorizationPlanner.
2165//===----------------------------------------------------------------------===//
2166
2167/// Compute the transformed value of Index at offset StartValue using step
2168/// StepValue.
2169/// For integer induction, returns StartValue + Index * StepValue.
2170/// For pointer induction, returns StartValue[Index * StepValue].
2171/// FIXME: The newly created binary instructions should contain nsw/nuw
2172/// flags, which can be found from the original scalar operations.
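/// For example (illustrative): for an integer induction with StartValue
/// %start, Step 4 and Index %i, this emits %start + (%i * 4); for a pointer
/// induction it emits (roughly) a byte-wise ptradd of %start by %i * Step.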
2173static Value *
2174emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2175 Value *Step,
2176 InductionDescriptor::InductionKind InductionKind,
2177 const BinaryOperator *InductionBinOp) {
2178 using namespace llvm::PatternMatch;
2179 Type *StepTy = Step->getType();
2180 Value *CastedIndex = StepTy->isIntegerTy()
2181 ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
2182 : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
2183 if (CastedIndex != Index) {
2184 CastedIndex->setName(CastedIndex->getName() + ".cast");
2185 Index = CastedIndex;
2186 }
2187
2188 // Note: the IR at this point is broken. We cannot use SE to create any new
2189 // SCEV and then expand it, hoping that SCEV's simplification will give us
// more optimal code. Unfortunately, attempting to do so on invalid IR may
2191 // lead to various SCEV crashes. So all we can do is to use builder and rely
2192 // on InstCombine for future simplifications. Here we handle some trivial
2193 // cases only.
2194 auto CreateAdd = [&B](Value *X, Value *Y) {
2195 assert(X->getType() == Y->getType() && "Types don't match!");
2196 if (match(V: X, P: m_ZeroInt()))
2197 return Y;
2198 if (match(V: Y, P: m_ZeroInt()))
2199 return X;
2200 return B.CreateAdd(LHS: X, RHS: Y);
2201 };
2202
2203 // We allow X to be a vector type, in which case Y will potentially be
2204 // splatted into a vector with the same element count.
2205 auto CreateMul = [&B](Value *X, Value *Y) {
2206 assert(X->getType()->getScalarType() == Y->getType() &&
2207 "Types don't match!");
2208 if (match(V: X, P: m_One()))
2209 return Y;
2210 if (match(V: Y, P: m_One()))
2211 return X;
2212 VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
2213 if (XVTy && !isa<VectorType>(Val: Y->getType()))
2214 Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
2215 return B.CreateMul(LHS: X, RHS: Y);
2216 };
2217
2218 switch (InductionKind) {
2219 case InductionDescriptor::IK_IntInduction: {
2220 assert(!isa<VectorType>(Index->getType()) &&
2221 "Vector indices not supported for integer inductions yet");
2222 assert(Index->getType() == StartValue->getType() &&
2223 "Index type does not match StartValue type");
2224 if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
2225 return B.CreateSub(LHS: StartValue, RHS: Index);
2226 auto *Offset = CreateMul(Index, Step);
2227 return CreateAdd(StartValue, Offset);
2228 }
2229 case InductionDescriptor::IK_PtrInduction:
2230 return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
2231 case InductionDescriptor::IK_FpInduction: {
2232 assert(!isa<VectorType>(Index->getType()) &&
2233 "Vector indices not supported for FP inductions yet");
2234 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2235 assert(InductionBinOp &&
2236 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2237 InductionBinOp->getOpcode() == Instruction::FSub) &&
2238 "Original bin op should be defined for FP induction");
2239
2240 Value *MulExp = B.CreateFMul(L: Step, R: Index);
2241 return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
2242 Name: "induction");
2243 }
2244 case InductionDescriptor::IK_NoInduction:
2245 return nullptr;
2246 }
2247 llvm_unreachable("invalid enum");
2248}
2249
2250static std::optional<unsigned> getMaxVScale(const Function &F,
2251 const TargetTransformInfo &TTI) {
2252 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2253 return MaxVScale;
2254
2255 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2256 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2257
2258 return std::nullopt;
2259}
2260
/// For the given VF and UF and maximum trip count computed for the loop,
/// return true if the induction variable cannot overflow in the vectorized
/// loop, i.e. the runtime overflow check is known to always evaluate to false
/// and can therefore be removed.
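/// For example (illustrative, fixed-width VF): with an i8 induction type, a
/// known maximum trip count of 200 and VF * UF = 32, we have 255 - 200 = 55 >
/// 32, so the overflow check is known to be false; with VF * UF = 64 it is
/// not, and the check is conservatively kept.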
2265static bool isIndvarOverflowCheckKnownFalse(
2266 const LoopVectorizationCostModel *Cost,
2267 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2268 // Always be conservative if we don't know the exact unroll factor.
2269 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2270
2271 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2272 APInt MaxUIntTripCount = IdxTy->getMask();
2273
// The runtime overflow check is known to be false iff the (max) trip-count
// is known and (max) trip-count + (VF * UF) does not overflow in the type of
// the vector loop induction variable.
2277 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2278 uint64_t MaxVF = VF.getKnownMinValue();
2279 if (VF.isScalable()) {
2280 std::optional<unsigned> MaxVScale =
2281 getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
2282 if (!MaxVScale)
2283 return false;
2284 MaxVF *= *MaxVScale;
2285 }
2286
2287 return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
2288 }
2289
2290 return false;
2291}
2292
2293// Return whether we allow using masked interleave-groups (for dealing with
2294// strided loads/stores that reside in predicated blocks, or for dealing
2295// with gaps).
2296static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2297 // If an override option has been passed in for interleaved accesses, use it.
2298 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2299 return EnableMaskedInterleavedMemAccesses;
2300
2301 return TTI.enableMaskedInterleavedAccessVectorization();
2302}
2303
2304Value *
2305InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2306 if (VectorTripCount)
2307 return VectorTripCount;
2308
2309 Value *TC = getTripCount();
2310 IRBuilder<> Builder(InsertBlock->getTerminator());
2311
2312 Type *Ty = TC->getType();
2313 // This is where we can make the step a runtime constant.
2314 Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF);
2315
2316 // If the tail is to be folded by masking, round the number of iterations N
2317 // up to a multiple of Step instead of rounding down. This is done by first
2318 // adding Step-1 and then rounding down. Note that it's ok if this addition
2319 // overflows: the vector induction variable will eventually wrap to zero given
2320 // that it starts at zero and its Step is a power of two; the loop will then
2321 // exit, with the last early-exit vector comparison also producing all-true.
2322 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2323 // is accounted for in emitIterationCountCheck that adds an overflow check.
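// For example (illustrative): with a trip count of 10 and Step = VF * UF = 4,
// this yields 10 + 3 = 13; the URem/Sub below then produce a vector trip
// count of 13 - (13 % 4) = 12, i.e. 10 rounded up to the next multiple of 4.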
2324 if (Cost->foldTailByMasking()) {
2325 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2326 "VF*UF must be a power of 2 when folding tail by masking");
2327 TC = Builder.CreateAdd(LHS: TC, RHS: Builder.CreateSub(LHS: Step, RHS: ConstantInt::get(Ty, V: 1)),
2328 Name: "n.rnd.up");
2329 }
2330
2331 // Now we need to generate the expression for the part of the loop that the
2332 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2333 // iterations are not required for correctness, or N - Step, otherwise. Step
2334 // is equal to the vectorization factor (number of SIMD elements) times the
2335 // unroll factor (number of SIMD instructions).
2336 Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf");
2337
2338 // There are cases where we *must* run at least one iteration in the remainder
2339 // loop. See the cost model for when this can happen. If the step evenly
2340 // divides the trip count, we set the remainder to be equal to the step. If
2341 // the step does not evenly divide the trip count, no adjustment is necessary
2342 // since there will already be scalar iterations. Note that the minimum
2343 // iterations check ensures that N >= Step.
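// For example (illustrative): with a trip count of 12 and Step = 4, R is 0
// and gets bumped to 4 below, leaving a vector trip count of 8 and 4
// iterations for the required scalar epilogue.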
2344 if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
2345 auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0));
2346 R = Builder.CreateSelect(C: IsZero, True: Step, False: R);
2347 }
2348
2349 VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec");
2350
2351 return VectorTripCount;
2352}
2353
2354void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
2355 // Note: The block with the minimum trip-count check is already connected
2356 // during earlier VPlan construction.
2357 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2358 VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
2359 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2360 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
2361 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB);
2362 VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPB, BlockPtr: CheckVPIRBB);
2363 PreVectorPH = CheckVPIRBB;
2364 VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH);
2365 PreVectorPH->swapSuccessors();
2366
2367 // We just connected a new block to the scalar preheader. Update all
2368 // VPPhis by adding an incoming value for it, replicating the last value.
2369 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
2370 for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) {
2371 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
2372 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
2373 "must have incoming values for all operands");
2374 R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2));
2375 }
2376}
2377
2378Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
2379 unsigned UF) const {
2380 // Generate code to check if the loop's trip count is less than VF * UF, or
2381 // equal to it in case a scalar epilogue is required; this implies that the
2382 // vector trip count is zero. This check also covers the case where adding one
2383 // to the backedge-taken count overflowed leading to an incorrect trip count
2384 // of zero. In this case we will also jump to the scalar loop.
2385 auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
2386 : ICmpInst::ICMP_ULT;
2387
2388 // Reuse existing vector loop preheader for TC checks.
2389 // Note that new preheader block is generated for vector loop.
2390 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2391 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2392
2393 // If tail is to be folded, vector loop takes care of all iterations.
2394 Value *Count = getTripCount();
2395 Type *CountTy = Count->getType();
2396 Value *CheckMinIters = Builder.getFalse();
2397 auto CreateStep = [&]() -> Value * {
// Create step with max(MinProfitableTripCount, UF * VF).
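// For example (illustrative, assuming a fixed MinProfitableTripCount of 16):
// with VF = vscale x 4 and UF = 2, this emits umax(16, 8 * vscale).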
2399 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2400 return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);
2401
2402 Value *MinProfTC =
2403 createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1);
2404 if (!VF.isScalable())
2405 return MinProfTC;
2406 return Builder.CreateBinaryIntrinsic(
2407 ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
2408 };
2409
2410 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2411 if (Style == TailFoldingStyle::None) {
2412 Value *Step = CreateStep();
2413 ScalarEvolution &SE = *PSE.getSE();
2414 // TODO: Emit unconditional branch to vector preheader instead of
2415 // conditional branch with known condition.
2416 const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop);
2417 // Check if the trip count is < the step.
2418 if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
2419 // TODO: Ensure step is at most the trip count when determining max VF and
2420 // UF, w/o tail folding.
2421 CheckMinIters = Builder.getTrue();
2422 } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P),
2423 LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
2424 // Generate the minimum iteration check only if we cannot prove the
2425 // check is known to be true, or known to be false.
2426 CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check");
2427 } // else step known to be < trip count, use CheckMinIters preset to false.
2428 } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
2429 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2430 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2431 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2432 // an overflow to zero when updating induction variables and so an
2433 // additional overflow check is required before entering the vector loop.
2434
2435 // Get the maximum unsigned value for the type.
2436 Value *MaxUIntTripCount =
2437 ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
2438 Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);
2439
2440 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2441 CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
2442 }
2443 return CheckMinIters;
2444}
2445
2446void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2447 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2448 Value *CheckMinIters = createIterationCountCheck(VF, UF);
2449 // Create new preheader for vector loop.
2450 LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
2451 DT: static_cast<DominatorTree *>(nullptr), LI,
2452 MSSAU: nullptr, BBName: "vector.ph");
2453
2454 BranchInst &BI =
2455 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
2456 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
2457 setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
2458 ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
2459
2460 assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
2461 TCCheckBlock &&
"Plan's entry must be TCCheckBlock");
2463}
2464
2465BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2466 BasicBlock *const SCEVCheckBlock =
2467 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader);
2468 if (!SCEVCheckBlock)
2469 return nullptr;
2470
2471 assert((!Cost->OptForSize ||
2472 Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
2473 "Cannot SCEV check stride or overflow when optimizing for size");
2474
2475 introduceCheckBlockInVPlan(CheckIRBB: SCEVCheckBlock);
2476 return SCEVCheckBlock;
2477}
2478
2479BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2480 BasicBlock *const MemCheckBlock =
2481 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2482
2483 // Check if we generated code that checks in runtime if arrays overlap. We put
2484 // the checks into a separate block to make the more common case of few
2485 // elements faster.
2486 if (!MemCheckBlock)
2487 return nullptr;
2488
2489 // VPlan-native path does not do any analysis for runtime checks currently.
2490 assert((!EnableVPlanNativePath || OrigLoop->begin() == OrigLoop->end()) &&
2491 "Runtime checks are not supported for outer loops yet");
2492
2493 if (Cost->OptForSize) {
2494 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2495 "Cannot emit memory checks when optimizing for size, unless forced "
2496 "to vectorize.");
2497 ORE->emit(RemarkBuilder: [&]() {
2498 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2499 OrigLoop->getStartLoc(),
2500 OrigLoop->getHeader())
2501 << "Code-size may be reduced by not forcing "
2502 "vectorization, or by source-code modifications "
2503 "eliminating the need for runtime checks "
2504 "(e.g., adding 'restrict').";
2505 });
2506 }
2507
2508 introduceCheckBlockInVPlan(CheckIRBB: MemCheckBlock);
2509 return MemCheckBlock;
2510}
2511
2512/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2513/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
2514/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
2515/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
2516static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
2517 VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
2518 for (auto &R : make_early_inc_range(Range&: *VPBB)) {
2519 assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) &&
2520 "Tried to move phi recipe after a non-phi recipe");
2521 R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());
2522 }
2523
2524 VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
2525 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2526}
2527
2528void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2529 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2530 assert(LoopVectorPreHeader && "Invalid loop structure");
2531 assert((OrigLoop->getUniqueLatchExitBlock() ||
2532 Cost->requiresScalarEpilogue(VF.isVector())) &&
2533 "loops not exiting via the latch without required epilogue?");
2534
2535 LoopScalarPreHeader =
2536 SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
2537 LI, MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph");
2538 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2539 // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
2540 // preheader may be unreachable at this point. Instead it is replaced in
2541 // createVectorizedLoopSkeleton.
2542}
2543
2544/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2545/// expansion results.
2546static Value *getExpandedStep(const InductionDescriptor &ID,
2547 const SCEV2ValueTy &ExpandedSCEVs) {
2548 const SCEV *Step = ID.getStep();
2549 if (auto *C = dyn_cast<SCEVConstant>(Val: Step))
2550 return C->getValue();
2551 if (auto *U = dyn_cast<SCEVUnknown>(Val: Step))
2552 return U->getValue();
2553 Value *V = ExpandedSCEVs.lookup(Val: Step);
2554 assert(V && "SCEV must be expanded at this point");
2555 return V;
2556}
2557
/// Knowing that loop \p L executes a single vector iteration, add to \p
/// InstsToIgnore the instructions that will get simplified away and thus
/// should not have any cost.
2561static void addFullyUnrolledInstructionsToIgnore(
2562 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2563 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2564 auto *Cmp = L->getLatchCmpInst();
2565 if (Cmp)
2566 InstsToIgnore.insert(Ptr: Cmp);
2567 for (const auto &KV : IL) {
2568 // Extract the key by hand so that it can be used in the lambda below. Note
2569 // that captured structured bindings are a C++20 extension.
2570 const PHINode *IV = KV.first;
2571
2572 // Get next iteration value of the induction variable.
2573 Instruction *IVInst =
2574 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2575 if (all_of(Range: IVInst->users(),
2576 P: [&](const User *U) { return U == IV || U == Cmp; }))
2577 InstsToIgnore.insert(Ptr: IVInst);
2578 }
2579}
2580
2581BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2582 /*
2583 In this function we generate a new loop. The new loop will contain
2584 the vectorized instructions while the old loop will continue to run the
2585 scalar remainder.
2586
2587 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2588 / | preheader are expanded here. Eventually all required SCEV
2589 / | expansion should happen here.
2590 / v
2591 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2592 | / |
2593 | / v
2594 || [ ] <-- vector pre header.
2595 |/ |
2596 | v
2597 | [ ] \
2598 | [ ]_| <-- vector loop (created during VPlan execution).
2599 | |
2600 | v
2601 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2602 | | successors created during VPlan execution)
2603 \/ |
2604 /\ v
2605 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2606 | |
2607 (opt) v <-- edge from middle to exit iff epilogue is not required.
2608 | [ ] \
2609 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header
2610 | | wrapped in VPIRBasicBlock).
2611 \ |
2612 \ v
2613 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2614 ...
2615 */
2616
2617 // Create an empty vector loop, and prepare basic blocks for the runtime
2618 // checks.
2619 createVectorLoopSkeleton(Prefix: "");
2620
  // Now, compare the new count to zero. If it is zero, skip the vector loop
  // and jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow,
  // leading to an incorrect trip count of zero. In this (rare) case we will
  // also jump to the scalar loop.
2626 emitIterationCountCheck(Bypass: LoopScalarPreHeader);
2627
2628 // Generate the code to check any assumptions that we've made for SCEV
2629 // expressions.
2630 emitSCEVChecks(Bypass: LoopScalarPreHeader);
2631
  // Generate the code that checks at runtime whether the arrays overlap. We
  // put the checks into a separate block to make the more common case of few
  // elements faster.
2635 emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);
2636
2637 replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
2638 return LoopVectorPreHeader;
2639}
2640
2641namespace {
2642
2643struct CSEDenseMapInfo {
2644 static bool canHandle(const Instruction *I) {
2645 return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
2646 isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
2647 }
2648
2649 static inline Instruction *getEmptyKey() {
2650 return DenseMapInfo<Instruction *>::getEmptyKey();
2651 }
2652
2653 static inline Instruction *getTombstoneKey() {
2654 return DenseMapInfo<Instruction *>::getTombstoneKey();
2655 }
2656
2657 static unsigned getHashValue(const Instruction *I) {
2658 assert(canHandle(I) && "Unknown instruction!");
2659 return hash_combine(args: I->getOpcode(),
2660 args: hash_combine_range(R: I->operand_values()));
2661 }
2662
2663 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2664 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2665 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2666 return LHS == RHS;
2667 return LHS->isIdenticalTo(I: RHS);
2668 }
2669};
2670
2671} // end anonymous namespace
2672
/// Perform common-subexpression elimination (CSE) of induction variable
/// instructions.
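// For example (illustrative), two identical getelementptr instructions
// produced while materializing widened induction steps in the vector loop
// header are merged into a single instruction here.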
2674static void cse(BasicBlock *BB) {
2675 // Perform simple cse.
2676 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2677 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2678 if (!CSEDenseMapInfo::canHandle(I: &In))
2679 continue;
2680
2681 // Check if we can replace this instruction with any of the
2682 // visited instructions.
2683 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2684 In.replaceAllUsesWith(V);
2685 In.eraseFromParent();
2686 continue;
2687 }
2688
2689 CSEMap[&In] = &In;
2690 }
2691}
2692
2693/// This function attempts to return a value that represents the vectorization
2694/// factor at runtime. For fixed-width VFs we know this precisely at compile
2695/// time, but for scalable VFs we calculate it based on an estimate of the
2696/// vscale value.
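/// For example (illustrative), VF = vscale x 4 with an estimated vscale of 2
/// yields an estimate of 8 lanes; the actual value of vscale is only known at
/// runtime.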
2697static unsigned getEstimatedRuntimeVF(ElementCount VF,
2698 std::optional<unsigned> VScale) {
2699 unsigned EstimatedVF = VF.getKnownMinValue();
2700 if (VF.isScalable())
2701 if (VScale)
2702 EstimatedVF *= *VScale;
2703 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2704 return EstimatedVF;
2705}
2706
2707InstructionCost
2708LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2709 ElementCount VF) const {
2710 // We only need to calculate a cost if the VF is scalar; for actual vectors
2711 // we should already have a pre-calculated cost at each VF.
2712 if (!VF.isScalar())
2713 return getCallWideningDecision(CI, VF).Cost;
2714
2715 Type *RetTy = CI->getType();
2716 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
2717 if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
2718 return *RedCost;
2719
2720 SmallVector<Type *, 4> Tys;
2721 for (auto &ArgOp : CI->args())
2722 Tys.push_back(Elt: ArgOp->getType());
2723
2724 InstructionCost ScalarCallCost =
2725 TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);
2726
2727 // If this is an intrinsic we may have a lower cost for it.
2728 if (getVectorIntrinsicIDForCall(CI, TLI)) {
2729 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2730 return std::min(a: ScalarCallCost, b: IntrinsicCost);
2731 }
2732 return ScalarCallCost;
2733}
2734
2735static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2736 if (VF.isScalar() || !canVectorizeTy(Ty))
2737 return Ty;
2738 return toVectorizedTy(Ty, EC: VF);
2739}
2740
2741InstructionCost
2742LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2743 ElementCount VF) const {
2744 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2745 assert(ID && "Expected intrinsic call!");
2746 Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
2747 FastMathFlags FMF;
2748 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
2749 FMF = FPMO->getFastMathFlags();
2750
2751 SmallVector<const Value *> Arguments(CI->args());
2752 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2753 SmallVector<Type *> ParamTys;
2754 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
2755 result: std::back_inserter(x&: ParamTys),
2756 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2757
2758 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2759 dyn_cast<IntrinsicInst>(Val: CI),
2760 InstructionCost::getInvalid(), TLI);
2761 return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
2762}
2763
2764void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2765 // Fix widened non-induction PHIs by setting up the PHI operands.
2766 fixNonInductionPHIs(State);
2767
2768 // After vectorization, the exit blocks of the original loop will have
2769 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2770 // looked through single-entry phis.
2771 SmallVector<BasicBlock *> ExitBlocks;
2772 OrigLoop->getExitBlocks(ExitBlocks);
2773 for (BasicBlock *Exit : ExitBlocks)
2774 for (PHINode &PN : Exit->phis())
2775 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN);
2776
2777 // Forget the original basic block.
2778 PSE.getSE()->forgetLoop(L: OrigLoop);
2779 PSE.getSE()->forgetBlockAndLoopDispositions();
2780
2781 // Don't apply optimizations below when no (vector) loop remains, as they all
2782 // require one at the moment.
2783 VPBasicBlock *HeaderVPBB =
2784 vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
2785 if (!HeaderVPBB)
2786 return;
2787
2788 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2789
2790 // Remove redundant induction instructions.
2791 cse(BB: HeaderBB);
2792
  // Set/update profile weights for the vector and remainder loops, as the
  // original loop iterations are now distributed among them. Note that the
  // original loop becomes the scalar remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with slightly less accurate results, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
2802 //
2803 // For scalable vectorization we can't know at compile time how many
2804 // iterations of the loop are handled in one vector iteration, so instead
2805 // use the value of vscale used for tuning.
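  // For example (illustrative), if the original loop was expected to run about
  // 1000 iterations and the estimated VF*UF is 8, the vector loop is credited
  // with roughly 1000/8 = 125 iterations and the scalar remainder with the few
  // left-over iterations; the exact weights are computed by
  // setProfileInfoAfterUnrolling.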
2806 Loop *VectorLoop = LI->getLoopFor(BB: HeaderBB);
2807 unsigned EstimatedVFxUF =
2808 getEstimatedRuntimeVF(VF: VF * UF, VScale: Cost->getVScaleForTuning());
2809 setProfileInfoAfterUnrolling(OrigLoop, UnrolledLoop: VectorLoop, RemainderLoop: OrigLoop, UF: EstimatedVFxUF);
2810}
2811
2812void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
2813 auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
2814 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2815 for (VPRecipeBase &P : VPBB->phis()) {
2816 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
2817 if (!VPPhi)
2818 continue;
2819 PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
2820 // Make sure the builder has a valid insert point.
2821 Builder.SetInsertPoint(NewPhi);
2822 for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) {
2823 VPValue *Inc = VPPhi->getIncomingValue(Idx);
2824 const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
2825 NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
2826 }
2827 }
2828 }
2829}
2830
2831void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2832 // We should not collect Scalars more than once per VF. Right now, this
2833 // function is called from collectUniformsAndScalars(), which already does
2834 // this check. Collecting Scalars for VF=1 does not make any sense.
2835 assert(VF.isVector() && !Scalars.contains(VF) &&
2836 "This function should not be visited twice for the same VF");
2837
2838 // This avoids any chances of creating a REPLICATE recipe during planning
2839 // since that would result in generation of scalarized code during execution,
2840 // which is not supported for scalable vectors.
2841 if (VF.isScalable()) {
2842 Scalars[VF].insert_range(R&: Uniforms[VF]);
2843 return;
2844 }
2845
2846 SmallSetVector<Instruction *, 8> Worklist;
2847
2848 // These sets are used to seed the analysis with pointers used by memory
2849 // accesses that will remain scalar.
2850 SmallSetVector<Instruction *, 8> ScalarPtrs;
2851 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2852 auto *Latch = TheLoop->getLoopLatch();
2853
2854 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2855 // The pointer operands of loads and stores will be scalar as long as the
2856 // memory access is not a gather or scatter operation. The value operand of a
2857 // store will remain scalar if the store is scalarized.
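  // For example (illustrative), the address of a consecutive (unit-stride)
  // load stays scalar, while an address feeding a gather or scatter must be
  // kept as a vector of pointers.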
2858 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2859 InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
2860 assert(WideningDecision != CM_Unknown &&
2861 "Widening decision should be ready at this moment");
2862 if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
2863 if (Ptr == Store->getValueOperand())
2864 return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
2867 return WideningDecision != CM_GatherScatter;
2868 };
2869
2870 // A helper that returns true if the given value is a getelementptr
2871 // instruction contained in the loop.
2872 auto IsLoopVaryingGEP = [&](Value *V) {
2873 return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
2874 };
2875
2876 // A helper that evaluates a memory access's use of a pointer. If the use will
2877 // be a scalar use and the pointer is only used by memory accesses, we place
2878 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2879 // PossibleNonScalarPtrs.
2880 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2881 // We only care about bitcast and getelementptr instructions contained in
2882 // the loop.
2883 if (!IsLoopVaryingGEP(Ptr))
2884 return;
2885
2886 // If the pointer has already been identified as scalar (e.g., if it was
2887 // also identified as uniform), there's nothing to do.
2888 auto *I = cast<Instruction>(Val: Ptr);
2889 if (Worklist.count(key: I))
2890 return;
2891
2892 // If the use of the pointer will be a scalar use, and all users of the
2893 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2894 // place the pointer in PossibleNonScalarPtrs.
2895 if (IsScalarUse(MemAccess, Ptr) &&
2896 all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
2897 ScalarPtrs.insert(X: I);
2898 else
2899 PossibleNonScalarPtrs.insert(Ptr: I);
2900 };
2901
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use, and (3) instructions explicitly forced to be
  // scalar (see the ForcedScalars handling below).
2906 //
2907 // (1) Add to the worklist all instructions that have been identified as
2908 // uniform-after-vectorization.
2909 Worklist.insert_range(R&: Uniforms[VF]);
2910
2911 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2912 // memory accesses requiring a scalar use. The pointer operands of loads and
2913 // stores will be scalar unless the operation is a gather or scatter.
2914 // The value operand of a store will remain scalar if the store is scalarized.
2915 for (auto *BB : TheLoop->blocks())
2916 for (auto &I : *BB) {
2917 if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
2918 EvaluatePtrUse(Load, Load->getPointerOperand());
2919 } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
2920 EvaluatePtrUse(Store, Store->getPointerOperand());
2921 EvaluatePtrUse(Store, Store->getValueOperand());
2922 }
2923 }
2924 for (auto *I : ScalarPtrs)
2925 if (!PossibleNonScalarPtrs.count(Ptr: I)) {
2926 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2927 Worklist.insert(X: I);
2928 }
2929
2930 // Insert the forced scalars.
2931 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2932 // induction variable when the PHI user is scalarized.
2933 auto ForcedScalar = ForcedScalars.find(Val: VF);
2934 if (ForcedScalar != ForcedScalars.end())
2935 for (auto *I : ForcedScalar->second) {
2936 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2937 Worklist.insert(X: I);
2938 }
2939
2940 // Expand the worklist by looking through any bitcasts and getelementptr
2941 // instructions we've already identified as scalar. This is similar to the
2942 // expansion step in collectLoopUniforms(); however, here we're only
2943 // expanding to include additional bitcasts and getelementptr instructions.
2944 unsigned Idx = 0;
2945 while (Idx != Worklist.size()) {
2946 Instruction *Dst = Worklist[Idx++];
2947 if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
2948 continue;
2949 auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
2950 if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
2951 auto *J = cast<Instruction>(Val: U);
2952 return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
2953 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
2954 IsScalarUse(J, Src));
2955 })) {
2956 Worklist.insert(X: Src);
2957 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2958 }
2959 }
2960
2961 // An induction variable will remain scalar if all users of the induction
2962 // variable and induction variable update remain scalar.
2963 for (const auto &Induction : Legal->getInductionVars()) {
2964 auto *Ind = Induction.first;
2965 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
2966
2967 // If tail-folding is applied, the primary induction variable will be used
2968 // to feed a vector compare.
2969 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2970 continue;
2971
2972 // Returns true if \p Indvar is a pointer induction that is used directly by
2973 // load/store instruction \p I.
2974 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2975 Instruction *I) {
2976 return Induction.second.getKind() ==
2977 InductionDescriptor::IK_PtrInduction &&
2978 (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
2979 Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
2980 };
2981
2982 // Determine if all users of the induction variable are scalar after
2983 // vectorization.
2984 bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
2985 auto *I = cast<Instruction>(Val: U);
2986 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
2987 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2988 });
2989 if (!ScalarInd)
2990 continue;
2991
    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable nor its update should be marked scalar after
    // vectorization.
2995 auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
2996 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
2997 continue;
2998
2999 // Determine if all users of the induction variable update instruction are
3000 // scalar after vectorization.
3001 bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
3002 auto *I = cast<Instruction>(Val: U);
3003 return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3004 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3005 });
3006 if (!ScalarIndUpdate)
3007 continue;
3008
3009 // The induction variable and its update instruction will remain scalar.
3010 Worklist.insert(X: Ind);
3011 Worklist.insert(X: IndUpdate);
3012 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3013 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3014 << "\n");
3015 }
3016
3017 Scalars[VF].insert_range(R&: Worklist);
3018}
3019
3020bool LoopVectorizationCostModel::isScalarWithPredication(
3021 Instruction *I, ElementCount VF) const {
3022 if (!isPredicatedInst(I))
3023 return false;
3024
  // Do we have a non-scalar lowering for this predicated
  // instruction? If not, it is scalar with predication.
3027 switch(I->getOpcode()) {
3028 default:
3029 return true;
3030 case Instruction::Call:
3031 if (VF.isScalar())
3032 return true;
3033 return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
3034 case Instruction::Load:
3035 case Instruction::Store: {
3036 auto *Ptr = getLoadStorePointerOperand(V: I);
3037 auto *Ty = getLoadStoreType(I);
3038 unsigned AS = getLoadStoreAddressSpace(I);
3039 Type *VTy = Ty;
3040 if (VF.isVector())
3041 VTy = VectorType::get(ElementType: Ty, EC: VF);
3042 const Align Alignment = getLoadStoreAlignment(I);
3043 return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
3044 TTI.isLegalMaskedGather(DataType: VTy, Alignment))
3045 : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
3046 TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
3047 }
3048 case Instruction::UDiv:
3049 case Instruction::SDiv:
3050 case Instruction::SRem:
3051 case Instruction::URem: {
3052 // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost-based decision here will always select safe-divisor for
3054 // scalable vectors as scalarization isn't legal.
3055 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3056 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3057 }
3058 }
3059}
3060
3061// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
3062bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3063 // TODO: We can use the loop-preheader as context point here and get
3064 // context sensitive reasoning for isSafeToSpeculativelyExecute.
3065 if (isSafeToSpeculativelyExecute(I) ||
3066 (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) ||
3067 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(Val: I))
3068 return false;
3069
3070 // If the instruction was executed conditionally in the original scalar loop,
3071 // predication is needed with a mask whose lanes are all possibly inactive.
3072 if (Legal->blockNeedsPredication(BB: I->getParent()))
3073 return true;
3074
3075 // If we're not folding the tail by masking, predication is unnecessary.
3076 if (!foldTailByMasking())
3077 return false;
3078
3079 // All that remain are instructions with side-effects originally executed in
3080 // the loop unconditionally, but now execute under a tail-fold mask (only)
3081 // having at least one active lane (the first). If the side-effects of the
3082 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
3083 // - it will cause the same side-effects as when masked.
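  // For example (illustrative), a store of a loop-invariant value to a
  // loop-invariant address has the same effect whether one lane or all lanes
  // execute it, so it needs no mask; see the Store case below.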
3084 switch(I->getOpcode()) {
3085 default:
3086 llvm_unreachable(
3087 "instruction should have been considered by earlier checks");
3088 case Instruction::Call:
3089 // Side-effects of a Call are assumed to be non-invariant, needing a
3090 // (fold-tail) mask.
3091 assert(Legal->isMaskRequired(I) &&
3092 "should have returned earlier for calls not needing a mask");
3093 return true;
3094 case Instruction::Load:
3095 // If the address is loop invariant no predication is needed.
3096 return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
3097 case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads) and that the value being stored is correct.
    // The easiest form of the latter is to require that all values stored are
    // the same.
3102 return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
3103 Legal->isInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
3104 }
3105 case Instruction::UDiv:
3106 case Instruction::SDiv:
3107 case Instruction::SRem:
3108 case Instruction::URem:
3109 // If the divisor is loop-invariant no predication is needed.
3110 return !Legal->isInvariant(V: I->getOperand(i: 1));
3111 }
3112}
3113
3114std::pair<InstructionCost, InstructionCost>
3115LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3116 ElementCount VF) const {
3117 assert(I->getOpcode() == Instruction::UDiv ||
3118 I->getOpcode() == Instruction::SDiv ||
3119 I->getOpcode() == Instruction::SRem ||
3120 I->getOpcode() == Instruction::URem);
3121 assert(!isSafeToSpeculativelyExecute(I));
3122
3123 // Scalarization isn't legal for scalable vector types
3124 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3125 if (!VF.isScalable()) {
3126 // Get the scalarization cost and scale this amount by the probability of
3127 // executing the predicated block. If the instruction is not predicated,
3128 // we fall through to the next case.
3129 ScalarizationCost = 0;
3130
3131 // These instructions have a non-void type, so account for the phi nodes
3132 // that we will create. This cost is likely to be zero. The phi node
3133 // cost, if any, should be scaled by the block probability because it
3134 // models a copy at the end of each predicated block.
3135 ScalarizationCost +=
3136 VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
3137
3138 // The cost of the non-predicated instruction.
3139 ScalarizationCost +=
3140 VF.getFixedValue() *
3141 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);
3142
3143 // The cost of insertelement and extractelement instructions needed for
3144 // scalarization.
3145 ScalarizationCost += getScalarizationOverhead(I, VF);
3146
3147 // Scale the cost by the probability of executing the predicated blocks.
3148 // This assumes the predicated block for each vector lane is equally
3149 // likely.
3150 ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
3151 }
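  // Putting the pieces together (illustrative): for VF = 4 the scalarization
  // cost is roughly 4 * (phi + div) plus the insert/extract overhead, scaled
  // down by the predicated-block probability divisor computed above.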
3152 InstructionCost SafeDivisorCost = 0;
3153
3154 auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);
3155
3156 // The cost of the select guard to ensure all lanes are well defined
3157 // after we speculate above any internal control flow.
3158 SafeDivisorCost +=
3159 TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
3160 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
3161 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
3162
3163 // Certain instructions can be cheaper to vectorize if they have a constant
3164 // second vector operand. One example of this are shifts on x86.
3165 Value *Op2 = I->getOperand(i: 1);
3166 auto Op2Info = TTI.getOperandInfo(V: Op2);
3167 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3168 Legal->isInvariant(V: Op2))
3169 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3170
3171 SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      I->getOpcode(), VecTy, CostKind,
      {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
      Op2Info, Operands, I);
3176 return {ScalarizationCost, SafeDivisorCost};
3177}
3178
3179bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3180 Instruction *I, ElementCount VF) const {
3181 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3182 assert(getWideningDecision(I, VF) == CM_Unknown &&
3183 "Decision should not be set yet.");
3184 auto *Group = getInterleavedAccessGroup(Instr: I);
3185 assert(Group && "Must have a group.");
3186 unsigned InterleaveFactor = Group->getFactor();
3187
3188 // If the instruction's allocated size doesn't equal its type size, it
3189 // requires padding and will be scalarized.
3190 auto &DL = I->getDataLayout();
3191 auto *ScalarTy = getLoadStoreType(I);
3192 if (hasIrregularType(Ty: ScalarTy, DL))
3193 return false;
3194
3195 // For scalable vectors, the interleave factors must be <= 8 since we require
3196 // the (de)interleaveN intrinsics instead of shufflevectors.
3197 if (VF.isScalable() && InterleaveFactor > 8)
3198 return false;
3199
3200 // If the group involves a non-integral pointer, we may not be able to
3201 // losslessly cast all values to a common type.
3202 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
3203 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3204 Instruction *Member = Group->getMember(Index: Idx);
3205 if (!Member)
3206 continue;
3207 auto *MemberTy = getLoadStoreType(I: Member);
3208 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
3209 // Don't coerce non-integral pointers to integers or vice versa.
3210 if (MemberNI != ScalarNI)
3211 // TODO: Consider adding special nullptr value case here
3212 return false;
3213 if (MemberNI && ScalarNI &&
3214 ScalarTy->getPointerAddressSpace() !=
3215 MemberTy->getPointerAddressSpace())
3216 return false;
3217 }
3218
3219 // Check if masking is required.
3220 // A Group may need masking for one of two reasons: it resides in a block that
3221 // needs predication, or it was decided to use masking to deal with gaps
3222 // (either a gap at the end of a load-access that may result in a speculative
3223 // load, or any gaps in a store-access).
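  // For example (illustrative), a factor-2 load group reading both A[2*i] and
  // A[2*i+1] has no gaps and needs no gap masking, whereas a factor-3 store
  // group writing only one of its three members leaves gaps and does.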
3224 bool PredicatedAccessRequiresMasking =
3225 blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
3226 Legal->isMaskRequired(I);
3227 bool LoadAccessWithGapsRequiresEpilogMasking =
3228 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
3229 !isScalarEpilogueAllowed();
3230 bool StoreAccessWithGapsRequiresMasking =
3231 isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor());
3232 if (!PredicatedAccessRequiresMasking &&
3233 !LoadAccessWithGapsRequiresEpilogMasking &&
3234 !StoreAccessWithGapsRequiresMasking)
3235 return true;
3236
3237 // If masked interleaving is required, we expect that the user/target had
3238 // enabled it, because otherwise it either wouldn't have been created or
3239 // it should have been invalidated by the CostModel.
3240 assert(useMaskedInterleavedAccesses(TTI) &&
3241 "Masked interleave-groups for predicated accesses are not enabled.");
3242
3243 if (Group->isReverse())
3244 return false;
3245
3246 auto *Ty = getLoadStoreType(I);
3247 const Align Alignment = getLoadStoreAlignment(I);
3248 unsigned AS = getLoadStoreAddressSpace(I);
3249 return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
3250 : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
3251}
3252
3253bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3254 Instruction *I, ElementCount VF) {
3255 // Get and ensure we have a valid memory instruction.
3256 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3257
3258 auto *Ptr = getLoadStorePointerOperand(V: I);
3259 auto *ScalarTy = getLoadStoreType(I);
3260
3261 // In order to be widened, the pointer should be consecutive, first of all.
3262 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3263 return false;
3264
3265 // If the instruction is a store located in a predicated block, it will be
3266 // scalarized.
3267 if (isScalarWithPredication(I, VF))
3268 return false;
3269
  // If the instruction's allocated size doesn't equal its type size, it
3271 // requires padding and will be scalarized.
3272 auto &DL = I->getDataLayout();
3273 if (hasIrregularType(Ty: ScalarTy, DL))
3274 return false;
3275
3276 return true;
3277}
3278
3279void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3280 // We should not collect Uniforms more than once per VF. Right now,
3281 // this function is called from collectUniformsAndScalars(), which
3282 // already does this check. Collecting Uniforms for VF=1 does not make any
3283 // sense.
3284
3285 assert(VF.isVector() && !Uniforms.contains(VF) &&
3286 "This function should not be visited twice for the same VF");
3287
  // Create the entry for this VF up front. Even if we find no uniform value,
  // we won't analyze this VF again; Uniforms.count(VF) will return 1.
3290 Uniforms[VF].clear();
3291
3292 // Now we know that the loop is vectorizable!
3293 // Collect instructions inside the loop that will remain uniform after
3294 // vectorization.
3295
  // Global values, parameters, and instructions outside of the current loop
  // are out of scope.
3298 auto IsOutOfScope = [&](Value *V) -> bool {
3299 Instruction *I = dyn_cast<Instruction>(Val: V);
3300 return (!I || !TheLoop->contains(Inst: I));
3301 };
3302
3303 // Worklist containing uniform instructions demanding lane 0.
3304 SetVector<Instruction *> Worklist;
3305
3306 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3307 // that require predication must not be considered uniform after
3308 // vectorization, because that would create an erroneous replicating region
3309 // where only a single instance out of VF should be formed.
3310 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3311 if (IsOutOfScope(I)) {
3312 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3313 << *I << "\n");
3314 return;
3315 }
3316 if (isPredicatedInst(I)) {
3317 LLVM_DEBUG(
3318 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3319 << "\n");
3320 return;
3321 }
3322 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3323 Worklist.insert(X: I);
3324 };
3325
3326 // Start with the conditional branches exiting the loop. If the branch
3327 // condition is an instruction contained in the loop that is only used by the
3328 // branch, it is uniform. Note conditions from uncountable early exits are not
3329 // uniform.
3330 SmallVector<BasicBlock *> Exiting;
3331 TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
3332 for (BasicBlock *E : Exiting) {
3333 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3334 continue;
3335 auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
3336 if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
3337 AddToWorklistIfAllowed(Cmp);
3338 }
3339
3340 auto PrevVF = VF.divideCoefficientBy(RHS: 2);
3341 // Return true if all lanes perform the same memory operation, and we can
3342 // thus choose to execute only one.
3343 auto IsUniformMemOpUse = [&](Instruction *I) {
3344 // If the value was already known to not be uniform for the previous
3345 // (smaller VF), it cannot be uniform for the larger VF.
3346 if (PrevVF.isVector()) {
3347 auto Iter = Uniforms.find(Val: PrevVF);
3348 if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
3349 return false;
3350 }
3351 if (!Legal->isUniformMemOp(I&: *I, VF))
3352 return false;
3353 if (isa<LoadInst>(Val: I))
3354 // Loading the same address always produces the same result - at least
3355 // assuming aliasing and ordering which have already been checked.
3356 return true;
3357 // Storing the same value on every iteration.
3358 return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
3359 };
3360
3361 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3362 InstWidening WideningDecision = getWideningDecision(I, VF);
3363 assert(WideningDecision != CM_Unknown &&
3364 "Widening decision should be ready at this moment");
3365
3366 if (IsUniformMemOpUse(I))
3367 return true;
3368
3369 return (WideningDecision == CM_Widen ||
3370 WideningDecision == CM_Widen_Reverse ||
3371 WideningDecision == CM_Interleave);
3372 };
3373
3374 // Returns true if Ptr is the pointer operand of a memory access instruction
3375 // I, I is known to not require scalarization, and the pointer is not also
3376 // stored.
3377 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3378 if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
3379 return false;
3380 return getLoadStorePointerOperand(V: I) == Ptr &&
3381 (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
3382 };
3383
3384 // Holds a list of values which are known to have at least one uniform use.
3385 // Note that there may be other uses which aren't uniform. A "uniform use"
3386 // here is something which only demands lane 0 of the unrolled iterations;
3387 // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform).
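  // For example (illustrative), the address of a uniform memory operation is
  // only demanded for lane 0 even if the loaded value feeds non-uniform
  // arithmetic.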
3389 SetVector<Value *> HasUniformUse;
3390
3391 // Scan the loop for instructions which are either a) known to have only
3392 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3393 for (auto *BB : TheLoop->blocks())
3394 for (auto &I : *BB) {
3395 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
3396 switch (II->getIntrinsicID()) {
3397 case Intrinsic::sideeffect:
3398 case Intrinsic::experimental_noalias_scope_decl:
3399 case Intrinsic::assume:
3400 case Intrinsic::lifetime_start:
3401 case Intrinsic::lifetime_end:
3402 if (TheLoop->hasLoopInvariantOperands(I: &I))
3403 AddToWorklistIfAllowed(&I);
3404 break;
3405 default:
3406 break;
3407 }
3408 }
3409
3410 if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
3411 if (IsOutOfScope(EVI->getAggregateOperand())) {
3412 AddToWorklistIfAllowed(EVI);
3413 continue;
3414 }
3415 // Only ExtractValue instructions where the aggregate value comes from a
3416 // call are allowed to be non-uniform.
3417 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3418 "Expected aggregate value to be call return value");
3419 }
3420
3421 // If there's no pointer operand, there's nothing to do.
3422 auto *Ptr = getLoadStorePointerOperand(V: &I);
3423 if (!Ptr)
3424 continue;
3425
3426 if (IsUniformMemOpUse(&I))
3427 AddToWorklistIfAllowed(&I);
3428
3429 if (IsVectorizedMemAccessUse(&I, Ptr))
3430 HasUniformUse.insert(X: Ptr);
3431 }
3432
3433 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3434 // demanding) users. Since loops are assumed to be in LCSSA form, this
3435 // disallows uses outside the loop as well.
3436 for (auto *V : HasUniformUse) {
3437 if (IsOutOfScope(V))
3438 continue;
3439 auto *I = cast<Instruction>(Val: V);
3440 bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
3441 auto *UI = cast<Instruction>(Val: U);
3442 return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
3443 });
3444 if (UsersAreMemAccesses)
3445 AddToWorklistIfAllowed(I);
3446 }
3447
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
3451 unsigned Idx = 0;
3452 while (Idx != Worklist.size()) {
3453 Instruction *I = Worklist[Idx++];
3454
3455 for (auto *OV : I->operand_values()) {
      // Out-of-scope operands cannot be uniform instructions.
3457 if (IsOutOfScope(OV))
3458 continue;
      // Fixed-order recurrence PHIs should typically be considered
      // non-uniform.
3461 auto *OP = dyn_cast<PHINode>(Val: OV);
3462 if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
3463 continue;
3464 // If all the users of the operand are uniform, then add the
3465 // operand into the uniform worklist.
3466 auto *OI = cast<Instruction>(Val: OV);
3467 if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
3468 auto *J = cast<Instruction>(Val: U);
3469 return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
3470 }))
3471 AddToWorklistIfAllowed(OI);
3472 }
3473 }
3474
3475 // For an instruction to be added into Worklist above, all its users inside
3476 // the loop should also be in Worklist. However, this condition cannot be
3477 // true for phi nodes that form a cyclic dependence. We must process phi
3478 // nodes separately. An induction variable will remain uniform if all users
3479 // of the induction variable and induction variable update remain uniform.
3480 // The code below handles both pointer and non-pointer induction variables.
3481 BasicBlock *Latch = TheLoop->getLoopLatch();
3482 for (const auto &Induction : Legal->getInductionVars()) {
3483 auto *Ind = Induction.first;
3484 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
3485
3486 // Determine if all users of the induction variable are uniform after
3487 // vectorization.
3488 bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
3489 auto *I = cast<Instruction>(Val: U);
3490 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3491 IsVectorizedMemAccessUse(I, Ind);
3492 });
3493 if (!UniformInd)
3494 continue;
3495
3496 // Determine if all users of the induction variable update instruction are
3497 // uniform after vectorization.
3498 bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
3499 auto *I = cast<Instruction>(Val: U);
3500 return I == Ind || Worklist.count(key: I) ||
3501 IsVectorizedMemAccessUse(I, IndUpdate);
3502 });
3503 if (!UniformIndUpdate)
3504 continue;
3505
3506 // The induction variable and its update instruction will remain uniform.
3507 AddToWorklistIfAllowed(Ind);
3508 AddToWorklistIfAllowed(IndUpdate);
3509 }
3510
3511 Uniforms[VF].insert_range(R&: Worklist);
3512}
3513
3514bool LoopVectorizationCostModel::runtimeChecksRequired() {
3515 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3516
3517 if (Legal->getRuntimePointerChecking()->Need) {
3518 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3519 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3520 "loop with '#pragma clang loop vectorize(enable)' when "
3521 "compiling with -Os/-Oz",
3522 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3523 return true;
3524 }
3525
3526 if (!PSE.getPredicate().isAlwaysTrue()) {
3527 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3528 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3529 "loop with '#pragma clang loop vectorize(enable)' when "
3530 "compiling with -Os/-Oz",
3531 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3532 return true;
3533 }
3534
3535 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3536 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3537 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3538 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3539 "this loop without such check by compiling with -Os/-Oz",
3540 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3541 return true;
3542 }
3543
3544 return false;
3545}
3546
3547bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3548 if (IsScalableVectorizationAllowed)
3549 return *IsScalableVectorizationAllowed;
3550
3551 IsScalableVectorizationAllowed = false;
3552 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3553 return false;
3554
3555 if (Hints->isScalableVectorizationDisabled()) {
3556 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
3557 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
3558 return false;
3559 }
3560
3561 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3562
3563 auto MaxScalableVF = ElementCount::getScalable(
3564 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3565
3566 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3567 // FIXME: While for scalable vectors this is currently sufficient, this should
3568 // be replaced by a more detailed mechanism that filters out specific VFs,
3569 // instead of invalidating vectorization for a whole set of VFs based on the
3570 // MaxVF.
3571
3572 // Disable scalable vectorization if the loop contains unsupported reductions.
3573 if (!canVectorizeReductions(VF: MaxScalableVF)) {
3574 reportVectorizationInfo(
3575 Msg: "Scalable vectorization not supported for the reduction "
3576 "operations found in this loop.",
3577 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3578 return false;
3579 }
3580
3581 // Disable scalable vectorization if the loop contains any instructions
3582 // with element types not supported for scalable vectors.
3583 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
3584 return !Ty->isVoidTy() &&
3585 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3586 })) {
3587 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
3588 "for all element types found in this loop.",
3589 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3590 return false;
3591 }
3592
3593 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
3594 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
3595 "for safe distance analysis.",
3596 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3597 return false;
3598 }
3599
3600 IsScalableVectorizationAllowed = true;
3601 return true;
3602}
3603
3604ElementCount
3605LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3606 if (!isScalableVectorizationAllowed())
3607 return ElementCount::getScalable(MinVal: 0);
3608
3609 auto MaxScalableVF = ElementCount::getScalable(
3610 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3611 if (Legal->isSafeForAnyVectorWidth())
3612 return MaxScalableVF;
3613
3614 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3615 // Limit MaxScalableVF by the maximum safe dependence distance.
3616 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
3617
3618 if (!MaxScalableVF)
3619 reportVectorizationInfo(
3620 Msg: "Max legal vector width too small, scalable vectorization "
3621 "unfeasible.",
3622 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3623
3624 return MaxScalableVF;
3625}
3626
3627FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3628 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3629 MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
3630 unsigned SmallestType, WidestType;
3631 std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();
3632
3633 // Get the maximum safe dependence distance in bits computed by LAA.
3634 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3635 // the memory accesses that is most restrictive (involved in the smallest
3636 // dependence distance).
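  // For example (illustrative), a maximum safe width of 512 bits with a widest
  // type of 32 bits permits at most 16 elements per vector iteration;
  // bit_floor below keeps that bound a power of 2.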
3637 unsigned MaxSafeElementsPowerOf2 =
3638 bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
3639 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3640 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3641 MaxSafeElementsPowerOf2 =
3642 std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
3643 }
3644 auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
3645 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);
3646
3647 if (!Legal->isSafeForAnyVectorWidth())
3648 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3649
3650 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3651 << ".\n");
3652 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3653 << ".\n");
3654
3655 // First analyze the UserVF, fall back if the UserVF should be ignored.
3656 if (UserVF) {
3657 auto MaxSafeUserVF =
3658 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3659
3660 if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
3661 // If `VF=vscale x N` is safe, then so is `VF=N`
3662 if (UserVF.isScalable())
3663 return FixedScalableVFPair(
3664 ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
3665
3666 return UserVF;
3667 }
3668
3669 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3670
3671 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3672 // is better to ignore the hint and let the compiler choose a suitable VF.
3673 if (!UserVF.isScalable()) {
3674 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3675 << " is unsafe, clamping to max safe VF="
3676 << MaxSafeFixedVF << ".\n");
3677 ORE->emit(RemarkBuilder: [&]() {
3678 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3679 TheLoop->getStartLoc(),
3680 TheLoop->getHeader())
3681 << "User-specified vectorization factor "
3682 << ore::NV("UserVectorizationFactor", UserVF)
3683 << " is unsafe, clamping to maximum safe vectorization factor "
3684 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3685 });
3686 return MaxSafeFixedVF;
3687 }
3688
3689 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3690 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3691 << " is ignored because scalable vectors are not "
3692 "available.\n");
3693 ORE->emit(RemarkBuilder: [&]() {
3694 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3695 TheLoop->getStartLoc(),
3696 TheLoop->getHeader())
3697 << "User-specified vectorization factor "
3698 << ore::NV("UserVectorizationFactor", UserVF)
3699 << " is ignored because the target does not support scalable "
3700 "vectors. The compiler will pick a more suitable value.";
3701 });
3702 } else {
3703 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3704 << " is unsafe. Ignoring scalable UserVF.\n");
3705 ORE->emit(RemarkBuilder: [&]() {
3706 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3707 TheLoop->getStartLoc(),
3708 TheLoop->getHeader())
3709 << "User-specified vectorization factor "
3710 << ore::NV("UserVectorizationFactor", UserVF)
3711 << " is unsafe. Ignoring the hint to let the compiler pick a "
3712 "more suitable value.";
3713 });
3714 }
3715 }
3716
3717 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3718 << " / " << WidestType << " bits.\n");
3719
3720 FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
3721 ElementCount::getScalable(MinVal: 0));
3722 if (auto MaxVF =
3723 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3724 MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking))
3725 Result.FixedVF = MaxVF;
3726
3727 if (auto MaxVF =
3728 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3729 MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking))
3730 if (MaxVF.isScalable()) {
3731 Result.ScalableVF = MaxVF;
3732 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3733 << "\n");
3734 }
3735
3736 return Result;
3737}
3738
3739FixedScalableVFPair
3740LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3741 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may still be useful to vectorize, since the runtime checks are
    // likely to be dynamically uniform if the target can skip them.
3744 reportVectorizationFailure(
3745 DebugMsg: "Not inserting runtime ptr check for divergent target",
3746 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3747 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3748 return FixedScalableVFPair::getNone();
3749 }
3750
3751 ScalarEvolution *SE = PSE.getSE();
3752 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3753 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3754 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3755 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3756 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3757 if (TC.isScalar()) {
3758 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3759 OREMsg: "loop trip count is one, irrelevant for vectorization",
3760 ORETag: "SingleIterationLoop", ORE, TheLoop);
3761 return FixedScalableVFPair::getNone();
3762 }
3763
  // If the backedge-taken count (BTC) is at least as wide as the widest
  // induction type and equals -1, then computing the trip count as BTC + 1
  // wraps to 0 and the vector trip count will be 0. Do not try to vectorize.
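  // For example (illustrative), with an i8 widest induction type a BTC of 255
  // (i.e. -1) implies a trip count of 256, which wraps to 0 in 8-bit
  // arithmetic.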
3767 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3768 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3769 BTC->getType()->getScalarSizeInBits() >=
3770 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3771 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3772 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3773 reportVectorizationFailure(
3774 DebugMsg: "Trip count computation wrapped",
3775 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3776 ORETag: "TripCountWrapped", ORE, TheLoop);
3777 return FixedScalableVFPair::getNone();
3778 }
3779
3780 switch (ScalarEpilogueStatus) {
3781 case CM_ScalarEpilogueAllowed:
3782 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false);
3783 case CM_ScalarEpilogueNotAllowedUsePredicate:
3784 [[fallthrough]];
3785 case CM_ScalarEpilogueNotNeededUsePredicate:
3786 LLVM_DEBUG(
3787 dbgs() << "LV: vector predicate hint/switch found.\n"
3788 << "LV: Not allowing scalar epilogue, creating predicated "
3789 << "vector loop.\n");
3790 break;
3791 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3792 // fallthrough as a special case of OptForSize
3793 case CM_ScalarEpilogueNotAllowedOptSize:
3794 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3795 LLVM_DEBUG(
3796 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3797 else
3798 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3799 << "count.\n");
3800
3801 // Bail if runtime checks are required, which are not good when optimising
3802 // for size.
3803 if (runtimeChecksRequired())
3804 return FixedScalableVFPair::getNone();
3805
3806 break;
3807 }
3808
3809 // Now try the tail folding
3810
3811 // Invalidate interleave groups that require an epilogue if we can't mask
3812 // the interleave-group.
3813 if (!useMaskedInterleavedAccesses(TTI)) {
3814 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3815 "No decisions should have been taken at this point");
3816 // Note: There is no need to invalidate any cost modeling decisions here, as
3817 // none were taken so far.
3818 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3819 }
3820
3821 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true);
3822
3823 // Avoid tail folding if the trip count is known to be a multiple of any VF
3824 // we choose.
3825 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3826 MaxFactors.FixedVF.getFixedValue();
3827 if (MaxFactors.ScalableVF) {
3828 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3829 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3830 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3831 a: *MaxPowerOf2RuntimeVF,
3832 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3833 } else
3834 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3835 }
3836
3837 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3838 // Return false if the loop is neither a single-latch-exit loop nor an
3839 // early-exit loop as tail-folding is not supported in that case.
3840 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3841 !Legal->hasUncountableEarlyExit())
3842 return false;
3843 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3844 ScalarEvolution *SE = PSE.getSE();
3845 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3846 // with uncountable exits. For countable loops, the symbolic maximum must
3847 // remain identical to the known back-edge taken count.
3848 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3849 assert((Legal->hasUncountableEarlyExit() ||
3850 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3851 "Invalid loop count");
3852 const SCEV *ExitCount = SE->getAddExpr(
3853 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3854 const SCEV *Rem = SE->getURemExpr(
3855 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3856 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3857 return Rem->isZero();
3858 };
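  // For example (illustrative), a known trip count of 128 with MaxVF = 8 and
  // UserIC = 2 divides evenly (128 % 16 == 0), so no tail remains and neither
  // masking nor a scalar epilogue is needed.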
3859
3860 if (MaxPowerOf2RuntimeVF > 0u) {
3861 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3862 "MaxFixedVF must be a power of 2");
3863 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3864 // Accept MaxFixedVF if we do not have a tail.
3865 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3866 return MaxFactors;
3867 }
3868 }
3869
3870 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3871 if (ExpectedTC && ExpectedTC->isFixed() &&
3872 ExpectedTC->getFixedValue() <=
3873 TTI.getMinTripCountTailFoldingThreshold()) {
3874 if (MaxPowerOf2RuntimeVF > 0u) {
      // If we have a low trip count, and the fixed-width VF is known to divide
3876 // the trip count but the scalable factor does not, use the fixed-width
3877 // factor in preference to allow the generation of a non-predicated loop.
3878 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3879 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3880 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3881 "remain for any chosen VF.\n");
3882 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3883 return MaxFactors;
3884 }
3885 }
3886
3887 reportVectorizationFailure(
      "The trip count is below the minimal threshold value.",
3889 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3890 ORE, TheLoop);
3891 return FixedScalableVFPair::getNone();
3892 }
3893
3894 // If we don't know the precise trip count, or if the trip count that we
3895 // found modulo the vectorization factor is not zero, try to fold the tail
3896 // by masking.
3897 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3898 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3899 setTailFoldingStyles(IsScalableVF: ContainsScalableVF, UserIC);
3900 if (foldTailByMasking()) {
3901 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
3902 LLVM_DEBUG(
3903 dbgs()
3904 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3905 "try to generate VP Intrinsics with scalable vector "
3906 "factors only.\n");
3907 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3908 // for now.
3909 // TODO: extend it for fixed vectors, if required.
3910 assert(ContainsScalableVF && "Expected scalable vector factor.");
3911
3912 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3913 }
3914 return MaxFactors;
3915 }
3916
3917 // If there was a tail-folding hint/switch, but we can't fold the tail by
3918 // masking, fall back to a vectorization with a scalar epilogue.
3919 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3920 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3921 "scalar epilogue instead.\n");
3922 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3923 return MaxFactors;
3924 }
3925
3926 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3927 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3928 return FixedScalableVFPair::getNone();
3929 }
3930
3931 if (TC.isZero()) {
3932 reportVectorizationFailure(
3933 DebugMsg: "unable to calculate the loop count due to complex control flow",
3934 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3935 return FixedScalableVFPair::getNone();
3936 }
3937
3938 reportVectorizationFailure(
3939 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3940 OREMsg: "cannot optimize for size and vectorize at the same time. "
3941 "Enable vectorization of this loop with '#pragma clang loop "
3942 "vectorize(enable)' when compiling with -Os/-Oz",
3943 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3944 return FixedScalableVFPair::getNone();
3945}
3946
3947bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
3948 return useMaxBandwidth(RegKind: VF.isScalable()
3949 ? TargetTransformInfo::RGK_ScalableVector
3950 : TargetTransformInfo::RGK_FixedWidthVector);
3951}
3952
3953bool LoopVectorizationCostModel::useMaxBandwidth(
3954 TargetTransformInfo::RegisterKind RegKind) {
3955 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3956 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3957 (UseWiderVFIfCallVariantsPresent &&
3958 Legal->hasVectorCallVariants())));
3959}
3960
3961ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3962 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3963 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3964 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3965 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3966 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3967 : TargetTransformInfo::RGK_FixedWidthVector);
3968
3969 // Convenience function to return the minimum of two ElementCounts.
3970 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3971 assert((LHS.isScalable() == RHS.isScalable()) &&
3972 "Scalable flags must match");
3973 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3974 };
3975
3976 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3977 // Note that both WidestRegister and WidestType may not be powers of 2.
3978 auto MaxVectorElementCount = ElementCount::get(
3979 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
3980 Scalable: ComputeScalableMaxVF);
3981 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3982 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3983 << (MaxVectorElementCount * WidestType) << " bits.\n");
3984
3985 if (!MaxVectorElementCount) {
3986 LLVM_DEBUG(dbgs() << "LV: The target has no "
3987 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3988 << " vector registers.\n");
3989 return ElementCount::getFixed(MinVal: 1);
3990 }
3991
3992 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
3993 if (MaxVectorElementCount.isScalable() &&
3994 TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
3995 auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
3996 auto Min = Attr.getVScaleRangeMin();
3997 WidestRegisterMinEC *= Min;
3998 }
3999
4000 // When a scalar epilogue is required, at least one iteration of the scalar
4001 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4002 // max VF that results in a dead vector loop.
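// For example, if MaxTripCount is 8 and a scalar epilogue is required, the
// clamp below could otherwise pick VF = 8, leaving no full vector iteration
// once one iteration is reserved for the epilogue; treating the bound as 7
// limits the choice to VF = 4.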
4003 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
4004 MaxTripCount -= 1;
4005
4006 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4007 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
4008 // If an upper bound on the loop trip count (TC) is known at compile time,
4009 // there is no point in choosing a VF greater than TC (which the logic
4010 // below might otherwise do). Select the maximum power of two which doesn't
4011 // exceed TC. If MaxVectorElementCount is scalable, we only fall back on a
4012 // fixed VF when the TC is less than or equal to the known number of lanes.
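// Illustrative example: with MaxTripCount = 6 and no tail folding,
// bit_floor(6) = 4, so a fixed VF of 4 is returned rather than a wider VF
// that could never complete a full vector iteration.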
4013 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount);
4014 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4015 "exceeding the constant trip count: "
4016 << ClampedUpperTripCount << "\n");
4017 return ElementCount::get(
4018 MinVal: ClampedUpperTripCount,
4019 Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4020 }
4021
4022 TargetTransformInfo::RegisterKind RegKind =
4023 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4024 : TargetTransformInfo::RGK_FixedWidthVector;
4025 ElementCount MaxVF = MaxVectorElementCount;
4026 if (useMaxBandwidth(RegKind)) {
4027 auto MaxVectorElementCountMaxBW = ElementCount::get(
4028 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
4029 Scalable: ComputeScalableMaxVF);
4030 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4031
4032 if (ElementCount MinVF =
4033 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
4034 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
4035 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4036 << ") with target's minimum: " << MinVF << '\n');
4037 MaxVF = MinVF;
4038 }
4039 }
4040
4041 // Invalidate any widening decisions we might have made, in case the loop
4042 // requires predication (decided later), but we have already made some
4043 // load/store widening decisions.
4044 invalidateCostModelingDecisions();
4045 }
4046 return MaxVF;
4047}
4048
4049bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4050 const VectorizationFactor &B,
4051 const unsigned MaxTripCount,
4052 bool HasTail) const {
4053 InstructionCost CostA = A.Cost;
4054 InstructionCost CostB = B.Cost;
4055
4056 // Improve estimate for the vector width if it is scalable.
4057 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4058 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4059 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
4060 if (A.Width.isScalable())
4061 EstimatedWidthA *= *VScale;
4062 if (B.Width.isScalable())
4063 EstimatedWidthB *= *VScale;
4064 }
4065
4066 // When optimizing for size choose whichever is smallest, which will be the
4067 // one with the smallest cost for the whole loop. On a tie pick the larger
4068 // vector width, on the assumption that throughput will be greater.
4069 if (CM.CostKind == TTI::TCK_CodeSize)
4070 return CostA < CostB ||
4071 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
4072
4073 // Assume vscale may be larger than 1 (or the value being tuned for),
4074 // so that scalable vectorization is slightly favorable over fixed-width
4075 // vectorization.
4076 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4077 A.Width.isScalable() && !B.Width.isScalable();
4078
4079 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4080 const InstructionCost &RHS) {
4081 return PreferScalable ? LHS <= RHS : LHS < RHS;
4082 };
4083
4084 // To avoid the need for FP division:
4085 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4086 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
4087 if (!MaxTripCount)
4088 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4089
4090 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
4091 InstructionCost VectorCost,
4092 InstructionCost ScalarCost) {
4093 // If the trip count is a known (possibly small) constant, the trip count
4094 // will be rounded up to an integer number of iterations under
4095 // FoldTailByMasking. The total cost in that case will be
4096 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4097 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4098 // some extra overheads, but for the purpose of comparing the costs of
4099 // different VFs we can use this to compare the total loop-body cost
4100 // expected after vectorization.
4101 if (HasTail)
4102 return VectorCost * (MaxTripCount / VF) +
4103 ScalarCost * (MaxTripCount % VF);
4104 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
4105 };
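// Worked example with hypothetical costs (not taken from any target): for
// MaxTripCount = 10 with a scalar tail, a VF-4 plan with vector cost 8 and
// scalar cost 3 totals 8 * (10 / 4) + 3 * (10 % 4) = 22, while a VF-8 plan
// with vector cost 14 totals 14 * (10 / 8) + 3 * (10 % 8) = 20, so the wider
// plan wins despite its higher per-iteration cost. When folding the tail,
// the same plans cost 8 * ceil(10 / 4) = 24 and 14 * ceil(10 / 8) = 28.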
4106
4107 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4108 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4109 return CmpFn(RTCostA, RTCostB);
4110}
4111
4112bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4113 const VectorizationFactor &B,
4114 bool HasTail) const {
4115 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4116 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
4117 HasTail);
4118}
4119
4120void LoopVectorizationPlanner::emitInvalidCostRemarks(
4121 OptimizationRemarkEmitter *ORE) {
4122 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4123 SmallVector<RecipeVFPair> InvalidCosts;
4124 for (const auto &Plan : VPlans) {
4125 for (ElementCount VF : Plan->vectorFactors()) {
4126 // The VPlan-based cost model is designed for computing vector costs.
4127 // Querying the VPlan-based cost model with a scalar VF will cause
4128 // errors, because we expect the VF to be a vector VF for most of the
4129 // widening recipes.
4130 if (VF.isScalar())
4131 continue;
4132
4133 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4134 CM, CM.CostKind);
4135 precomputeCosts(Plan&: *Plan, VF, CostCtx);
4136 auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
4137 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
4138 for (auto &R : *VPBB) {
4139 if (!R.cost(VF, Ctx&: CostCtx).isValid())
4140 InvalidCosts.emplace_back(Args: &R, Args&: VF);
4141 }
4142 }
4143 }
4144 }
4145 if (InvalidCosts.empty())
4146 return;
4147
4148 // Emit a report of VFs with invalid costs in the loop.
4149
4150 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4151 DenseMap<VPRecipeBase *, unsigned> Numbering;
4152 unsigned I = 0;
4153 for (auto &Pair : InvalidCosts)
4154 if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
4155 ++I;
4156
4157 // Sort the list, first on recipe(number) then on VF.
4158 sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4159 unsigned NA = Numbering[A.first];
4160 unsigned NB = Numbering[B.first];
4161 if (NA != NB)
4162 return NA < NB;
4163 return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
4164 });
4165
4166 // For a list of ordered recipe-VF pairs:
4167 // [(load, VF1), (load, VF2), (store, VF1)]
4168 // group the recipes together to emit separate remarks for:
4169 // load (VF1, VF2)
4170 // store (VF1)
4171 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4172 auto Subset = ArrayRef<RecipeVFPair>();
4173 do {
4174 if (Subset.empty())
4175 Subset = Tail.take_front(N: 1);
4176
4177 VPRecipeBase *R = Subset.front().first;
4178
4179 unsigned Opcode =
4180 TypeSwitch<const VPRecipeBase *, unsigned>(R)
4181 .Case<VPHeaderPHIRecipe>(
4182 caseFn: [](const auto *R) { return Instruction::PHI; })
4183 .Case<VPWidenSelectRecipe>(
4184 caseFn: [](const auto *R) { return Instruction::Select; })
4185 .Case<VPWidenStoreRecipe>(
4186 caseFn: [](const auto *R) { return Instruction::Store; })
4187 .Case<VPWidenLoadRecipe>(
4188 caseFn: [](const auto *R) { return Instruction::Load; })
4189 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4190 caseFn: [](const auto *R) { return Instruction::Call; })
4191 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4192 VPWidenCastRecipe>(
4193 caseFn: [](const auto *R) { return R->getOpcode(); })
4194 .Case<VPInterleaveRecipe>(caseFn: [](const VPInterleaveRecipe *R) {
4195 return R->getStoredValues().empty() ? Instruction::Load
4196 : Instruction::Store;
4197 });
4198
4199 // If the next recipe is different, or if there are no other pairs,
4200 // emit a remark for the collated subset. e.g.
4201 // [(load, VF1), (load, VF2)]
4202 // to emit:
4203 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4204 if (Subset == Tail || Tail[Subset.size()].first != R) {
4205 std::string OutString;
4206 raw_string_ostream OS(OutString);
4207 assert(!Subset.empty() && "Unexpected empty range");
4208 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4209 for (const auto &Pair : Subset)
4210 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4211 OS << "):";
4212 if (Opcode == Instruction::Call) {
4213 StringRef Name = "";
4214 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
4215 Name = Int->getIntrinsicName();
4216 } else {
4217 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
4218 Function *CalledFn =
4219 WidenCall ? WidenCall->getCalledScalarFunction()
4220 : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
4221 ->getLiveInIRValue());
4222 Name = CalledFn->getName();
4223 }
4224 OS << " call to " << Name;
4225 } else
4226 OS << " " << Instruction::getOpcodeName(Opcode);
4227 reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
4228 DL: R->getDebugLoc());
4229 Tail = Tail.drop_front(N: Subset.size());
4230 Subset = {};
4231 } else
4232 // Grow the subset by one element
4233 Subset = Tail.take_front(N: Subset.size() + 1);
4234 } while (!Tail.empty());
4235}
4236
4237/// Check if any recipe of \p Plan will generate a vector value, which will be
4238/// assigned a vector register.
4239static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4240 const TargetTransformInfo &TTI) {
4241 assert(VF.isVector() && "Checking a scalar VF?");
4242 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4243 DenseSet<VPRecipeBase *> EphemeralRecipes;
4244 collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
4245 // Set of already visited types.
4246 DenseSet<Type *> Visited;
4247 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4248 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
4249 for (VPRecipeBase &R : *VPBB) {
4250 if (EphemeralRecipes.contains(V: &R))
4251 continue;
4252 // Continue early if the recipe is considered not to produce a vector
4253 // result. Note that this includes VPInstruction, where some opcodes may
4254 // produce a vector; this preserves existing behavior, as VPInstructions
4255 // model aspects not directly mapped to existing IR instructions.
4256 switch (R.getVPDefID()) {
4257 case VPDef::VPDerivedIVSC:
4258 case VPDef::VPScalarIVStepsSC:
4259 case VPDef::VPReplicateSC:
4260 case VPDef::VPInstructionSC:
4261 case VPDef::VPCanonicalIVPHISC:
4262 case VPDef::VPVectorPointerSC:
4263 case VPDef::VPVectorEndPointerSC:
4264 case VPDef::VPExpandSCEVSC:
4265 case VPDef::VPEVLBasedIVPHISC:
4266 case VPDef::VPPredInstPHISC:
4267 case VPDef::VPBranchOnMaskSC:
4268 continue;
4269 case VPDef::VPReductionSC:
4270 case VPDef::VPActiveLaneMaskPHISC:
4271 case VPDef::VPWidenCallSC:
4272 case VPDef::VPWidenCanonicalIVSC:
4273 case VPDef::VPWidenCastSC:
4274 case VPDef::VPWidenGEPSC:
4275 case VPDef::VPWidenIntrinsicSC:
4276 case VPDef::VPWidenSC:
4277 case VPDef::VPWidenSelectSC:
4278 case VPDef::VPBlendSC:
4279 case VPDef::VPFirstOrderRecurrencePHISC:
4280 case VPDef::VPHistogramSC:
4281 case VPDef::VPWidenPHISC:
4282 case VPDef::VPWidenIntOrFpInductionSC:
4283 case VPDef::VPWidenPointerInductionSC:
4284 case VPDef::VPReductionPHISC:
4285 case VPDef::VPInterleaveSC:
4286 case VPDef::VPWidenLoadEVLSC:
4287 case VPDef::VPWidenLoadSC:
4288 case VPDef::VPWidenStoreEVLSC:
4289 case VPDef::VPWidenStoreSC:
4290 break;
4291 default:
4292 llvm_unreachable("unhandled recipe");
4293 }
4294
4295 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4296 unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
4297 if (!NumLegalParts)
4298 return false;
4299 if (VF.isScalable()) {
4300 // <vscale x 1 x iN> is assumed to be profitable over iN because
4301 // scalable registers are a distinct register class from scalar
4302 // ones. If we ever find a target which wants to lower scalable
4303 // vectors back to scalars, we'll need to update this code to
4304 // explicitly ask TTI about the register class uses for each part.
4305 return NumLegalParts <= VF.getKnownMinValue();
4306 }
4307 // Two or more elements sharing a register are vectorized.
4308 return NumLegalParts < VF.getFixedValue();
4309 };
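// For example, on a target with 128-bit vector registers, an <8 x i32>
// value legalizes into 2 parts; 2 < 8, so the recipe is counted as
// generating vector code. If legalization produced one part per element
// (NumLegalParts == VF), the value would effectively be scalarized and is
// not counted.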
4310
4311 // If the recipe has no defs and is not a store (e.g., a branch), continue - no value to check.
4312 if (R.getNumDefinedValues() == 0 &&
4313 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4314 Val: &R))
4315 continue;
4316 // For multi-def recipes (currently only interleaved loads), it suffices
4317 // to check the first def only.
4318 // For stores, check their stored value; for interleaved stores, it
4319 // suffices to check the first stored value only. In all cases this is
4320 // the second operand.
4321 VPValue *ToCheck =
4322 R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
4323 Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
4324 if (!Visited.insert(V: {ScalarTy}).second)
4325 continue;
4326 Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
4327 if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
4328 return true;
4329 }
4330 }
4331
4332 return false;
4333}
4334
4335static bool hasReplicatorRegion(VPlan &Plan) {
4336 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4337 G: Plan.getVectorLoopRegion()->getEntry())),
4338 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4339}
4340
4341#ifndef NDEBUG
4342VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4343 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4344 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4345 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4346 assert(
4347 any_of(VPlans,
4348 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4349 "Expected Scalar VF to be a candidate");
4350
4351 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4352 ExpectedCost);
4353 VectorizationFactor ChosenFactor = ScalarCost;
4354
4355 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4356 if (ForceVectorization &&
4357 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4358 // Ignore scalar width, because the user explicitly wants vectorization.
4359 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4360 // evaluation.
4361 ChosenFactor.Cost = InstructionCost::getMax();
4362 }
4363
4364 for (auto &P : VPlans) {
4365 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4366 P->vectorFactors().end());
4367
4368 SmallVector<VPRegisterUsage, 8> RUs;
4369 if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
4370 CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
4371 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4372
4373 for (unsigned I = 0; I < VFs.size(); I++) {
4374 ElementCount VF = VFs[I];
4375 // The cost for scalar VF=1 is already calculated, so ignore it.
4376 if (VF.isScalar())
4377 continue;
4378
4379 // Don't consider the VF if it exceeds the number of registers for the
4380 // target.
4381 if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
4382 continue;
4383
4384 InstructionCost C = CM.expectedCost(VF);
4385
4386 // Add on other costs that are modelled in VPlan, but not in the legacy
4387 // cost model.
4388 VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
4389 CM, CM.CostKind);
4390 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4391 assert(VectorRegion && "Expected to have a vector region!");
4392 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4393 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4394 for (VPRecipeBase &R : *VPBB) {
4395 auto *VPI = dyn_cast<VPInstruction>(&R);
4396 if (!VPI)
4397 continue;
4398 switch (VPI->getOpcode()) {
4399 case VPInstruction::ActiveLaneMask:
4400 case VPInstruction::ExplicitVectorLength:
4401 C += VPI->cost(VF, CostCtx);
4402 break;
4403 default:
4404 break;
4405 }
4406 }
4407 }
4408
4409 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4410 unsigned Width =
4411 getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4412 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4413 << " costs: " << (Candidate.Cost / Width));
4414 if (VF.isScalable())
4415 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4416 << CM.getVScaleForTuning().value_or(1) << ")");
4417 LLVM_DEBUG(dbgs() << ".\n");
4418
4419 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4420 LLVM_DEBUG(
4421 dbgs()
4422 << "LV: Not considering vector loop of width " << VF
4423 << " because it will not generate any vector instructions.\n");
4424 continue;
4425 }
4426
4427 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4428 LLVM_DEBUG(
4429 dbgs()
4430 << "LV: Not considering vector loop of width " << VF
4431 << " because it would cause replicated blocks to be generated,"
4432 << " which isn't allowed when optimizing for size.\n");
4433 continue;
4434 }
4435
4436 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4437 ChosenFactor = Candidate;
4438 }
4439 }
4440
4441 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4442 reportVectorizationFailure(
4443 "There are conditional stores.",
4444 "store that is conditionally executed prevents vectorization",
4445 "ConditionalStore", ORE, OrigLoop);
4446 ChosenFactor = ScalarCost;
4447 }
4448
4449 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4450 !isMoreProfitable(ChosenFactor, ScalarCost,
4451 !CM.foldTailByMasking())) dbgs()
4452 << "LV: Vectorization seems to be not beneficial, "
4453 << "but was forced by a user.\n");
4454 return ChosenFactor;
4455}
4456#endif
4457
4458bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4459 ElementCount VF) const {
4460 // Cross iteration phis such as reductions need special handling and are
4461 // currently unsupported.
4462 if (any_of(Range: OrigLoop->getHeader()->phis(),
4463 P: [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(Phi: &Phi); }))
4464 return false;
4465
4466 // Phis with uses outside of the loop require special handling and are
4467 // currently unsupported.
4468 for (const auto &Entry : Legal->getInductionVars()) {
4469 // Look for uses of the value of the induction at the last iteration.
4470 Value *PostInc =
4471 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
4472 for (User *U : PostInc->users())
4473 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4474 return false;
4475 // Look for uses of penultimate value of the induction.
4476 for (User *U : Entry.first->users())
4477 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4478 return false;
4479 }
4480
4481 // Epilogue vectorization code has not been audited to ensure it handles
4482 // non-latch exits properly. It may be fine, but it needs to be audited
4483 // and tested.
4484 // TODO: Add support for loops with an early exit.
4485 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4486 return false;
4487
4488 return true;
4489}
4490
4491bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4492 const ElementCount VF, const unsigned IC) const {
4493 // FIXME: We need a much better cost-model to take different parameters such
4494 // as register pressure, code size increase and cost of extra branches into
4495 // account. For now we apply a very crude heuristic and only consider loops
4496 // with vectorization factors larger than a certain value.
4497
4498 // Allow the target to opt out entirely.
4499 if (!TTI.preferEpilogueVectorization())
4500 return false;
4501
4502 // We also consider epilogue vectorization unprofitable for targets that don't
4503 // consider interleaving beneficial (e.g., MVE).
4504 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4505 return false;
4506
4507 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4508 // VFs when deciding profitability.
4509 // See related "TODO: extend to support scalable VFs." in
4510 // selectEpilogueVectorizationFactor.
4511 unsigned Multiplier = VF.isFixed() ? IC : 1;
4512 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4513 ? EpilogueVectorizationMinVF
4514 : TTI.getEpilogueVectorizationMinVF();
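// Illustrative numbers only (the threshold is target-dependent): with a
// fixed main-loop VF of 8 and IC = 2, the estimated width is 16, which
// passes a hypothetical threshold of 16; a scalable VF of vscale x 4 with a
// tuning vscale of 2 estimates to 8 and would not.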
4515 return getEstimatedRuntimeVF(VF: VF * Multiplier, VScale: VScaleForTuning) >=
4516 MinVFThreshold;
4517}
4518
4519VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4520 const ElementCount MainLoopVF, unsigned IC) {
4521 VectorizationFactor Result = VectorizationFactor::Disabled();
4522 if (!EnableEpilogueVectorization) {
4523 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4524 return Result;
4525 }
4526
4527 if (!CM.isScalarEpilogueAllowed()) {
4528 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4529 "epilogue is allowed.\n");
4530 return Result;
4531 }
4532
4533 // Not really a cost consideration, but check for unsupported cases here to
4534 // simplify the logic.
4535 if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
4536 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4537 "is not a supported candidate.\n");
4538 return Result;
4539 }
4540
4541 if (EpilogueVectorizationForceVF > 1) {
4542 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4543 ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
4544 if (hasPlanWithVF(VF: ForcedEC))
4545 return {ForcedEC, 0, 0};
4546
4547 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4548 "viable.\n");
4549 return Result;
4550 }
4551
4552 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4553 LLVM_DEBUG(
4554 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4555 return Result;
4556 }
4557
4558 if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
4559 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4560 "this loop\n");
4561 return Result;
4562 }
4563
4564 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4565 // the main loop handles 8 lanes per iteration. We could still benefit from
4566 // vectorizing the epilogue loop with VF=4.
4567 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4568 MinVal: getEstimatedRuntimeVF(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));
4569
4570 ScalarEvolution &SE = *PSE.getSE();
4571 Type *TCType = Legal->getWidestInductionType();
4572 const SCEV *RemainingIterations = nullptr;
4573 unsigned MaxTripCount = 0;
4574 for (auto &NextVF : ProfitableVFs) {
4575 // Skip candidate VFs without a corresponding VPlan.
4576 if (!hasPlanWithVF(VF: NextVF.Width))
4577 continue;
4578
4579 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4580 // vectors) or > the VF of the main loop (fixed vectors).
4581 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4582 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
4583 (NextVF.Width.isScalable() &&
4584 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) ||
4585 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4586 ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF)))
4587 continue;
4588
4589 // If NextVF is greater than the number of remaining iterations, the
4590 // epilogue loop would be dead. Skip such factors.
4591 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4592 // TODO: extend to support scalable VFs.
4593 if (!RemainingIterations) {
4594 const SCEV *TC = vputils::getSCEVExprForVPValue(
4595 V: getPlanFor(VF: NextVF.Width).getTripCount(), SE);
4596 assert(!isa<SCEVCouldNotCompute>(TC) &&
4597 "Trip count SCEV must be computable");
4598 RemainingIterations = SE.getURemExpr(
4599 LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getFixedValue() * IC));
4600 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4601 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
4602 RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
4603 MaxTripCount =
4604 SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
4605 }
4606 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4607 << MaxTripCount << "\n");
4608 }
4609 if (SE.isKnownPredicate(
4610 Pred: CmpInst::ICMP_UGT,
4611 LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getFixedValue()),
4612 RHS: RemainingIterations))
4613 continue;
4614 }
4615
4616 if (Result.Width.isScalar() ||
4617 isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking()))
4618 Result = NextVF;
4619 }
4620
4621 if (Result != VectorizationFactor::Disabled())
4622 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4623 << Result.Width << "\n");
4624 return Result;
4625}
4626
4627std::pair<unsigned, unsigned>
4628LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4629 unsigned MinWidth = -1U;
4630 unsigned MaxWidth = 8;
4631 const DataLayout &DL = TheFunction->getDataLayout();
4632 // For in-loop reductions, no element types are added to ElementTypesInLoop
4633 // if there are no loads/stores in the loop. In this case, check through the
4634 // reduction variables to determine the maximum width.
4635 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4636 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4637 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4638 // When finding the min width used by the recurrence we need to account
4639 // for casts on the input operands of the recurrence.
4640 MinWidth = std::min(
4641 a: MinWidth,
4642 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4643 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4644 MaxWidth = std::max(a: MaxWidth,
4645 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4646 }
4647 } else {
4648 for (Type *T : ElementTypesInLoop) {
4649 MinWidth = std::min<unsigned>(
4650 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4651 MaxWidth = std::max<unsigned>(
4652 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4653 }
4654 }
4655 return {MinWidth, MaxWidth};
4656}
4657
4658void LoopVectorizationCostModel::collectElementTypesForWidening() {
4659 ElementTypesInLoop.clear();
4660 // For each block.
4661 for (BasicBlock *BB : TheLoop->blocks()) {
4662 // For each instruction in the loop.
4663 for (Instruction &I : BB->instructionsWithoutDebug()) {
4664 Type *T = I.getType();
4665
4666 // Skip ignored values.
4667 if (ValuesToIgnore.count(Ptr: &I))
4668 continue;
4669
4670 // Only examine Loads, Stores and PHINodes.
4671 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4672 continue;
4673
4674 // Examine PHI nodes that are reduction variables. Update the type to
4675 // account for the recurrence type.
4676 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4677 if (!Legal->isReductionVariable(PN))
4678 continue;
4679 const RecurrenceDescriptor &RdxDesc =
4680 Legal->getReductionVars().find(Key: PN)->second;
4681 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4682 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4683 Ty: RdxDesc.getRecurrenceType()))
4684 continue;
4685 T = RdxDesc.getRecurrenceType();
4686 }
4687
4688 // Examine the stored values.
4689 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4690 T = ST->getValueOperand()->getType();
4691
4692 assert(T->isSized() &&
4693 "Expected the load/store/recurrence type to be sized");
4694
4695 ElementTypesInLoop.insert(Ptr: T);
4696 }
4697 }
4698}
4699
4700unsigned
4701LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4702 InstructionCost LoopCost) {
4703 // -- The interleave heuristics --
4704 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4705 // There are many micro-architectural considerations that we can't predict
4706 // at this level. For example, frontend pressure (on decode or fetch) due to
4707 // code size, or the number and capabilities of the execution ports.
4708 //
4709 // We use the following heuristics to select the interleave count:
4710 // 1. If the code has reductions, then we interleave to break the cross
4711 // iteration dependency.
4712 // 2. If the loop is really small, then we interleave to reduce the loop
4713 // overhead.
4714 // 3. We don't interleave if we think that we will spill registers to memory
4715 // due to the increased register pressure.
4716
4717 if (!isScalarEpilogueAllowed())
4718 return 1;
4719
4720 // Do not interleave if EVL is preferred and no User IC is specified.
4721 if (foldTailWithEVL()) {
4722 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4723 "Unroll factor forced to be 1.\n");
4724 return 1;
4725 }
4726
4727 // The max safe dependence distance already limits the vector width; do not interleave further.
4728 if (!Legal->isSafeForAnyVectorWidth())
4729 return 1;
4730
4731 // We don't attempt to perform interleaving for loops with uncountable early
4732 // exits because the VPInstruction::AnyOf code cannot currently handle
4733 // multiple parts.
4734 if (Legal->hasUncountableEarlyExit())
4735 return 1;
4736
4737 const bool HasReductions = !Legal->getReductionVars().empty();
4738
4739 // If we did not calculate the cost for VF (because the user selected the VF)
4740 // then we calculate the cost of VF here.
4741 if (LoopCost == 0) {
4742 LoopCost = expectedCost(VF);
4743 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4744
4745 // Loop body is free and there is no need for interleaving.
4746 if (LoopCost == 0)
4747 return 1;
4748 }
4749
4750 VPRegisterUsage R =
4751 calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore)[0];
4752 // We divide by these values below, so assume that we have at least one
4753 // instruction that uses at least one register.
4754 for (auto &Pair : R.MaxLocalUsers) {
4755 Pair.second = std::max(a: Pair.second, b: 1U);
4756 }
4757
4758 // We calculate the interleave count using the following formula.
4759 // Subtract the number of loop invariants from the number of available
4760 // registers. These registers are used by all of the interleaved instances.
4761 // Next, divide the remaining registers by the number of registers that is
4762 // required by the loop, in order to estimate how many parallel instances
4763 // fit without causing spills. All of this is rounded down if necessary to be
4764 // a power of two. We want power of two interleave count to simplify any
4765 // addressing operations or alignment considerations.
4766 // We also want power of two interleave counts to ensure that the induction
4767 // variable of the vector loop wraps to zero, when tail is folded by masking;
4768 // this currently happens when OptForSize, in which case IC is set to 1 above.
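// Worked example with made-up numbers: 32 available vector registers, 2
// loop-invariant values, and a maximum local usage of 6 registers gives
// bit_floor((32 - 2) / 6) = 4 interleaved instances; with the
// induction-variable heuristic below it is bit_floor((32 - 2 - 1) / 5) = 4.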
4769 unsigned IC = UINT_MAX;
4770
4771 for (const auto &Pair : R.MaxLocalUsers) {
4772 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
4773 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4774 << " registers of "
4775 << TTI.getRegisterClassName(Pair.first)
4776 << " register class\n");
4777 if (VF.isScalar()) {
4778 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4779 TargetNumRegisters = ForceTargetNumScalarRegs;
4780 } else {
4781 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4782 TargetNumRegisters = ForceTargetNumVectorRegs;
4783 }
4784 unsigned MaxLocalUsers = Pair.second;
4785 unsigned LoopInvariantRegs = 0;
4786 if (R.LoopInvariantRegs.contains(Key: Pair.first))
4787 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4788
4789 unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
4790 MaxLocalUsers);
4791 // Don't count the induction variable as interleaved.
4792 if (EnableIndVarRegisterHeur) {
4793 TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
4794 std::max(a: 1U, b: (MaxLocalUsers - 1)));
4795 }
4796
4797 IC = std::min(a: IC, b: TmpIC);
4798 }
4799
4800 // Clamp the interleave ranges to reasonable counts.
4801 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4802
4803 // Check if the user has overridden the max.
4804 if (VF.isScalar()) {
4805 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4806 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4807 } else {
4808 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4809 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4810 }
4811
4812 unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScale: VScaleForTuning);
4813
4814 // Try to get the exact trip count, or an estimate based on profiling data or
4815 // ConstantMax from PSE, failing that.
4816 if (auto BestKnownTC = getSmallBestKnownTC(PSE, L: TheLoop)) {
4817 // At least one iteration must be scalar when this constraint holds. So the
4818 // maximum available iterations for interleaving is one less.
4819 unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector())
4820 ? BestKnownTC->getFixedValue() - 1
4821 : BestKnownTC->getFixedValue();
4822
4823 unsigned InterleaveCountLB = bit_floor(Value: std::max(
4824 a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));
4825
4826 if (getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop).isNonZero()) {
4827 // If the best known trip count is exact, we select between two
4828 // prospective ICs, where
4829 //
4830 // 1) the aggressive IC is capped by the trip count divided by VF
4831 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4832 //
4833 // The final IC is selected in a way that the epilogue loop trip count is
4834 // minimized while maximizing the IC itself, so that we either run the
4835 // vector loop at least once if it generates a small epilogue loop, or
4836 // else we run the vector loop at least twice.
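// Illustrative numbers: with AvailableTC = 64, EstimatedVF = 8 and a target
// maximum of 8, the conservative IC is bit_floor(min(64 / 16, 8)) = 4 and
// the aggressive IC is bit_floor(min(64 / 8, 8)) = 8; both leave a scalar
// tail of 0, so the aggressive value 8 is used. With AvailableTC = 48 the
// bounds are 4 (aggressive) and 2 (conservative), with tails 16 and 0
// respectively, so the conservative bound 2 is kept.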
4837
4838 unsigned InterleaveCountUB = bit_floor(Value: std::max(
4839 a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
4840 MaxInterleaveCount = InterleaveCountLB;
4841
4842 if (InterleaveCountUB != InterleaveCountLB) {
4843 unsigned TailTripCountUB =
4844 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4845 unsigned TailTripCountLB =
4846 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4847 // If both produce the same scalar tail, maximize the IC to do the same
4848 // work in fewer vector loop iterations.
4849 if (TailTripCountUB == TailTripCountLB)
4850 MaxInterleaveCount = InterleaveCountUB;
4851 }
4852 } else {
4853 // If the trip count is only an estimated compile-time constant, cap the
4854 // IC at the trip count divided by (VF * 2), such that the vector loop
4855 // runs at least twice to make interleaving seem profitable when there is
4856 // an epilogue loop present. Since the exact trip count is not known, we
4857 // choose the conservative IC estimate.
4858 MaxInterleaveCount = InterleaveCountLB;
4859 }
4860 }
4861
4862 assert(MaxInterleaveCount > 0 &&
4863 "Maximum interleave count must be greater than 0");
4864
4865 // Clamp the calculated IC to be between 1 and the max interleave count
4866 // that the target and trip count allow.
4867 if (IC > MaxInterleaveCount)
4868 IC = MaxInterleaveCount;
4869 else
4870 // Make sure IC is greater than 0.
4871 IC = std::max(a: 1u, b: IC);
4872
4873 assert(IC > 0 && "Interleave count must be greater than 0.");
4874
4875 // Interleave if we vectorized this loop and there is a reduction that could
4876 // benefit from interleaving.
4877 if (VF.isVector() && HasReductions) {
4878 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4879 return IC;
4880 }
4881
4882 // For any scalar loop that either requires runtime checks or predication we
4883 // are better off leaving this to the unroller. Note that if we've already
4884 // vectorized the loop we will have done the runtime check and so interleaving
4885 // won't require further checks.
4886 bool ScalarInterleavingRequiresPredication =
4887 (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) {
4888 return Legal->blockNeedsPredication(BB);
4889 }));
4890 bool ScalarInterleavingRequiresRuntimePointerCheck =
4891 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4892
4893 // We want to interleave small loops in order to reduce the loop overhead and
4894 // potentially expose ILP opportunities.
4895 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4896 << "LV: IC is " << IC << '\n'
4897 << "LV: VF is " << VF << '\n');
4898 const bool AggressivelyInterleaveReductions =
4899 TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
4900 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4901 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4902 // We assume that the cost overhead is 1 and we use the cost model
4903 // to estimate the cost of the loop and interleave until the cost of the
4904 // loop overhead is about 5% of the cost of the loop.
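// For example, if the small-loop cost threshold is, say, 20 and the loop
// body costs 4, then SmallIC = min(IC, bit_floor(20 / 4)) = min(IC, 4).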
4905 unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
4906 Value: SmallLoopCost / LoopCost.getValue()));
4907
4908 // Interleave until store/load ports (estimated by max interleave count) are
4909 // saturated.
4910 unsigned NumStores = Legal->getNumStores();
4911 unsigned NumLoads = Legal->getNumLoads();
4912 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4913 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
4914
4915 // There is little point in interleaving for reductions containing selects
4916 // and compares when VF=1 since it may just create more overhead than it's
4917 // worth for loops with small trip counts. This is because we still have to
4918 // do the final reduction after the loop.
4919 bool HasSelectCmpReductions =
4920 HasReductions &&
4921 any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
4922 const RecurrenceDescriptor &RdxDesc = Reduction.second;
4923 RecurKind RK = RdxDesc.getRecurrenceKind();
4924 return RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) ||
4925 RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK);
4926 });
4927 if (HasSelectCmpReductions) {
4928 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4929 return 1;
4930 }
4931
4932 // If we have a scalar reduction (vector reductions are already dealt with
4933 // by this point), we can increase the critical path length if the loop
4934 // we're interleaving is inside another loop. For tree-wise reductions
4935 // set the limit to 2, and for ordered reductions it's best to disable
4936 // interleaving entirely.
4937 if (HasReductions && TheLoop->getLoopDepth() > 1) {
4938 bool HasOrderedReductions =
4939 any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
4940 const RecurrenceDescriptor &RdxDesc = Reduction.second;
4941 return RdxDesc.isOrdered();
4942 });
4943 if (HasOrderedReductions) {
4944 LLVM_DEBUG(
4945 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4946 return 1;
4947 }
4948
4949 unsigned F = MaxNestedScalarReductionIC;
4950 SmallIC = std::min(a: SmallIC, b: F);
4951 StoresIC = std::min(a: StoresIC, b: F);
4952 LoadsIC = std::min(a: LoadsIC, b: F);
4953 }
4954
4955 if (EnableLoadStoreRuntimeInterleave &&
4956 std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
4957 LLVM_DEBUG(
4958 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4959 return std::max(a: StoresIC, b: LoadsIC);
4960 }
4961
4962 // If there are scalar reductions and TTI has enabled aggressive
4963 // interleaving for reductions, we will interleave to expose ILP.
4964 if (VF.isScalar() && AggressivelyInterleaveReductions) {
4965 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4966 // Interleave no less than SmallIC but not as aggressively as the normal
4967 // IC, to account for the rare situation when resources are too limited.
4968 return std::max(a: IC / 2, b: SmallIC);
4969 }
4970
4971 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4972 return SmallIC;
4973 }
4974
4975 // Interleave if this is a large loop (small loops are already dealt with by
4976 // this point) that could benefit from interleaving.
4977 if (AggressivelyInterleaveReductions) {
4978 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4979 return IC;
4980 }
4981
4982 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4983 return 1;
4984}
4985
4986bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4987 ElementCount VF) {
4988 // TODO: Cost model for emulated masked load/store is completely
4989 // broken. This hack guides the cost model to use an artificially
4990 // high enough value to practically disable vectorization with such
4991 // operations, except where previously deployed legality hack allowed
4992 // using very low cost values. This is to avoid regressions coming simply
4993 // from moving "masked load/store" check from legality to cost model.
4994 // Masked load/gather emulation was previously never allowed.
4995 // Only a limited amount of masked store/scatter emulation was allowed.
4996 assert((isPredicatedInst(I)) &&
4997 "Expecting a scalar emulated instruction");
4998 return isa<LoadInst>(Val: I) ||
4999 (isa<StoreInst>(Val: I) &&
5000 NumPredStores > NumberOfStoresToPredicate);
5001}
5002
5003void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5004 assert(VF.isVector() && "Expected VF >= 2");
5005
5006 // If we've already collected the instructions to scalarize or the predicated
5007 // BBs after vectorization, there's nothing to do. Collection may already have
5008 // occurred if we have a user-selected VF and are now computing the expected
5009 // cost for interleaving.
5010 if (InstsToScalarize.contains(Val: VF) ||
5011 PredicatedBBsAfterVectorization.contains(Val: VF))
5012 return;
5013
5014 // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5015 // not profitable to scalarize any instructions, the presence of VF in the
5016 // map will indicate that we've analyzed it already.
5017 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5018
5019 // Find all the instructions that are scalar with predication in the loop and
5020 // determine if it would be better to not if-convert the blocks they are in.
5021 // If so, we also record the instructions to scalarize.
5022 for (BasicBlock *BB : TheLoop->blocks()) {
5023 if (!blockNeedsPredicationForAnyReason(BB))
5024 continue;
5025 for (Instruction &I : *BB)
5026 if (isScalarWithPredication(I: &I, VF)) {
5027 ScalarCostsTy ScalarCosts;
5028 // Do not apply discount logic for:
5029 // 1. Scalars after vectorization, as there will only be a single copy
5030 // of the instruction.
5031 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5032 // 3. Emulated masked memrefs, if a hacked cost is needed.
5033 if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
5034 !useEmulatedMaskMemRefHack(I: &I, VF) &&
5035 computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
5036 ScalarCostsVF.insert_range(R&: ScalarCosts);
5037 // Check if we decided to scalarize a call. If so, update the widening
5038 // decision of the call to CM_Scalarize with the computed scalar cost.
5039 for (const auto &[I, Cost] : ScalarCosts) {
5040 auto *CI = dyn_cast<CallInst>(Val: I);
5041 if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
5042 continue;
5043 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5044 CallWideningDecisions[{CI, VF}].Cost = Cost;
5045 }
5046 }
5047 // Remember that BB will remain after vectorization.
5048 PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
5049 for (auto *Pred : predecessors(BB)) {
5050 if (Pred->getSingleSuccessor() == BB)
5051 PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
5052 }
5053 }
5054 }
5055}
5056
5057InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5058 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5059 assert(!isUniformAfterVectorization(PredInst, VF) &&
5060 "Instruction marked uniform-after-vectorization will be predicated");
5061
5062 // Initialize the discount to zero, meaning that the scalar version and the
5063 // vector version cost the same.
5064 InstructionCost Discount = 0;
5065
5066 // Holds instructions to analyze. The instructions we visit are mapped in
5067 // ScalarCosts. Those instructions are the ones that would be scalarized if
5068 // we find that the scalar version costs less.
5069 SmallVector<Instruction *, 8> Worklist;
5070
5071 // Returns true if the given instruction can be scalarized.
5072 auto CanBeScalarized = [&](Instruction *I) -> bool {
5073 // We only attempt to scalarize instructions forming a single-use chain
5074 // from the original predicated block that would otherwise be vectorized.
5075 // Although not strictly necessary, we give up on instructions we know will
5076 // already be scalar to avoid traversing chains that are unlikely to be
5077 // beneficial.
5078 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5079 isScalarAfterVectorization(I, VF))
5080 return false;
5081
5082 // If the instruction is scalar with predication, it will be analyzed
5083 // separately. We ignore it within the context of PredInst.
5084 if (isScalarWithPredication(I, VF))
5085 return false;
5086
5087 // If any of the instruction's operands are uniform after vectorization,
5088 // the instruction cannot be scalarized. This prevents, for example, a
5089 // masked load from being scalarized.
5090 //
5091 // We assume we will only emit a value for lane zero of an instruction
5092 // marked uniform after vectorization, rather than VF identical values.
5093 // Thus, if we scalarize an instruction that uses a uniform, we would
5094 // create uses of values corresponding to the lanes we aren't emitting code
5095 // for. This behavior can be changed by allowing getScalarValue to clone
5096 // the lane zero values for uniforms rather than asserting.
5097 for (Use &U : I->operands())
5098 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
5099 if (isUniformAfterVectorization(I: J, VF))
5100 return false;
5101
5102 // Otherwise, we can scalarize the instruction.
5103 return true;
5104 };
5105
5106 // Compute the expected cost discount from scalarizing the entire expression
5107 // feeding the predicated instruction. We currently only consider expressions
5108 // that are single-use instruction chains.
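// Hypothetical example: if PredInst is a predicated udiv %q = udiv %m, %d,
// where %m = mul %a, %b and %a = add %x, 1 are single-use instructions in
// the same predicated block, the worklist walks udiv -> mul -> add and sums
// their individual discounts; operands that cannot be scalarized are instead
// charged extract overhead below.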
5109 Worklist.push_back(Elt: PredInst);
5110 while (!Worklist.empty()) {
5111 Instruction *I = Worklist.pop_back_val();
5112
5113 // If we've already analyzed the instruction, there's nothing to do.
5114 if (ScalarCosts.contains(Val: I))
5115 continue;
5116
5117 // Cannot scalarize fixed-order recurrence phis at the moment.
5118 if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
5119 continue;
5120
5121 // Compute the cost of the vector instruction. Note that this cost already
5122 // includes the scalarization overhead of the predicated instruction.
5123 InstructionCost VectorCost = getInstructionCost(I, VF);
5124
5125 // Compute the cost of the scalarized instruction. This cost is the cost of
5126 // the instruction as if it wasn't if-converted and instead remained in the
5127 // predicated block. We will scale this cost by block probability after
5128 // computing the scalarization overhead.
5129 InstructionCost ScalarCost =
5130 VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));
5131
5132 // Compute the scalarization overhead of needed insertelement instructions
5133 // and phi nodes.
5134 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5135 Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5136 for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
5137 ScalarCost += TTI.getScalarizationOverhead(
5138 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5139 /*Insert=*/true,
5140 /*Extract=*/false, CostKind);
5141 }
5142 ScalarCost +=
5143 VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
5144 }
5145
5146 // Compute the scalarization overhead of needed extractelement
5147 // instructions. For each of the instruction's operands, if the operand can
5148 // be scalarized, add it to the worklist; otherwise, account for the
5149 // overhead.
5150 for (Use &U : I->operands())
5151 if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
5152 assert(canVectorizeTy(J->getType()) &&
5153 "Instruction has non-scalar type");
5154 if (CanBeScalarized(J))
5155 Worklist.push_back(Elt: J);
5156 else if (needsExtract(V: J, VF)) {
5157 Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
5158 for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
5159 ScalarCost += TTI.getScalarizationOverhead(
5160 Ty: cast<VectorType>(Val: VectorTy),
5161 DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
5162 /*Extract*/ true, CostKind);
5163 }
5164 }
5165 }
5166
5167 // Scale the total scalar cost by block probability.
5168 ScalarCost /= getPredBlockCostDivisor(CostKind);
5169
5170 // Compute the discount. A non-negative discount means the vector version
5171 // of the instruction costs more, and scalarizing would be beneficial.
5172 Discount += VectorCost - ScalarCost;
5173 ScalarCosts[I] = ScalarCost;
5174 }
5175
5176 return Discount;
5177}
5178
5179InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5180 InstructionCost Cost;
5181
5182 // If the vector loop gets executed exactly once with the given VF, ignore the
5183 // costs of comparison and induction instructions, as they'll get simplified
5184 // away.
5185 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5186 auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
5187 if (TC == VF && !foldTailByMasking())
5188 addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
5189 InstsToIgnore&: ValuesToIgnoreForVF);
5190
5191 // For each block.
5192 for (BasicBlock *BB : TheLoop->blocks()) {
5193 InstructionCost BlockCost;
5194
5195 // For each instruction in the old loop.
5196 for (Instruction &I : BB->instructionsWithoutDebug()) {
5197 // Skip ignored values.
5198 if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
5199 (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
5200 continue;
5201
5202 InstructionCost C = getInstructionCost(I: &I, VF);
5203
5204 // Check if we should override the cost.
5205 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5206 C = InstructionCost(ForceTargetInstructionCost);
5207
5208 BlockCost += C;
5209 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5210 << VF << " For instruction: " << I << '\n');
5211 }
5212
5213 // If we are vectorizing a predicated block, it will have been
5214 // if-converted. This means that the block's instructions (aside from
5215 // stores and instructions that may divide by zero) will now be
5216 // unconditionally executed. For the scalar case, we may not always execute
5217 // the predicated block, if it is an if-else block. Thus, scale the block's
5218 // cost by the probability of executing it. blockNeedsPredication from
5219 // Legal is used so as to not include all blocks in tail folded loops.
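// For example, a predicated block whose instructions sum to a scalar cost of
// 12 is divided by the predicated-block cost divisor (conventionally 2,
// i.e., assuming the block executes on roughly half of the iterations),
// contributing 6 to the loop cost.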
5220 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5221 BlockCost /= getPredBlockCostDivisor(CostKind);
5222
5223 Cost += BlockCost;
5224 }
5225
5226 return Cost;
5227}
5228
5229 /// Gets the address access SCEV after verifying that the access pattern
5230 /// is loop invariant except for the induction variable dependence.
5231///
5232/// This SCEV can be sent to the Target in order to estimate the address
5233/// calculation cost.
5234static const SCEV *getAddressAccessSCEV(
5235 Value *Ptr,
5236 LoopVectorizationLegality *Legal,
5237 PredicatedScalarEvolution &PSE,
5238 const Loop *TheLoop) {
5239
5240 auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr);
5241 if (!Gep)
5242 return nullptr;
5243
5244 // We are looking for a gep with all loop invariant indices except for one
5245 // which should be an induction variable.
5246 auto *SE = PSE.getSE();
5247 unsigned NumOperands = Gep->getNumOperands();
5248 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5249 Value *Opd = Gep->getOperand(i_nocapture: Idx);
5250 if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) &&
5251 !Legal->isInductionVariable(V: Opd))
5252 return nullptr;
5253 }
5254
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5256 return PSE.getSCEV(V: Ptr);
5257}
5258
5259InstructionCost
5260LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5261 ElementCount VF) {
5262 assert(VF.isVector() &&
5263 "Scalarization cost of instruction implies vectorization.");
5264 if (VF.isScalable())
5265 return InstructionCost::getInvalid();
5266
5267 Type *ValTy = getLoadStoreType(I);
5268 auto *SE = PSE.getSE();
5269
5270 unsigned AS = getLoadStoreAddressSpace(I);
5271 Value *Ptr = getLoadStorePointerOperand(V: I);
5272 Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
5273 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5274 // that it is being called from this specific place.
5275
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5278 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5279
5280 // Get the cost of the scalar memory instruction and address computation.
5281 InstructionCost Cost =
5282 VF.getFixedValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV);
5283
5284 // Don't pass *I here, since it is scalar but will actually be part of a
5285 // vectorized loop where the user of it is a vectorized instruction.
5286 const Align Alignment = getLoadStoreAlignment(I);
5287 Cost += VF.getFixedValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(),
5288 Src: ValTy->getScalarType(),
5289 Alignment, AddressSpace: AS, CostKind);
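  // At this point Cost models VF scalar address computations plus VF scalar
  // loads/stores; the packing/unpacking overhead for the surrounding vector
  // code is added next.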
5290
5291 // Get the overhead of the extractelement and insertelement instructions
5292 // we might create due to scalarization.
5293 Cost += getScalarizationOverhead(I, VF);
5294
5295 // If we have a predicated load/store, it will need extra i1 extracts and
5296 // conditional branches, but may not be executed for each vector lane. Scale
5297 // the cost by the probability of executing the predicated block.
5298 if (isPredicatedInst(I)) {
5299 Cost /= getPredBlockCostDivisor(CostKind);
5300
5301 // Add the cost of an i1 extract and a branch
5302 auto *VecI1Ty =
5303 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
5304 Cost += TTI.getScalarizationOverhead(
5305 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5306 /*Insert=*/false, /*Extract=*/true, CostKind);
5307 Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
5308
5309 if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially set the cost to a value high enough to practically
      // disable vectorization with such operations.
5312 Cost = 3000000;
5313 }
5314
5315 return Cost;
5316}
5317
5318InstructionCost
5319LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5320 ElementCount VF) {
5321 Type *ValTy = getLoadStoreType(I);
5322 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5323 Value *Ptr = getLoadStorePointerOperand(V: I);
5324 unsigned AS = getLoadStoreAddressSpace(I);
5325 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5326
5327 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5328 "Stride should be 1 or -1 for consecutive memory access");
5329 const Align Alignment = getLoadStoreAlignment(I);
5330 InstructionCost Cost = 0;
5331 if (Legal->isMaskRequired(I)) {
5332 Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5333 CostKind);
5334 } else {
5335 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5336 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5337 CostKind, OpdInfo: OpInfo, I);
5338 }
5339
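  // A consecutive stride of -1 means the access runs backwards through
  // memory, so account for the reverse shuffle needed to restore the
  // original element order.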
5340 bool Reverse = ConsecutiveStride < 0;
5341 if (Reverse)
5342 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5343 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5344 return Cost;
5345}
5346
5347InstructionCost
5348LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5349 ElementCount VF) {
5350 assert(Legal->isUniformMemOp(*I, VF));
5351
5352 Type *ValTy = getLoadStoreType(I);
5353 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5354 const Align Alignment = getLoadStoreAlignment(I);
5355 unsigned AS = getLoadStoreAddressSpace(I);
5356 if (isa<LoadInst>(Val: I)) {
5357 return TTI.getAddressComputationCost(Ty: ValTy) +
5358 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5359 CostKind) +
5360 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5361 SrcTy: VectorTy, Mask: {}, CostKind);
5362 }
5363 StoreInst *SI = cast<StoreInst>(Val: I);
5364
5365 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5366 // TODO: We have existing tests that request the cost of extracting element
5367 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5368 // the actual generated code, which involves extracting the last element of
5369 // a scalable vector where the lane to extract is unknown at compile time.
5370 return TTI.getAddressComputationCost(Ty: ValTy) +
5371 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS,
5372 CostKind) +
5373 (IsLoopInvariantStoreValue
5374 ? 0
5375 : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy,
5376 CostKind, Index: VF.getKnownMinValue() - 1));
5377}
5378
5379InstructionCost
5380LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5381 ElementCount VF) {
5382 Type *ValTy = getLoadStoreType(I);
5383 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5384 const Align Alignment = getLoadStoreAlignment(I);
5385 const Value *Ptr = getLoadStorePointerOperand(V: I);
5386
5387 return TTI.getAddressComputationCost(Ty: VectorTy) +
5388 TTI.getGatherScatterOpCost(Opcode: I->getOpcode(), DataTy: VectorTy, Ptr,
5389 VariableMask: Legal->isMaskRequired(I), Alignment,
5390 CostKind, I);
5391}
5392
5393InstructionCost
5394LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5395 ElementCount VF) {
5396 const auto *Group = getInterleavedAccessGroup(Instr: I);
5397 assert(Group && "Fail to get an interleaved access group.");
5398
5399 Instruction *InsertPos = Group->getInsertPos();
5400 Type *ValTy = getLoadStoreType(I: InsertPos);
5401 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5402 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
5403
5404 unsigned InterleaveFactor = Group->getFactor();
5405 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
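  // E.g., for a group with factor 3 at VF=4 the wide vector type spans
  // 12 elements, covering one full interleaved chunk of the group.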
5406
5407 // Holds the indices of existing members in the interleaved group.
5408 SmallVector<unsigned, 4> Indices;
5409 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5410 if (Group->getMember(Index: IF))
5411 Indices.push_back(Elt: IF);
5412
5413 // Calculate the cost of the whole interleaved group.
5414 bool UseMaskForGaps =
5415 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5416 (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()));
5417 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5418 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
5419 Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I),
5420 UseMaskForGaps);
5421
5422 if (Group->isReverse()) {
5423 // TODO: Add support for reversed masked interleaved access.
5424 assert(!Legal->isMaskRequired(I) &&
5425 "Reverse masked interleaved access not supported.");
5426 Cost += Group->getNumMembers() *
5427 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5428 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5429 }
5430 return Cost;
5431}
5432
5433std::optional<InstructionCost>
5434LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5435 ElementCount VF,
5436 Type *Ty) const {
5437 using namespace llvm::PatternMatch;
5438 // Early exit for no inloop reductions
5439 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
5440 return std::nullopt;
5441 auto *VectorTy = cast<VectorType>(Val: Ty);
5442
5443 // We are looking for a pattern of, and finding the minimal acceptable cost:
5444 // reduce(mul(ext(A), ext(B))) or
5445 // reduce(mul(A, B)) or
5446 // reduce(ext(A)) or
5447 // reduce(A).
5448 // The basic idea is that we walk down the tree to do that, finding the root
5449 // reduction instruction in InLoopReductionImmediateChains. From there we find
5450 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
5455 Instruction *RetI = I;
5456 if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
5457 if (!RetI->hasOneUser())
5458 return std::nullopt;
5459 RetI = RetI->user_back();
5460 }
5461
5462 if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
5463 RetI->user_back()->getOpcode() == Instruction::Add) {
5464 RetI = RetI->user_back();
5465 }
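  // At this point RetI is the candidate root of the reduction pattern: the
  // original instruction, the single user of an extend, or the add fed by a
  // single-use mul.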
5466
5467 // Test if the found instruction is a reduction, and if not return an invalid
5468 // cost specifying the parent to use the original cost modelling.
5469 Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
5470 if (!LastChain)
5471 return std::nullopt;
5472
5473 // Find the reduction this chain is a part of and calculate the basic cost of
5474 // the reduction on its own.
5475 Instruction *ReductionPhi = LastChain;
5476 while (!isa<PHINode>(Val: ReductionPhi))
5477 ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);
5478
5479 const RecurrenceDescriptor &RdxDesc =
5480 Legal->getReductionVars().find(Key: cast<PHINode>(Val: ReductionPhi))->second;
5481
5482 InstructionCost BaseCost;
5483 RecurKind RK = RdxDesc.getRecurrenceKind();
5484 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
5485 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5486 BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
5487 FMF: RdxDesc.getFastMathFlags(), CostKind);
5488 } else {
5489 BaseCost = TTI.getArithmeticReductionCost(
5490 Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
5491 }
5492
5493 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5494 // normal fmul instruction to the cost of the fadd reduction.
5495 if (RK == RecurKind::FMulAdd)
5496 BaseCost +=
5497 TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);
5498
5499 // If we're using ordered reductions then we can just return the base cost
5500 // here, since getArithmeticReductionCost calculates the full ordered
5501 // reduction cost when FP reassociation is not allowed.
5502 if (useOrderedReductions(RdxDesc))
5503 return BaseCost;
5504
5505 // Get the operand that was not the reduction chain and match it to one of the
5506 // patterns, returning the better cost if it is found.
5507 Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
5508 ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
5509 : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));
5510
5511 VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);
5512
5513 Instruction *Op0, *Op1;
5514 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5515 match(V: RedOp,
5516 P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
5517 match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
5518 Op0->getOpcode() == Op1->getOpcode() &&
5519 Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
5520 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
5521 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5522
5523 // Matched reduce.add(ext(mul(ext(A), ext(B)))
5524 // Note that the extend opcodes need to all match, or if A==B they will have
5525 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
5526 // which is equally fine.
5527 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
5528 auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
5529 auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);
5530
5531 InstructionCost ExtCost =
5532 TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
5533 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
5534 InstructionCost MulCost =
5535 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
5536 InstructionCost Ext2Cost =
5537 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
5538 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
5539
5540 InstructionCost RedCost = TTI.getMulAccReductionCost(
5541 IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
5542
5543 if (RedCost.isValid() &&
5544 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5545 return I == RetI ? RedCost : 0;
5546 } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
5547 !TheLoop->isLoopInvariant(V: RedOp)) {
5548 // Matched reduce(ext(A))
5549 bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
5550 auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
5551 InstructionCost RedCost = TTI.getExtendedReductionCost(
5552 Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
5553 FMF: RdxDesc.getFastMathFlags(), CostKind);
5554
5555 InstructionCost ExtCost =
5556 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
5557 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
5558 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5559 return I == RetI ? RedCost : 0;
5560 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5561 match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
5562 if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
5563 Op0->getOpcode() == Op1->getOpcode() &&
5564 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
5565 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
5566 Type *Op0Ty = Op0->getOperand(i: 0)->getType();
5567 Type *Op1Ty = Op1->getOperand(i: 0)->getType();
5568 Type *LargestOpTy =
5569 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5570 : Op0Ty;
5571 auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);
5572
      // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may be of
      // different sizes. We take the largest type as the ext to reduce, and
      // add the cost of the remaining ext, for example as in
      // reduce(mul(ext(ext(A)), ext(B))).
5576 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5577 Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
5578 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
5579 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5580 Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
5581 CCH: TTI::CastContextHint::None, CostKind, I: Op1);
5582 InstructionCost MulCost =
5583 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
5584
5585 InstructionCost RedCost = TTI.getMulAccReductionCost(
5586 IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
5587 InstructionCost ExtraExtCost = 0;
5588 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5589 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5590 ExtraExtCost = TTI.getCastInstrCost(
5591 Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
5592 Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
5593 CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
5594 }
5595
5596 if (RedCost.isValid() &&
5597 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5598 return I == RetI ? RedCost : 0;
5599 } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
5600 // Matched reduce.add(mul())
5601 InstructionCost MulCost =
5602 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
5603
5604 InstructionCost RedCost = TTI.getMulAccReductionCost(
5605 IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind);
5606
5607 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5608 return I == RetI ? RedCost : 0;
5609 }
5610 }
5611
5612 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5613}
5614
5615InstructionCost
5616LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5617 ElementCount VF) {
  // Calculate the scalar cost only. The vectorization cost should already be
  // available at this point.
5620 if (VF.isScalar()) {
5621 Type *ValTy = getLoadStoreType(I);
5622 const Align Alignment = getLoadStoreAlignment(I);
5623 unsigned AS = getLoadStoreAddressSpace(I);
5624
5625 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5626 return TTI.getAddressComputationCost(Ty: ValTy) +
5627 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5628 OpdInfo: OpInfo, I);
5629 }
5630 return getWideningCost(I, VF);
5631}
5632
5633InstructionCost
5634LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5635 ElementCount VF) const {
5636
5637 // There is no mechanism yet to create a scalable scalarization loop,
5638 // so this is currently Invalid.
5639 if (VF.isScalable())
5640 return InstructionCost::getInvalid();
5641
5642 if (VF.isScalar())
5643 return 0;
5644
5645 InstructionCost Cost = 0;
5646 Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5647 if (!RetTy->isVoidTy() &&
5648 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5649
5650 for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
5651 Cost += TTI.getScalarizationOverhead(
5652 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5653 /*Insert=*/true,
5654 /*Extract=*/false, CostKind);
5655 }
5656 }
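  // The insert cost above models packing the VF scalar results back into a
  // vector; the cost of extracting the scalar operands is added below, unless
  // the target handles those cases efficiently.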
5657
5658 // Some targets keep addresses scalar.
5659 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
5660 return Cost;
5661
5662 // Some targets support efficient element stores.
5663 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
5664 return Cost;
5665
5666 // Collect operands to consider.
5667 CallInst *CI = dyn_cast<CallInst>(Val: I);
5668 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5669
5670 // Skip operands that do not require extraction/scalarization and do not incur
5671 // any overhead.
5672 SmallVector<Type *> Tys;
5673 for (auto *V : filterExtractingOperands(Ops, VF))
5674 Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));
5675 return Cost + TTI.getOperandsScalarizationOverhead(
5676 Args: filterExtractingOperands(Ops, VF), Tys, CostKind);
5677}
5678
5679void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5680 if (VF.isScalar())
5681 return;
5682 NumPredStores = 0;
5683 for (BasicBlock *BB : TheLoop->blocks()) {
5684 // For each instruction in the old loop.
5685 for (Instruction &I : *BB) {
5686 Value *Ptr = getLoadStorePointerOperand(V: &I);
5687 if (!Ptr)
5688 continue;
5689
5690 // TODO: We should generate better code and update the cost model for
5691 // predicated uniform stores. Today they are treated as any other
5692 // predicated store (see added test cases in
5693 // invariant-store-vectorization.ll).
5694 if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
5695 NumPredStores++;
5696
5697 if (Legal->isUniformMemOp(I, VF)) {
5698 auto IsLegalToScalarize = [&]() {
5699 if (!VF.isScalable())
5700 // Scalarization of fixed length vectors "just works".
5701 return true;
5702
5703 // We have dedicated lowering for unpredicated uniform loads and
5704 // stores. Note that even with tail folding we know that at least
5705 // one lane is active (i.e. generalized predication is not possible
5706 // here), and the logic below depends on this fact.
5707 if (!foldTailByMasking())
5708 return true;
5709
5710 // For scalable vectors, a uniform memop load is always
5711 // uniform-by-parts and we know how to scalarize that.
5712 if (isa<LoadInst>(Val: I))
5713 return true;
5714
        // A uniform store isn't necessarily uniform-by-parts,
        // so we can't assume scalarization.
5717 auto &SI = cast<StoreInst>(Val&: I);
5718 return TheLoop->isLoopInvariant(V: SI.getValueOperand());
5719 };
5720
5721 const InstructionCost GatherScatterCost =
5722 isLegalGatherOrScatter(V: &I, VF) ?
5723 getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();
5724
5725 // Load: Scalar load + broadcast
5726 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant underestimate for tail-folded
        // memory ops.
5729 const InstructionCost ScalarizationCost =
5730 IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
5731 : InstructionCost::getInvalid();
5732
        // Choose the better solution for the current VF. Note that invalid
        // costs compare as maximally large. If both are invalid, we get an
        // invalid cost, which signals a failure and a vectorization abort.
5736 if (GatherScatterCost < ScalarizationCost)
5737 setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
5738 else
5739 setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
5740 continue;
5741 }
5742
5743 // We assume that widening is the best solution when possible.
5744 if (memoryInstructionCanBeWidened(I: &I, VF)) {
5745 InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
5746 int ConsecutiveStride = Legal->isConsecutivePtr(
5747 AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
5748 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5749 "Expected consecutive stride.");
5750 InstWidening Decision =
5751 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5752 setWideningDecision(I: &I, VF, W: Decision, Cost);
5753 continue;
5754 }
5755
5756 // Choose between Interleaving, Gather/Scatter or Scalarization.
5757 InstructionCost InterleaveCost = InstructionCost::getInvalid();
5758 unsigned NumAccesses = 1;
5759 if (isAccessInterleaved(Instr: &I)) {
5760 const auto *Group = getInterleavedAccessGroup(Instr: &I);
5761 assert(Group && "Fail to get an interleaved access group.");
5762
5763 // Make one decision for the whole group.
5764 if (getWideningDecision(I: &I, VF) != CM_Unknown)
5765 continue;
5766
5767 NumAccesses = Group->getNumMembers();
5768 if (interleavedAccessCanBeWidened(I: &I, VF))
5769 InterleaveCost = getInterleaveGroupCost(I: &I, VF);
5770 }
5771
5772 InstructionCost GatherScatterCost =
5773 isLegalGatherOrScatter(V: &I, VF)
5774 ? getGatherScatterCost(I: &I, VF) * NumAccesses
5775 : InstructionCost::getInvalid();
5776
5777 InstructionCost ScalarizationCost =
5778 getMemInstScalarizationCost(I: &I, VF) * NumAccesses;
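      // Gather/scatter and scalarization are costed per group member, while
      // the interleave cost above already covers the whole group.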
5779
      // Choose the better solution for the current VF, record the decision,
      // and use it during vectorization.
5782 InstructionCost Cost;
5783 InstWidening Decision;
5784 if (InterleaveCost <= GatherScatterCost &&
5785 InterleaveCost < ScalarizationCost) {
5786 Decision = CM_Interleave;
5787 Cost = InterleaveCost;
5788 } else if (GatherScatterCost < ScalarizationCost) {
5789 Decision = CM_GatherScatter;
5790 Cost = GatherScatterCost;
5791 } else {
5792 Decision = CM_Scalarize;
5793 Cost = ScalarizationCost;
5794 }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
5798 if (const auto *Group = getInterleavedAccessGroup(Instr: &I))
5799 setWideningDecision(Grp: Group, VF, W: Decision, Cost);
5800 else
5801 setWideningDecision(I: &I, VF, W: Decision, Cost);
5802 }
5803 }
5804
  // Make sure that any load of an address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
5807 // inevitable extracts into address registers, and also has the benefit of
5808 // activating LSR more, since that pass can't optimize vectorized
5809 // addresses.
5810 if (TTI.prefersVectorizedAddressing())
5811 return;
5812
5813 // Start with all scalar pointer uses.
5814 SmallPtrSet<Instruction *, 8> AddrDefs;
5815 for (BasicBlock *BB : TheLoop->blocks())
5816 for (Instruction &I : *BB) {
5817 Instruction *PtrDef =
5818 dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
5819 if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
5820 getWideningDecision(I: &I, VF) != CM_GatherScatter)
5821 AddrDefs.insert(Ptr: PtrDef);
5822 }
5823
5824 // Add all instructions used to generate the addresses.
5825 SmallVector<Instruction *, 4> Worklist;
5826 append_range(C&: Worklist, R&: AddrDefs);
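  // The walk below deliberately stays within each instruction's own basic
  // block and stops at phis, so only the local address computation is forced
  // to remain scalar.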
5827 while (!Worklist.empty()) {
5828 Instruction *I = Worklist.pop_back_val();
5829 for (auto &Op : I->operands())
5830 if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
5831 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) &&
5832 AddrDefs.insert(Ptr: InstOp).second)
5833 Worklist.push_back(Elt: InstOp);
5834 }
5835
5836 for (auto *I : AddrDefs) {
5837 if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
5842 InstWidening Decision = getWideningDecision(I, VF);
5843 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
5844 // Scalarize a widened load of address.
5845 setWideningDecision(
5846 I, VF, W: CM_Scalarize,
5847 Cost: (VF.getKnownMinValue() *
5848 getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
5849 else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
5850 // Scalarize an interleave group of address loads.
5851 for (unsigned I = 0; I < Group->getFactor(); ++I) {
5852 if (Instruction *Member = Group->getMember(Index: I))
5853 setWideningDecision(
5854 I: Member, VF, W: CM_Scalarize,
5855 Cost: (VF.getKnownMinValue() *
5856 getMemoryInstructionCost(I: Member, VF: ElementCount::getFixed(MinVal: 1))));
5857 }
5858 }
5859 } else {
5860 // Cannot scalarize fixed-order recurrence phis at the moment.
5861 if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
5862 continue;
5863
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
5866 ForcedScalars[VF].insert(Ptr: I);
5867 }
5868 }
5869}
5870
5871void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
5872 assert(!VF.isScalar() &&
5873 "Trying to set a vectorization decision for a scalar VF");
5874
5875 auto ForcedScalar = ForcedScalars.find(Val: VF);
5876 for (BasicBlock *BB : TheLoop->blocks()) {
5877 // For each instruction in the old loop.
5878 for (Instruction &I : *BB) {
5879 CallInst *CI = dyn_cast<CallInst>(Val: &I);
5880
5881 if (!CI)
5882 continue;
5883
5884 InstructionCost ScalarCost = InstructionCost::getInvalid();
5885 InstructionCost VectorCost = InstructionCost::getInvalid();
5886 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
5887 Function *ScalarFunc = CI->getCalledFunction();
5888 Type *ScalarRetTy = CI->getType();
5889 SmallVector<Type *, 4> Tys, ScalarTys;
5890 for (auto &ArgOp : CI->args())
5891 ScalarTys.push_back(Elt: ArgOp->getType());
5892
5893 // Estimate cost of scalarized vector call. The source operands are
5894 // assumed to be vectors, so we need to extract individual elements from
5895 // there, execute VF scalar calls, and then gather the result into the
5896 // vector return value.
5897 InstructionCost ScalarCallCost =
5898 TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);
5899
5900 // Compute costs of unpacking argument values for the scalar calls and
5901 // packing the return values to a vector.
5902 InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);
5903
5904 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
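      // I.e. the scalarized form costs VF copies of the scalar call plus the
      // overhead of extracting each argument lane and inserting each result
      // lane.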
5905 // Honor ForcedScalars and UniformAfterVectorization decisions.
5906 // TODO: For calls, it might still be more profitable to widen. Use
5907 // VPlan-based cost model to compare different options.
5908 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5909 ForcedScalar->second.contains(Ptr: CI)) ||
5910 isUniformAfterVectorization(I: CI, VF))) {
5911 setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
5912 IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
5913 Cost: ScalarCost);
5914 continue;
5915 }
5916
5917 bool MaskRequired = Legal->isMaskRequired(I: CI);
5918 // Compute corresponding vector type for return value and arguments.
5919 Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
5920 for (Type *ScalarTy : ScalarTys)
5921 Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));
5922
5923 // An in-loop reduction using an fmuladd intrinsic is a special case;
5924 // we don't want the normal cost for that intrinsic.
5925 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
5926 if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
5927 setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
5928 IID: getVectorIntrinsicIDForCall(CI, TLI),
5929 MaskPos: std::nullopt, Cost: *RedCost);
5930 continue;
5931 }
5932
5933 // Find the cost of vectorizing the call, if we can find a suitable
5934 // vector variant of the function.
5935 VFInfo FuncInfo;
5936 Function *VecFunc = nullptr;
5937 // Search through any available variants for one we can use at this VF.
5938 for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
5939 // Must match requested VF.
5940 if (Info.Shape.VF != VF)
5941 continue;
5942
5943 // Must take a mask argument if one is required
5944 if (MaskRequired && !Info.isMasked())
5945 continue;
5946
5947 // Check that all parameter kinds are supported
5948 bool ParamsOk = true;
5949 for (VFParameter Param : Info.Shape.Parameters) {
5950 switch (Param.ParamKind) {
5951 case VFParamKind::Vector:
5952 break;
5953 case VFParamKind::OMP_Uniform: {
5954 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
5955 // Make sure the scalar parameter in the loop is invariant.
5956 if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
5957 L: TheLoop))
5958 ParamsOk = false;
5959 break;
5960 }
5961 case VFParamKind::OMP_Linear: {
5962 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
5963 // Find the stride for the scalar parameter in this loop and see if
5964 // it matches the stride for the variant.
5965 // TODO: do we need to figure out the cost of an extract to get the
5966 // first lane? Or do we hope that it will be folded away?
5967 ScalarEvolution *SE = PSE.getSE();
5968 const auto *SAR =
5969 dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam));
5970
5971 if (!SAR || SAR->getLoop() != TheLoop) {
5972 ParamsOk = false;
5973 break;
5974 }
5975
5976 const SCEVConstant *Step =
5977 dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE));
5978
5979 if (!Step ||
5980 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
5981 ParamsOk = false;
5982
5983 break;
5984 }
5985 case VFParamKind::GlobalPredicate:
5986 break;
5987 default:
5988 ParamsOk = false;
5989 break;
5990 }
5991 }
5992
5993 if (!ParamsOk)
5994 continue;
5995
5996 // Found a suitable candidate, stop here.
5997 VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
5998 FuncInfo = Info;
5999 break;
6000 }
6001
6002 if (TLI && VecFunc && !CI->isNoBuiltin())
6003 VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);
6004
6005 // Find the cost of an intrinsic; some targets may have instructions that
6006 // perform the operation without needing an actual call.
6007 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6008 if (IID != Intrinsic::not_intrinsic)
6009 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6010
6011 InstructionCost Cost = ScalarCost;
6012 InstWidening Decision = CM_Scalarize;
6013
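      // Prefer the cheapest of scalarization, a vector library call and a
      // target intrinsic; the <= comparisons mean ties are resolved in favor
      // of the vector call and then the intrinsic.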
6014 if (VectorCost <= Cost) {
6015 Cost = VectorCost;
6016 Decision = CM_VectorCall;
6017 }
6018
6019 if (IntrinsicCost <= Cost) {
6020 Cost = IntrinsicCost;
6021 Decision = CM_IntrinsicCall;
6022 }
6023
6024 setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
6025 MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
6026 }
6027 }
6028}
6029
6030bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6031 if (!Legal->isInvariant(V: Op))
6032 return false;
  // Consider Op invariant only if neither it nor its operands are predicated
  // instructions in the loop; otherwise it is not trivially hoistable.
6035 auto *OpI = dyn_cast<Instruction>(Val: Op);
6036 return !OpI || !TheLoop->contains(Inst: OpI) ||
6037 (!isPredicatedInst(I: OpI) &&
6038 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6039 all_of(Range: OpI->operands(),
6040 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6041}
6042
6043InstructionCost
6044LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6045 ElementCount VF) {
6046 // If we know that this instruction will remain uniform, check the cost of
6047 // the scalar version.
6048 if (isUniformAfterVectorization(I, VF))
6049 VF = ElementCount::getFixed(MinVal: 1);
6050
6051 if (VF.isVector() && isProfitableToScalarize(I, VF))
6052 return InstsToScalarize[VF][I];
6053
6054 // Forced scalars do not have any scalarization overhead.
6055 auto ForcedScalar = ForcedScalars.find(Val: VF);
6056 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6057 auto InstSet = ForcedScalar->second;
6058 if (InstSet.count(Ptr: I))
6059 return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
6060 VF.getKnownMinValue();
6061 }
6062
6063 Type *RetTy = I->getType();
6064 if (canTruncateToMinimalBitwidth(I, VF))
6065 RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
6066 auto *SE = PSE.getSE();
6067
6068 Type *VectorTy;
6069 if (isScalarAfterVectorization(I, VF)) {
6070 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
6071 [this](Instruction *I, ElementCount VF) -> bool {
6072 if (VF.isScalar())
6073 return true;
6074
6075 auto Scalarized = InstsToScalarize.find(Val: VF);
6076 assert(Scalarized != InstsToScalarize.end() &&
6077 "VF not yet analyzed for scalarization profitability");
6078 return !Scalarized->second.count(Val: I) &&
6079 llvm::all_of(Range: I->users(), P: [&](User *U) {
6080 auto *UI = cast<Instruction>(Val: U);
6081 return !Scalarized->second.count(Val: UI);
6082 });
6083 };
6084
6085 // With the exception of GEPs and PHIs, after scalarization there should
6086 // only be one copy of the instruction generated in the loop. This is
6087 // because the VF is either 1, or any instructions that need scalarizing
6088 // have already been dealt with by the time we get here. As a result,
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
6091 I->getOpcode() == Instruction::PHI ||
6092 (I->getOpcode() == Instruction::BitCast &&
6093 I->getType()->isPointerTy()) ||
6094 HasSingleCopyAfterVectorization(I, VF));
6095 VectorTy = RetTy;
6096 } else
6097 VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);
6098
6099 if (VF.isVector() && VectorTy->isVectorTy() &&
6100 !TTI.getNumberOfParts(Tp: VectorTy))
6101 return InstructionCost::getInvalid();
6102
6103 // TODO: We need to estimate the cost of intrinsic calls.
6104 switch (I->getOpcode()) {
6105 case Instruction::GetElementPtr:
6106 // We mark this instruction as zero-cost because the cost of GEPs in
6107 // vectorized code depends on whether the corresponding memory instruction
6108 // is scalarized or not. Therefore, we handle GEPs with the memory
6109 // instruction cost.
6110 return 0;
6111 case Instruction::Br: {
6112 // In cases of scalarized and predicated instructions, there will be VF
6113 // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6115 // Note that the conditional branch from the loop latch will be replaced by
6116 // a single branch controlling the loop, so there is no extra overhead from
6117 // scalarization.
6118 bool ScalarPredicatedBB = false;
6119 BranchInst *BI = cast<BranchInst>(Val: I);
6120 if (VF.isVector() && BI->isConditional() &&
6121 (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
6122 PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
6123 BI->getParent() != TheLoop->getLoopLatch())
6124 ScalarPredicatedBB = true;
6125
6126 if (ScalarPredicatedBB) {
      // Not possible to scalarize a scalable vector with predicated
      // instructions.
6128 if (VF.isScalable())
6129 return InstructionCost::getInvalid();
6130 // Return cost for branches around scalarized and predicated blocks.
6131 auto *VecI1Ty =
6132 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
6133 return (
6134 TTI.getScalarizationOverhead(
6135 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
6136 /*Insert*/ false, /*Extract*/ true, CostKind) +
6137 (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
6138 }
6139
6140 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6141 // The back-edge branch will remain, as will all scalar branches.
6142 return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
6143
6144 // This branch will be eliminated by if-conversion.
6145 return 0;
6146 // Note: We currently assume zero cost for an unconditional branch inside
6147 // a predicated block since it will become a fall-through, although we
6148 // may decide in the future to call TTI for all branches.
6149 }
6150 case Instruction::Switch: {
6151 if (VF.isScalar())
6152 return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
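    // For vector VFs the switch is modeled as one vector icmp-eq against each
    // case value; only these compares are costed here.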
6153 auto *Switch = cast<SwitchInst>(Val: I);
6154 return Switch->getNumCases() *
6155 TTI.getCmpSelInstrCost(
6156 Opcode: Instruction::ICmp,
6157 ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
6158 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
6159 VecPred: CmpInst::ICMP_EQ, CostKind);
6160 }
6161 case Instruction::PHI: {
6162 auto *Phi = cast<PHINode>(Val: I);
6163
6164 // First-order recurrences are replaced by vector shuffles inside the loop.
6165 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6166 SmallVector<int> Mask(VF.getKnownMinValue());
6167 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
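      // E.g., for VF=4 the splice mask is <3,4,5,6>, combining the last
      // element of the previous vector value with the first three elements of
      // the current one.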
6168 return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
6169 DstTy: cast<VectorType>(Val: VectorTy),
6170 SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
6171 Index: VF.getKnownMinValue() - 1);
6172 }
6173
6174 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6175 // converted into select instructions. We require N - 1 selects per phi
6176 // node, where N is the number of incoming values.
6177 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6178 Type *ResultTy = Phi->getType();
6179
6180 // All instructions in an Any-of reduction chain are narrowed to bool.
6181 // Check if that is the case for this phi node.
6182 auto *HeaderUser = cast_if_present<PHINode>(
6183 Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
6184 auto *Phi = dyn_cast<PHINode>(Val: U);
6185 if (Phi && Phi->getParent() == TheLoop->getHeader())
6186 return Phi;
6187 return nullptr;
6188 }));
6189 if (HeaderUser) {
6190 auto &ReductionVars = Legal->getReductionVars();
6191 auto Iter = ReductionVars.find(Key: HeaderUser);
6192 if (Iter != ReductionVars.end() &&
6193 RecurrenceDescriptor::isAnyOfRecurrenceKind(
6194 Kind: Iter->second.getRecurrenceKind()))
6195 ResultTy = Type::getInt1Ty(C&: Phi->getContext());
6196 }
6197 return (Phi->getNumIncomingValues() - 1) *
6198 TTI.getCmpSelInstrCost(
6199 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
6200 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
6201 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
6202 }
6203
6204 // When tail folding with EVL, if the phi is part of an out of loop
6205 // reduction then it will be transformed into a wide vp_merge.
6206 if (VF.isVector() && foldTailWithEVL() &&
6207 Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
6208 IntrinsicCostAttributes ICA(
6209 Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
6210 {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
6211 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6212 }
6213
6214 return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
6215 }
6216 case Instruction::UDiv:
6217 case Instruction::SDiv:
6218 case Instruction::URem:
6219 case Instruction::SRem:
6220 if (VF.isVector() && isPredicatedInst(I)) {
6221 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6222 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6223 ScalarCost : SafeDivisorCost;
6224 }
6225 // We've proven all lanes safe to speculate, fall through.
6226 [[fallthrough]];
6227 case Instruction::Add:
6228 case Instruction::Sub: {
6229 auto Info = Legal->getHistogramInfo(I);
6230 if (Info && VF.isVector()) {
6231 const HistogramInfo *HGram = Info.value();
6232 // Assume that a non-constant update value (or a constant != 1) requires
6233 // a multiply, and add that into the cost.
6234 InstructionCost MulCost = TTI::TCC_Free;
6235 ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
6236 if (!RHS || RHS->getZExtValue() != 1)
6237 MulCost =
6238 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
6239
6240 // Find the cost of the histogram operation itself.
6241 Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
6242 Type *ScalarTy = I->getType();
6243 Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
6244 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6245 Type::getVoidTy(C&: I->getContext()),
6246 {PtrTy, ScalarTy, MaskTy});
6247
6248 // Add the costs together with the add/sub operation.
6249 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6250 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
6251 }
6252 [[fallthrough]];
6253 }
6254 case Instruction::FAdd:
6255 case Instruction::FSub:
6256 case Instruction::Mul:
6257 case Instruction::FMul:
6258 case Instruction::FDiv:
6259 case Instruction::FRem:
6260 case Instruction::Shl:
6261 case Instruction::LShr:
6262 case Instruction::AShr:
6263 case Instruction::And:
6264 case Instruction::Or:
6265 case Instruction::Xor: {
6266 // If we're speculating on the stride being 1, the multiplication may
6267 // fold away. We can generalize this for all operations using the notion
6268 // of neutral elements. (TODO)
6269 if (I->getOpcode() == Instruction::Mul &&
6270 ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
6271 PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
6272 (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
6273 PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
6274 return 0;
6275
6276 // Detect reduction patterns
6277 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
6278 return *RedCost;
6279
6280 // Certain instructions can be cheaper to vectorize if they have a constant
6281 // second vector operand. One example of this are shifts on x86.
6282 Value *Op2 = I->getOperand(i: 1);
6283 if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
6284 PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
6285 isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
6286 Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
6287 }
6288 auto Op2Info = TTI.getOperandInfo(V: Op2);
6289 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6290 shouldConsiderInvariant(Op: Op2))
6291 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6292
6293 SmallVector<const Value *, 4> Operands(I->operand_values());
6294 return TTI.getArithmeticInstrCost(
6295 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6296 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6297 Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
6298 }
6299 case Instruction::FNeg: {
6300 return TTI.getArithmeticInstrCost(
6301 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6302 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6303 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6304 Args: I->getOperand(i: 0), CxtI: I);
6305 }
6306 case Instruction::Select: {
6307 SelectInst *SI = cast<SelectInst>(Val: I);
6308 const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
6309 bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));
6310
6311 const Value *Op0, *Op1;
6312 using namespace llvm::PatternMatch;
6313 if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
6314 match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
6315 // select x, y, false --> x & y
6316 // select x, true, y --> x | y
6317 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
6318 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
6319 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6320 Op1->getType()->getScalarSizeInBits() == 1);
6321
6322 SmallVector<const Value *, 2> Operands{Op0, Op1};
6323 return TTI.getArithmeticInstrCost(
6324 Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy,
6325 CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I);
6326 }
6327
6328 Type *CondTy = SI->getCondition()->getType();
6329 if (!ScalarCond)
6330 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
6331
6332 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6333 if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
6334 Pred = Cmp->getPredicate();
6335 return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
6336 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
6337 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
6338 }
6339 case Instruction::ICmp:
6340 case Instruction::FCmp: {
6341 Type *ValTy = I->getOperand(i: 0)->getType();
6342
6343 if (canTruncateToMinimalBitwidth(I, VF)) {
6344 [[maybe_unused]] Instruction *Op0AsInstruction =
6345 dyn_cast<Instruction>(Val: I->getOperand(i: 0));
6346 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6347 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6348 "if both the operand and the compare are marked for "
6349 "truncation, they must have the same bitwidth");
6350 ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
6351 }
6352
6353 VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
6354 return TTI.getCmpSelInstrCost(
6355 Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
6356 VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
6357 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
6358 }
6359 case Instruction::Store:
6360 case Instruction::Load: {
6361 ElementCount Width = VF;
6362 if (Width.isVector()) {
6363 InstWidening Decision = getWideningDecision(I, VF: Width);
6364 assert(Decision != CM_Unknown &&
6365 "CM decision should be taken at this point");
6366 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6367 return InstructionCost::getInvalid();
6368 if (Decision == CM_Scalarize)
6369 Width = ElementCount::getFixed(MinVal: 1);
6370 }
6371 VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
6372 return getMemoryInstructionCost(I, VF);
6373 }
6374 case Instruction::BitCast:
6375 if (I->getType()->isPointerTy())
6376 return 0;
6377 [[fallthrough]];
6378 case Instruction::ZExt:
6379 case Instruction::SExt:
6380 case Instruction::FPToUI:
6381 case Instruction::FPToSI:
6382 case Instruction::FPExt:
6383 case Instruction::PtrToInt:
6384 case Instruction::IntToPtr:
6385 case Instruction::SIToFP:
6386 case Instruction::UIToFP:
6387 case Instruction::Trunc:
6388 case Instruction::FPTrunc: {
6389 // Computes the CastContextHint from a Load/Store instruction.
6390 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6391 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6392 "Expected a load or a store!");
6393
6394 if (VF.isScalar() || !TheLoop->contains(Inst: I))
6395 return TTI::CastContextHint::Normal;
6396
6397 switch (getWideningDecision(I, VF)) {
6398 case LoopVectorizationCostModel::CM_GatherScatter:
6399 return TTI::CastContextHint::GatherScatter;
6400 case LoopVectorizationCostModel::CM_Interleave:
6401 return TTI::CastContextHint::Interleave;
6402 case LoopVectorizationCostModel::CM_Scalarize:
6403 case LoopVectorizationCostModel::CM_Widen:
6404 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6405 : TTI::CastContextHint::Normal;
6406 case LoopVectorizationCostModel::CM_Widen_Reverse:
6407 return TTI::CastContextHint::Reversed;
6408 case LoopVectorizationCostModel::CM_Unknown:
6409 llvm_unreachable("Instr did not go through cost modelling?");
6410 case LoopVectorizationCostModel::CM_VectorCall:
6411 case LoopVectorizationCostModel::CM_IntrinsicCall:
6412 llvm_unreachable_internal(msg: "Instr has invalid widening decision");
6413 }
6414
6415 llvm_unreachable("Unhandled case!");
6416 };
6417
6418 unsigned Opcode = I->getOpcode();
6419 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6420 // For Trunc, the context is the only user, which must be a StoreInst.
6421 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6422 if (I->hasOneUse())
6423 if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
6424 CCH = ComputeCCH(Store);
6425 }
6426 // For Z/Sext, the context is the operand, which must be a LoadInst.
6427 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6428 Opcode == Instruction::FPExt) {
6429 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
6430 CCH = ComputeCCH(Load);
6431 }
6432
6433 // We optimize the truncation of induction variables having constant
6434 // integer steps. The cost of these truncations is the same as the scalar
6435 // operation.
6436 if (isOptimizableIVTruncate(I, VF)) {
6437 auto *Trunc = cast<TruncInst>(Val: I);
6438 return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
6439 Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
6440 }
6441
6442 // Detect reduction patterns
6443 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
6444 return *RedCost;
6445
6446 Type *SrcScalarTy = I->getOperand(i: 0)->getType();
6447 Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
6448 if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
6449 SrcScalarTy =
6450 IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
6451 Type *SrcVecTy =
6452 VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;
6453
6454 if (canTruncateToMinimalBitwidth(I, VF)) {
6455 // If the result type is <= the source type, there will be no extend
6456 // after truncating the users to the minimal required bitwidth.
6457 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6458 (I->getOpcode() == Instruction::ZExt ||
6459 I->getOpcode() == Instruction::SExt))
6460 return 0;
6461 }
6462
6463 return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
6464 }
6465 case Instruction::Call:
6466 return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
6467 case Instruction::ExtractValue:
6468 return TTI.getInstructionCost(U: I, CostKind);
6469 case Instruction::Alloca:
6470 // We cannot easily widen alloca to a scalable alloca, as
6471 // the result would need to be a vector of pointers.
6472 if (VF.isScalable())
6473 return InstructionCost::getInvalid();
6474 [[fallthrough]];
6475 default:
6476 // This opcode is unknown. Assume that it is the same as 'mul'.
6477 return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
6478 } // end of switch.
6479}
6480
6481void LoopVectorizationCostModel::collectValuesToIgnore() {
6482 // Ignore ephemeral values.
6483 CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);
6484
6485 SmallVector<Value *, 4> DeadInterleavePointerOps;
6486 SmallVector<Value *, 4> DeadOps;
6487
6488 // If a scalar epilogue is required, users outside the loop won't use
6489 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6490 // that is the case.
6491 bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
6492 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6493 return RequiresScalarEpilogue &&
6494 !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
6495 };
6496
6497 LoopBlocksDFS DFS(TheLoop);
6498 DFS.perform(LI);
6499 MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
6500 for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
6501 for (Instruction &I : reverse(C&: *BB)) {
      // Find all stores to invariant variables. Since they are going to sink
      // outside the loop, we do not need to calculate a cost for them.
6504 StoreInst *SI;
6505 if ((SI = dyn_cast<StoreInst>(Val: &I)) &&
6506 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
6507 ValuesToIgnore.insert(Ptr: &I);
6508 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6509 Elt: SI->getValueOperand());
6510 }
6511
6512 if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
6513 continue;
6514
6515 // Add instructions that would be trivially dead and are only used by
6516 // values already ignored to DeadOps to seed worklist.
6517 if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
6518 all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
6519 return VecValuesToIgnore.contains(Ptr: U) ||
6520 ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
6521 }))
6522 DeadOps.push_back(Elt: &I);
6523
6524 // For interleave groups, we only create a pointer for the start of the
6525 // interleave group. Queue up addresses of group members except the insert
6526 // position for further processing.
6527 if (isAccessInterleaved(Instr: &I)) {
6528 auto *Group = getInterleavedAccessGroup(Instr: &I);
6529 if (Group->getInsertPos() == &I)
6530 continue;
6531 Value *PointerOp = getLoadStorePointerOperand(V: &I);
6532 DeadInterleavePointerOps.push_back(Elt: PointerOp);
6533 }
6534
      // Queue branches for analysis. They are dead if their successors only
      // contain dead instructions.
6537 if (auto *Br = dyn_cast<BranchInst>(Val: &I)) {
6538 if (Br->isConditional())
6539 DeadOps.push_back(Elt: &I);
6540 }
6541 }
6542
6543 // Mark ops feeding interleave group members as free, if they are only used
6544 // by other dead computations.
6545 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6546 auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
6547 if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
6548 Instruction *UI = cast<Instruction>(Val: U);
6549 return !VecValuesToIgnore.contains(Ptr: U) &&
6550 (!isAccessInterleaved(Instr: UI) ||
6551 getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
6552 }))
6553 continue;
6554 VecValuesToIgnore.insert(Ptr: Op);
6555 DeadInterleavePointerOps.append(in_start: Op->op_begin(), in_end: Op->op_end());
6556 }
6557
6558 for (const auto &[_, Ops] : DeadInvariantStoreOps)
6559 llvm::append_range(C&: DeadOps, R: ArrayRef(Ops).drop_back());
6560
6561 // Mark ops that would be trivially dead and are only used by ignored
6562 // instructions as free.
6563 BasicBlock *Header = TheLoop->getHeader();
6564
6565 // Returns true if the block contains only dead instructions. Such blocks will
6566 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6567 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6568 auto IsEmptyBlock = [this](BasicBlock *BB) {
6569 return all_of(Range&: *BB, P: [this](Instruction &I) {
6570 return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
6571 (isa<BranchInst>(Val: &I) && !cast<BranchInst>(Val: &I)->isConditional());
6572 });
6573 };
6574 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6575 auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);
6576
6577 // Check if the branch should be considered dead.
6578 if (auto *Br = dyn_cast_or_null<BranchInst>(Val: Op)) {
6579 BasicBlock *ThenBB = Br->getSuccessor(i: 0);
6580 BasicBlock *ElseBB = Br->getSuccessor(i: 1);
6581       // Don't consider branches leaving the loop for simplification.
6582 if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
6583 continue;
6584 bool ThenEmpty = IsEmptyBlock(ThenBB);
6585 bool ElseEmpty = IsEmptyBlock(ElseBB);
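      // The branch is dead if both successors are empty (a diamond), or if one
      // successor is empty and falls through to the other, which has no PHI
      // nodes (a triangle).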
6586 if ((ThenEmpty && ElseEmpty) ||
6587 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6588 ElseBB->phis().empty()) ||
6589 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6590 ThenBB->phis().empty())) {
6591 VecValuesToIgnore.insert(Ptr: Br);
6592 DeadOps.push_back(Elt: Br->getCondition());
6593 }
6594 continue;
6595 }
6596
6597 // Skip any op that shouldn't be considered dead.
6598 if (!Op || !TheLoop->contains(Inst: Op) ||
6599 (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
6600 !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
6601 any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
6602 return !VecValuesToIgnore.contains(Ptr: U) &&
6603 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
6604 }))
6605 continue;
6606
6607 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6608 // which applies for both scalar and vector versions. Otherwise it is only
6609 // dead in vector versions, so only add it to VecValuesToIgnore.
6610 if (all_of(Range: Op->users(),
6611 P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
6612 ValuesToIgnore.insert(Ptr: Op);
6613
6614 VecValuesToIgnore.insert(Ptr: Op);
6615 DeadOps.append(in_start: Op->op_begin(), in_end: Op->op_end());
6616 }
6617
6618 // Ignore type-promoting instructions we identified during reduction
6619 // detection.
6620 for (const auto &Reduction : Legal->getReductionVars()) {
6621 const RecurrenceDescriptor &RedDes = Reduction.second;
6622 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6623 VecValuesToIgnore.insert_range(R: Casts);
6624 }
6625 // Ignore type-casting instructions we identified during induction
6626 // detection.
6627 for (const auto &Induction : Legal->getInductionVars()) {
6628 const InductionDescriptor &IndDes = Induction.second;
6629 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6630 VecValuesToIgnore.insert_range(R: Casts);
6631 }
6632}
6633
6634void LoopVectorizationCostModel::collectInLoopReductions() {
6635 // Avoid duplicating work finding in-loop reductions.
6636 if (!InLoopReductions.empty())
6637 return;
6638
6639 for (const auto &Reduction : Legal->getReductionVars()) {
6640 PHINode *Phi = Reduction.first;
6641 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6642
6643 // We don't collect reductions that are type promoted (yet).
6644 if (RdxDesc.getRecurrenceType() != Phi->getType())
6645 continue;
6646
6647 // If the target would prefer this reduction to happen "in-loop", then we
6648 // want to record it as such.
6649 RecurKind Kind = RdxDesc.getRecurrenceKind();
6650 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6651 !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
6652 continue;
6653
6654 // Check that we can correctly put the reductions into the loop, by
6655 // finding the chain of operations that leads from the phi to the loop
6656 // exit value.
6657 SmallVector<Instruction *, 4> ReductionOperations =
6658 RdxDesc.getReductionOpChain(Phi, L: TheLoop);
6659 bool InLoop = !ReductionOperations.empty();
6660
6661 if (InLoop) {
6662 InLoopReductions.insert(Ptr: Phi);
6663 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6664 Instruction *LastChain = Phi;
6665 for (auto *I : ReductionOperations) {
6666 InLoopReductionImmediateChains[I] = LastChain;
6667 LastChain = I;
6668 }
6669 }
6670 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6671 << " reduction for phi: " << *Phi << "\n");
6672 }
6673}
6674
6675// This function will select a scalable VF if the target supports scalable
6676// vectors and a fixed one otherwise.
6677// TODO: we could return a pair of values that specify the max VF and
6678// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6679 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
6680// doesn't have a cost model that can choose which plan to execute if
6681// more than one is generated.
6682static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6683 LoopVectorizationCostModel &CM) {
6684 unsigned WidestType;
6685 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6686
6687 TargetTransformInfo::RegisterKind RegKind =
6688 TTI.enableScalableVectorization()
6689 ? TargetTransformInfo::RGK_ScalableVector
6690 : TargetTransformInfo::RGK_FixedWidthVector;
6691
6692 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6693 unsigned N = RegSize.getKnownMinValue() / WidestType;
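  // For example, a 128-bit register and a widest element type of 32 bits give
  // N = 4, i.e. VF = 4 (or vscale x 4 when scalable vectorization is enabled).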
6694 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6695}
6696
6697VectorizationFactor
6698LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6699 ElementCount VF = UserVF;
6700   // Outer loop handling: outer loops may require CFG and instruction level
6701   // transformations before even evaluating whether vectorization is profitable.
6702 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6703 // the vectorization pipeline.
6704 if (!OrigLoop->isInnermost()) {
6705 // If the user doesn't provide a vectorization factor, determine a
6706 // reasonable one.
6707 if (UserVF.isZero()) {
6708 VF = determineVPlanVF(TTI, CM);
6709 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6710
6711 // Make sure we have a VF > 1 for stress testing.
6712 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6713 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6714 << "overriding computed VF.\n");
6715 VF = ElementCount::getFixed(MinVal: 4);
6716 }
6717 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6718 !ForceTargetSupportsScalableVectors) {
6719 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6720 << "not supported by the target.\n");
6721 reportVectorizationFailure(
6722 DebugMsg: "Scalable vectorization requested but not supported by the target",
6723 OREMsg: "the scalable user-specified vectorization width for outer-loop "
6724 "vectorization cannot be used because the target does not support "
6725 "scalable vectors.",
6726 ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
6727 return VectorizationFactor::Disabled();
6728 }
6729 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6730 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6731 "VF needs to be a power of two");
6732 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6733 << "VF " << VF << " to build VPlans.\n");
6734 buildVPlans(MinVF: VF, MaxVF: VF);
6735
6736 if (VPlans.empty())
6737 return VectorizationFactor::Disabled();
6738
6739 // For VPlan build stress testing, we bail out after VPlan construction.
6740 if (VPlanBuildStressTest)
6741 return VectorizationFactor::Disabled();
6742
6743 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6744 }
6745
6746 LLVM_DEBUG(
6747 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6748 "VPlan-native path.\n");
6749 return VectorizationFactor::Disabled();
6750}
6751
6752void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6753 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6754 CM.collectValuesToIgnore();
6755 CM.collectElementTypesForWidening();
6756
6757 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6758   if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6759 return;
6760
6761 // Invalidate interleave groups if all blocks of loop will be predicated.
6762 if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
6763 !useMaskedInterleavedAccesses(TTI)) {
6764 LLVM_DEBUG(
6765 dbgs()
6766 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6767 "which requires masked-interleaved support.\n");
6768 if (CM.InterleaveInfo.invalidateGroups())
6769 // Invalidating interleave groups also requires invalidating all decisions
6770 // based on them, which includes widening decisions and uniform and scalar
6771 // values.
6772 CM.invalidateCostModelingDecisions();
6773 }
6774
6775 if (CM.foldTailByMasking())
6776 Legal->prepareToFoldTailByMasking();
6777
6778 ElementCount MaxUserVF =
6779 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6780 if (UserVF) {
6781 if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
6782 reportVectorizationInfo(
6783 Msg: "UserVF ignored because it may be larger than the maximal safe VF",
6784 ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
6785 } else {
6786 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6787 "VF needs to be a power of two");
6788 // Collect the instructions (and their associated costs) that will be more
6789 // profitable to scalarize.
6790 CM.collectInLoopReductions();
6791 if (CM.selectUserVectorizationFactor(UserVF)) {
6792 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6793 buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
6794 LLVM_DEBUG(printPlans(dbgs()));
6795 return;
6796 }
6797 reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
6798 ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
6799 }
6800 }
6801
6802 // Collect the Vectorization Factor Candidates.
6803 SmallVector<ElementCount> VFCandidates;
6804 for (auto VF = ElementCount::getFixed(MinVal: 1);
6805 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
6806 VFCandidates.push_back(Elt: VF);
6807 for (auto VF = ElementCount::getScalable(MinVal: 1);
6808 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
6809 VFCandidates.push_back(Elt: VF);
6810
6811 CM.collectInLoopReductions();
6812 for (const auto &VF : VFCandidates) {
6813 // Collect Uniform and Scalar instructions after vectorization with VF.
6814 CM.collectNonVectorizedAndSetWideningDecisions(VF);
6815 }
6816
6817 buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
6818 buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);
6819
6820 LLVM_DEBUG(printPlans(dbgs()));
6821}
6822
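// Return the legacy cost-model cost of \p UI at \p VF. An instruction cost
// forced on the command line takes precedence over the legacy model.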
6823InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6824 ElementCount VF) const {
6825 if (ForceTargetInstructionCost.getNumOccurrences())
6826 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
6827 return CM.getInstructionCost(I: UI, VF);
6828}
6829
6830bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6831 ElementCount VF) const {
6832 return CM.isUniformAfterVectorization(I, VF);
6833}
6834
6835bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6836 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6837 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6838 SkipCostComputation.contains(Ptr: UI);
6839}
6840
6841InstructionCost
6842LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6843 VPCostContext &CostCtx) const {
6844 InstructionCost Cost;
6845 // Cost modeling for inductions is inaccurate in the legacy cost model
6846   // compared to the recipes that are generated. To match it during the initial
6847   // VPlan cost-model bring-up, directly use the induction costs from the legacy
6848   // cost model. Note that we do this as pre-processing; the VPlan may not have
6849 // any recipes associated with the original induction increment instruction
6850 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6851 // the cost of induction phis and increments (both that are represented by
6852 // recipes and those that are not), to avoid distinguishing between them here,
6853 // and skip all recipes that represent induction phis and increments (the
6854 // former case) later on, if they exist, to avoid counting them twice.
6855 // Similarly we pre-compute the cost of any optimized truncates.
6856 // TODO: Switch to more accurate costing based on VPlan.
6857 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6858 Instruction *IVInc = cast<Instruction>(
6859 Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
6860 SmallVector<Instruction *> IVInsts = {IVInc};
6861 for (unsigned I = 0; I != IVInsts.size(); I++) {
6862 for (Value *Op : IVInsts[I]->operands()) {
6863 auto *OpI = dyn_cast<Instruction>(Val: Op);
6864 if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
6865 continue;
6866 IVInsts.push_back(Elt: OpI);
6867 }
6868 }
6869 IVInsts.push_back(Elt: IV);
6870 for (User *U : IV->users()) {
6871 auto *CI = cast<Instruction>(Val: U);
6872 if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
6873 continue;
6874 IVInsts.push_back(Elt: CI);
6875 }
6876
6877 // If the vector loop gets executed exactly once with the given VF, ignore
6878 // the costs of comparison and induction instructions, as they'll get
6879 // simplified away.
6880 // TODO: Remove this code after stepping away from the legacy cost model and
6881 // adding code to simplify VPlans before calculating their costs.
6882 auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
6883 if (TC == VF && !CM.foldTailByMasking())
6884 addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
6885 InstsToIgnore&: CostCtx.SkipCostComputation);
6886
6887 for (Instruction *IVInst : IVInsts) {
6888 if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
6889 continue;
6890 InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
6891 LLVM_DEBUG({
6892 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6893 << ": induction instruction " << *IVInst << "\n";
6894 });
6895 Cost += InductionCost;
6896 CostCtx.SkipCostComputation.insert(Ptr: IVInst);
6897 }
6898 }
6899
6900   // Compute the cost of all exiting conditions of the loop using the legacy
6901   // cost model. This is to match the legacy behavior, which adds the cost of
6902   // all exit conditions. Note that this over-estimates the cost, as there will
6903   // be a single condition to control the vector loop.
6904 SmallVector<BasicBlock *> Exiting;
6905 CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
6906 SetVector<Instruction *> ExitInstrs;
6907 // Collect all exit conditions.
6908 for (BasicBlock *EB : Exiting) {
6909 auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator());
6910 if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
6911 continue;
6912 if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
6913 ExitInstrs.insert(X: CondI);
6914 }
6915 }
6916 // Compute the cost of all instructions only feeding the exit conditions.
6917 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6918 Instruction *CondI = ExitInstrs[I];
6919 if (!OrigLoop->contains(Inst: CondI) ||
6920 !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
6921 continue;
6922 InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
6923 LLVM_DEBUG({
6924 dbgs() << "Cost of " << CondICost << " for VF " << VF
6925 << ": exit condition instruction " << *CondI << "\n";
6926 });
6927 Cost += CondICost;
6928 for (Value *Op : CondI->operands()) {
6929 auto *OpI = dyn_cast<Instruction>(Val: Op);
6930 if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
6931 any_of(Range: OpI->users(), P: [&ExitInstrs, this](User *U) {
6932 return OrigLoop->contains(BB: cast<Instruction>(Val: U)->getParent()) &&
6933 !ExitInstrs.contains(key: cast<Instruction>(Val: U));
6934 }))
6935 continue;
6936 ExitInstrs.insert(X: OpI);
6937 }
6938 }
6939
6940 // Pre-compute the costs for branches except for the backedge, as the number
6941 // of replicate regions in a VPlan may not directly match the number of
6942 // branches, which would lead to different decisions.
6943 // TODO: Compute cost of branches for each replicate region in the VPlan,
6944 // which is more accurate than the legacy cost model.
6945 for (BasicBlock *BB : OrigLoop->blocks()) {
6946 if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
6947 continue;
6948 CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
6949 if (BB == OrigLoop->getLoopLatch())
6950 continue;
6951 auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
6952 Cost += BranchCost;
6953 }
6954
6955 // Pre-compute costs for instructions that are forced-scalar or profitable to
6956 // scalarize. Their costs will be computed separately in the legacy cost
6957 // model.
6958 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
6959 if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
6960 continue;
6961 CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
6962 InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
6963 LLVM_DEBUG({
6964 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
6965 << ": forced scalar " << *ForcedScalar << "\n";
6966 });
6967 Cost += ForcedCost;
6968 }
6969 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
6970 if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
6971 continue;
6972 CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
6973 LLVM_DEBUG({
6974 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
6975 << ": profitable to scalarize " << *Scalarized << "\n";
6976 });
6977 Cost += ScalarCost;
6978 }
6979
6980 return Cost;
6981}
6982
6983InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6984 ElementCount VF) const {
6985 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
6986 CM.CostKind);
6987 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
6988
6989 // Now compute and add the VPlan-based cost.
6990 Cost += Plan.cost(VF, Ctx&: CostCtx);
6991#ifndef NDEBUG
6992 unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
6993 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
6994 << " (Estimated cost per lane: ");
6995 if (Cost.isValid()) {
6996 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
6997 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
6998 } else /* No point dividing an invalid cost - it will still be invalid */
6999 LLVM_DEBUG(dbgs() << "Invalid");
7000 LLVM_DEBUG(dbgs() << ")\n");
7001#endif
7002 return Cost;
7003}
7004
7005#ifndef NDEBUG
7006 /// Return true if the original loop \p TheLoop contains any instructions that do
7007/// not have corresponding recipes in \p Plan and are not marked to be ignored
7008/// in \p CostCtx. This means the VPlan contains simplification that the legacy
7009/// cost-model did not account for.
7010static bool planContainsAdditionalSimplifications(VPlan &Plan,
7011 VPCostContext &CostCtx,
7012 Loop *TheLoop,
7013 ElementCount VF) {
7014 // First collect all instructions for the recipes in Plan.
7015 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7016 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7017 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7018 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7019 return &WidenMem->getIngredient();
7020 return nullptr;
7021 };
7022
7023 DenseSet<Instruction *> SeenInstrs;
7024 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7025 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7026 for (VPRecipeBase &R : *VPBB) {
7027 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7028 auto *IG = IR->getInterleaveGroup();
7029 unsigned NumMembers = IG->getNumMembers();
7030 for (unsigned I = 0; I != NumMembers; ++I) {
7031 if (Instruction *M = IG->getMember(I))
7032 SeenInstrs.insert(M);
7033 }
7034 continue;
7035 }
7036 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
7037 // cost model won't cost it whilst the legacy will.
7038 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
7039 if (none_of(FOR->users(), [](VPUser *U) {
7040 auto *VPI = dyn_cast<VPInstruction>(U);
7041 return VPI && VPI->getOpcode() ==
7042 VPInstruction::FirstOrderRecurrenceSplice;
7043 }))
7044 return true;
7045 }
7046 // The VPlan-based cost model is more accurate for partial reduction and
7047 // comparing against the legacy cost isn't desirable.
7048 if (isa<VPPartialReductionRecipe>(&R))
7049 return true;
7050
7051       // If a VPlan transform folded a recipe to one producing a single-scalar,
7052       // but the original instruction wasn't uniform-after-vectorization in the
7053       // legacy cost model, the legacy cost overestimates the actual cost.
7054 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
7055 if (RepR->isSingleScalar() &&
7056 !CostCtx.isLegacyUniformAfterVectorization(
7057 RepR->getUnderlyingInstr(), VF))
7058 return true;
7059 }
7060 if (Instruction *UI = GetInstructionForCost(&R)) {
7061 // If we adjusted the predicate of the recipe, the cost in the legacy
7062 // cost model may be different.
7063 if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) {
7064 if ((WidenCmp->getOpcode() == Instruction::ICmp ||
7065 WidenCmp->getOpcode() == Instruction::FCmp) &&
7066 WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate())
7067 return true;
7068 }
7069 SeenInstrs.insert(UI);
7070 }
7071 }
7072 }
7073
7074 // Return true if the loop contains any instructions that are not also part of
7075 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7076 // that the VPlan contains extra simplifications.
7077 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7078 TheLoop](BasicBlock *BB) {
7079 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7080      // Skip induction phis when checking for simplifications, as they may not
7081      // be lowered directly to a corresponding PHI recipe.
7082 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
7083 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
7084 return false;
7085 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7086 });
7087 });
7088}
7089#endif
7090
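// Select the most profitable vectorization factor across all built VPlans
// using the VPlan-based cost model, with the scalar loop cost as the baseline
// to beat.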
7091VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7092 if (VPlans.empty())
7093 return VectorizationFactor::Disabled();
7094 // If there is a single VPlan with a single VF, return it directly.
7095 VPlan &FirstPlan = *VPlans[0];
7096 if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
7097 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7098
7099 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7100 << (CM.CostKind == TTI::TCK_RecipThroughput
7101 ? "Reciprocal Throughput\n"
7102 : CM.CostKind == TTI::TCK_Latency
7103 ? "Instruction Latency\n"
7104 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7105 : CM.CostKind == TTI::TCK_SizeAndLatency
7106 ? "Code Size and Latency\n"
7107 : "Unknown\n"));
7108
7109 ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
7110 assert(hasPlanWithVF(ScalarVF) &&
7111 "More than a single plan/VF w/o any plan having scalar VF");
7112
7113 // TODO: Compute scalar cost using VPlan-based cost model.
7114 InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
7115 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7116 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7117 VectorizationFactor BestFactor = ScalarFactor;
7118
7119 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7120 if (ForceVectorization) {
7121 // Ignore scalar width, because the user explicitly wants vectorization.
7122 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7123 // evaluation.
7124 BestFactor.Cost = InstructionCost::getMax();
7125 }
7126
7127 for (auto &P : VPlans) {
7128 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7129 P->vectorFactors().end());
7130
7131 SmallVector<VPRegisterUsage, 8> RUs;
7132 if (CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_ScalableVector) ||
7133 CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_FixedWidthVector))
7134 RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);
7135
7136 for (unsigned I = 0; I < VFs.size(); I++) {
7137 ElementCount VF = VFs[I];
7138 if (VF.isScalar())
7139 continue;
7140 if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
7141 LLVM_DEBUG(
7142 dbgs()
7143 << "LV: Not considering vector loop of width " << VF
7144 << " because it will not generate any vector instructions.\n");
7145 continue;
7146 }
7147 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
7148 LLVM_DEBUG(
7149 dbgs()
7150 << "LV: Not considering vector loop of width " << VF
7151 << " because it would cause replicated blocks to be generated,"
7152 << " which isn't allowed when optimizing for size.\n");
7153 continue;
7154 }
7155
7156 InstructionCost Cost = cost(Plan&: *P, VF);
7157 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7158
7159 if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
7160 LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7161 << VF << " because it uses too many registers\n");
7162 continue;
7163 }
7164
7165 if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
7166 BestFactor = CurrentFactor;
7167
7168       // If profitable, add it to the ProfitableVFs list.
7169 if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
7170 ProfitableVFs.push_back(Elt: CurrentFactor);
7171 }
7172 }
7173
7174#ifndef NDEBUG
7175 // Select the optimal vectorization factor according to the legacy cost-model.
7176 // This is now only used to verify the decisions by the new VPlan-based
7177 // cost-model and will be retired once the VPlan-based cost-model is
7178 // stabilized.
7179 VectorizationFactor LegacyVF = selectVectorizationFactor();
7180 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7181
7182 // Pre-compute the cost and use it to check if BestPlan contains any
7183 // simplifications not accounted for in the legacy cost model. If that's the
7184 // case, don't trigger the assertion, as the extra simplifications may cause a
7185 // different VF to be picked by the VPlan-based cost model.
7186 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
7187 CM.CostKind);
7188 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7189 // Verify that the VPlan-based and legacy cost models agree, except for VPlans
7190 // with early exits and plans with additional VPlan simplifications. The
7191 // legacy cost model doesn't properly model costs for such loops.
7192 assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7193 planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
7194 CostCtx, OrigLoop,
7195 BestFactor.Width) ||
7196 planContainsAdditionalSimplifications(
7197 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7198 " VPlan cost model and legacy cost model disagreed");
7199 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7200 "when vectorizing, the scalar cost must be computed.");
7201#endif
7202
7203 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7204 return BestFactor;
7205}
7206
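// Attach llvm.loop.unroll.runtime.disable metadata to \p L, unless
// unroll-disable metadata is already present in its loop ID.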
7207static void addRuntimeUnrollDisableMetaData(Loop *L) {
7208 SmallVector<Metadata *, 4> MDs;
7209 // Reserve first location for self reference to the LoopID metadata node.
7210 MDs.push_back(Elt: nullptr);
7211 bool IsUnrollMetadata = false;
7212 MDNode *LoopID = L->getLoopID();
7213 if (LoopID) {
7214 // First find existing loop unrolling disable metadata.
7215 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7216 auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I));
7217 if (MD) {
7218 const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
7219 IsUnrollMetadata =
7220 S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
7221 }
7222 MDs.push_back(Elt: LoopID->getOperand(I));
7223 }
7224 }
7225
7226 if (!IsUnrollMetadata) {
7227 // Add runtime unroll disable metadata.
7228 LLVMContext &Context = L->getHeader()->getContext();
7229 SmallVector<Metadata *, 1> DisableOperands;
7230 DisableOperands.push_back(
7231 Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
7232 MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
7233 MDs.push_back(Elt: DisableNode);
7234 MDNode *NewLoopID = MDNode::get(Context, MDs);
7235 // Set operand 0 to refer to the loop id itself.
7236 NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
7237 L->setLoopID(NewLoopID);
7238 }
7239}
7240
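// Return the start value of the FindIV reduction computed by \p RdxResult,
// looking through a possible freeze of the start operand.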
7241static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
7242 using namespace VPlanPatternMatch;
7243 assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
7244 "RdxResult must be ComputeFindIVResult");
7245 VPValue *StartVPV = RdxResult->getOperand(N: 1);
7246 match(V: StartVPV, P: m_Freeze(Op0: m_VPValue(V&: StartVPV)));
7247 return StartVPV->getLiveInIRValue();
7248}
7249
7250 // If \p EpiResumePhiR is a resume VPPhi for a reduction when vectorizing the
7251// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
7252// from the main vector loop.
7253static void fixReductionScalarResumeWhenVectorizingEpilog(
7254 VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
7255 // Get the VPInstruction computing the reduction result in the middle block.
7256 // The first operand may not be from the middle block if it is not connected
7257 // to the scalar preheader. In that case, there's nothing to fix.
7258 VPValue *Incoming = EpiResumePhiR->getOperand(N: 0);
7259 match(V: Incoming, P: VPlanPatternMatch::m_ZExtOrSExt(
7260 Op0: VPlanPatternMatch::m_VPValue(V&: Incoming)));
7261 auto *EpiRedResult = dyn_cast<VPInstruction>(Val: Incoming);
7262 if (!EpiRedResult ||
7263 (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
7264 EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
7265 EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
7266 return;
7267
7268 auto *EpiRedHeaderPhi =
7269 cast<VPReductionPHIRecipe>(Val: EpiRedResult->getOperand(N: 0));
7270 const RecurrenceDescriptor &RdxDesc =
7271 EpiRedHeaderPhi->getRecurrenceDescriptor();
7272 Value *MainResumeValue;
7273 if (auto *VPI = dyn_cast<VPInstruction>(Val: EpiRedHeaderPhi->getStartValue())) {
7274 assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7275 VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7276 "unexpected start recipe");
7277 MainResumeValue = VPI->getOperand(N: 0)->getUnderlyingValue();
7278 } else
7279 MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7280 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7281 Kind: RdxDesc.getRecurrenceKind())) {
7282 [[maybe_unused]] Value *StartV =
7283 EpiRedResult->getOperand(N: 1)->getLiveInIRValue();
7284 auto *Cmp = cast<ICmpInst>(Val: MainResumeValue);
7285 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7286 "AnyOf expected to start with ICMP_NE");
7287 assert(Cmp->getOperand(1) == StartV &&
7288 "AnyOf expected to start by comparing main resume value to original "
7289 "start value");
7290 MainResumeValue = Cmp->getOperand(i_nocapture: 0);
7291 } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(
7292 Kind: RdxDesc.getRecurrenceKind())) {
7293 Value *StartV = getStartValueFromReductionResult(RdxResult: EpiRedResult);
7294 Value *SentinelV = EpiRedResult->getOperand(N: 2)->getLiveInIRValue();
7295 using namespace llvm::PatternMatch;
7296 Value *Cmp, *OrigResumeV, *CmpOp;
7297 [[maybe_unused]] bool IsExpectedPattern =
7298 match(V: MainResumeValue,
7299 P: m_Select(C: m_OneUse(SubPattern: m_Value(V&: Cmp)), L: m_Specific(V: SentinelV),
7300 R: m_Value(V&: OrigResumeV))) &&
7301 (match(V: Cmp, P: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Specific(V: OrigResumeV),
7302 R: m_Value(V&: CmpOp))) &&
7303 ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(V: CmpOp))));
7304 assert(IsExpectedPattern && "Unexpected reduction resume pattern");
7305 MainResumeValue = OrigResumeV;
7306 }
7307 PHINode *MainResumePhi = cast<PHINode>(Val: MainResumeValue);
7308
7309 // When fixing reductions in the epilogue loop we should already have
7310 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7311 // over the incoming values correctly.
7312 auto *EpiResumePhi = cast<PHINode>(Val: State.get(Def: EpiResumePhiR, IsScalar: true));
7313 EpiResumePhi->setIncomingValueForBlock(
7314 BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
7315}
7316
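// Apply the final VPlan-to-VPlan transforms to \p BestVPlan and generate code
// for it with vectorization factor \p BestVF and unroll factor \p BestUF,
// returning the map of SCEVs expanded in the entry block.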
7317DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7318 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7319 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7320 assert(BestVPlan.hasVF(BestVF) &&
7321 "Trying to execute plan with unsupported VF");
7322 assert(BestVPlan.hasUF(BestUF) &&
7323 "Trying to execute plan with unsupported UF");
7324 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7325 // cost model is complete for better cost estimates.
7326 VPlanTransforms::runPass(Fn: VPlanTransforms::unrollByUF, Plan&: BestVPlan, Args&: BestUF,
7327 Args&: OrigLoop->getHeader()->getContext());
7328 VPlanTransforms::runPass(Fn: VPlanTransforms::replicateByVF, Plan&: BestVPlan, Args&: BestVF);
7329 VPlanTransforms::runPass(Fn: VPlanTransforms::materializeBroadcasts, Plan&: BestVPlan);
7330 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) {
7331 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7332 VPlanTransforms::runPass(Fn: VPlanTransforms::addBranchWeightToMiddleTerminator,
7333 Plan&: BestVPlan, Args&: BestVF, Args&: VScale);
7334 }
7335 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7336 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan, CanonicalIVTy&: *Legal->getWidestInductionType());
7337 VPlanTransforms::narrowInterleaveGroups(
7338 Plan&: BestVPlan, VF: BestVF,
7339 VectorRegWidth: TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector));
7340 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7341
7342 VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan,
7343 CanonicalIVTy&: *Legal->getWidestInductionType());
7344 // Regions are dissolved after optimizing for VF and UF, which completely
7345 // removes unneeded loop regions first.
7346 VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
7347 // Perform the actual loop transformation.
7348 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7349 OrigLoop->getParentLoop(),
7350 Legal->getWidestInductionType());
7351
7352#ifdef EXPENSIVE_CHECKS
7353 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7354#endif
7355
7356 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7357 // making any changes to the CFG.
7358 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
7359 auto *Entry = cast<VPIRBasicBlock>(Val: BestVPlan.getEntry());
7360 State.Builder.SetInsertPoint(Entry->getIRBasicBlock()->getTerminator());
7361 for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
7362 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
7363 if (!ExpSCEV)
7364 continue;
7365 ExpSCEV->execute(State);
7366 ExpandedSCEVs[ExpSCEV->getSCEV()] = State.get(Def: ExpSCEV, Lane: VPLane(0));
7367 VPValue *Exp = BestVPlan.getOrAddLiveIn(V: ExpandedSCEVs[ExpSCEV->getSCEV()]);
7368 ExpSCEV->replaceAllUsesWith(New: Exp);
7369 if (BestVPlan.getTripCount() == ExpSCEV)
7370 BestVPlan.resetTripCount(NewTripCount: Exp);
7371 ExpSCEV->eraseFromParent();
7372 }
7373
7374 if (!ILV.getTripCount())
7375 ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Lane: VPLane(0)));
7376 else
7377 assert(VectorizingEpilogue && "should only re-use the existing trip "
7378 "count during epilogue vectorization");
7379
7380 // 1. Set up the skeleton for vectorization, including vector pre-header and
7381 // middle block. The vector loop is created during VPlan execution.
7382 VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: Entry->getSuccessors()[1]);
7383 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7384 if (VectorizingEpilogue)
7385 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7386
7387 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7388 "final VPlan is invalid");
7389
7390 ILV.printDebugTracesAtStart();
7391
7392 //===------------------------------------------------===//
7393 //
7394 // Notice: any optimization or new instruction that go
7395 // into the code below should also be implemented in
7396 // the cost-model.
7397 //
7398 //===------------------------------------------------===//
7399
7400 // 2. Copy and widen instructions from the old loop into the new loop.
7401 BestVPlan.prepareToExecute(
7402 TripCount: ILV.getTripCount(),
7403 VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: ILV.LoopVectorPreHeader), State);
7404 replaceVPBBWithIRVPBB(VPBB: VectorPH, IRBB: State.CFG.PrevBB);
7405
7406 BestVPlan.execute(State: &State);
7407
7408 // 2.5 When vectorizing the epilogue, fix reduction resume values from the
7409 // additional bypass block.
7410 if (VectorizingEpilogue) {
7411 assert(!BestVPlan.hasEarlyExit() &&
7412 "Epilogue vectorisation not yet supported with early exits");
7413 BasicBlock *PH = OrigLoop->getLoopPreheader();
7414 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7415 for (auto *Pred : predecessors(BB: PH)) {
7416 for (PHINode &Phi : PH->phis()) {
7417 if (Phi.getBasicBlockIndex(BB: Pred) != -1)
7418 continue;
7419 Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
7420 }
7421 }
7422 VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
7423 if (ScalarPH->getNumPredecessors() > 0) {
7424 // If ScalarPH has predecessors, we may need to update its reduction
7425 // resume values.
7426 for (VPRecipeBase &R : ScalarPH->phis()) {
7427 fixReductionScalarResumeWhenVectorizingEpilog(EpiResumePhiR: cast<VPPhi>(Val: &R), State,
7428 BypassBlock);
7429 }
7430 }
7431 }
7432
7433 // 2.6. Maintain Loop Hints
7434 // Keep all loop hints from the original loop on the vector loop (we'll
7435 // replace the vectorizer-specific hints below).
7436 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
7437 if (HeaderVPBB) {
7438 MDNode *OrigLoopID = OrigLoop->getLoopID();
7439
7440 std::optional<MDNode *> VectorizedLoopID =
7441 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
7442 LLVMLoopVectorizeFollowupVectorized});
7443
7444 Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]);
7445 if (VectorizedLoopID) {
7446 L->setLoopID(*VectorizedLoopID);
7447 } else {
7448 // Keep all loop hints from the original loop on the vector loop (we'll
7449 // replace the vectorizer-specific hints below).
7450 if (MDNode *LID = OrigLoop->getLoopID())
7451 L->setLoopID(LID);
7452
7453 LoopVectorizeHints Hints(L, true, *ORE);
7454 Hints.setAlreadyVectorized();
7455
7456 // Check if it's EVL-vectorized and mark the corresponding metadata.
7457 bool IsEVLVectorized =
7458 llvm::any_of(Range&: *HeaderVPBB, P: [](const VPRecipeBase &Recipe) {
7459           // Looking for the ExplicitVectorLength VPInstruction.
7460 if (const auto *VI = dyn_cast<VPInstruction>(Val: &Recipe))
7461 return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
7462 return false;
7463 });
7464 if (IsEVLVectorized) {
7465 LLVMContext &Context = L->getHeader()->getContext();
7466 MDNode *LoopID = L->getLoopID();
7467 auto *IsEVLVectorizedMD = MDNode::get(
7468 Context,
7469 MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized.tailfoldingstyle"),
7470 MDString::get(Context, Str: "evl")});
7471 MDNode *NewLoopID = makePostTransformationMetadata(Context, OrigLoopID: LoopID, RemovePrefixes: {},
7472 AddAttrs: {IsEVLVectorizedMD});
7473 L->setLoopID(NewLoopID);
7474 }
7475 }
7476 TargetTransformInfo::UnrollingPreferences UP;
7477 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7478 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7479 addRuntimeUnrollDisableMetaData(L);
7480 }
7481
7482 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7483 // predication, updating analyses.
7484 ILV.fixVectorizedLoop(State);
7485
7486 ILV.printDebugTracesAtEnd();
7487
7488 return ExpandedSCEVs;
7489}
7490
7491//===--------------------------------------------------------------------===//
7492// EpilogueVectorizerMainLoop
7493//===--------------------------------------------------------------------===//
7494
7495/// This function is partially responsible for generating the control flow
7496/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7497BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7498 createVectorLoopSkeleton(Prefix: "");
7499
7500 // Generate the code to check the minimum iteration count of the vector
7501 // epilogue (see below).
7502 EPI.EpilogueIterationCountCheck =
7503 emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true);
7504 EPI.EpilogueIterationCountCheck->setName("iter.check");
7505
7506 // Generate the code to check any assumptions that we've made for SCEV
7507 // expressions.
7508 EPI.SCEVSafetyCheck = emitSCEVChecks(Bypass: LoopScalarPreHeader);
7509
7510 // Generate the code that checks at runtime if arrays overlap. We put the
7511 // checks into a separate block to make the more common case of few elements
7512 // faster.
7513 EPI.MemSafetyCheck = emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);
7514
7515 // Generate the iteration count check for the main loop, *after* the check
7516 // for the epilogue loop, so that the path-length is shorter for the case
7517 // that goes directly through the vector epilogue. The longer-path length for
7518 // the main loop is compensated for, by the gain from vectorizing the larger
7519 // trip count. Note: the branch will get updated later on when we vectorize
7520 // the epilogue.
7521 EPI.MainLoopIterationCountCheck =
7522 emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false);
7523
7524 // Generate the induction variable.
7525 EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
7526
7527 replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
7528 return LoopVectorPreHeader;
7529}
7530
7531void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7532 LLVM_DEBUG({
7533 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7534 << "Main Loop VF:" << EPI.MainLoopVF
7535 << ", Main Loop UF:" << EPI.MainLoopUF
7536 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7537 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7538 });
7539}
7540
7541void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7542 DEBUG_WITH_TYPE(VerboseDebug, {
7543 dbgs() << "intermediate fn:\n"
7544 << *OrigLoop->getHeader()->getParent() << "\n";
7545 });
7546}
7547
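// Emit a minimum-iteration-count check that branches to \p Bypass when too few
// iterations remain, using the epilogue VF/UF if \p ForEpilogue is true and the
// main loop VF/UF otherwise.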
7548BasicBlock *
7549EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7550 bool ForEpilogue) {
7551 assert(Bypass && "Expected valid bypass basic block.");
7552 Value *Count = getTripCount();
7553 MinProfitableTripCount = ElementCount::getFixed(MinVal: 0);
7554 Value *CheckMinIters = createIterationCountCheck(
7555 VF: ForEpilogue ? EPI.EpilogueVF : VF, UF: ForEpilogue ? EPI.EpilogueUF : UF);
7556
7557 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7558 if (!ForEpilogue)
7559 TCCheckBlock->setName("vector.main.loop.iter.check");
7560
7561 // Create new preheader for vector loop.
7562 LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
7563 DT: static_cast<DominatorTree *>(nullptr), LI,
7564 MSSAU: nullptr, BBName: "vector.ph");
7565
7566 if (ForEpilogue) {
7567 // Save the trip count so we don't have to regenerate it in the
7568 // vec.epilog.iter.check. This is safe to do because the trip count
7569 // generated here dominates the vector epilog iter check.
7570 EPI.TripCount = Count;
7571 }
7572
7573 BranchInst &BI =
7574 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
7575 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
7576 setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
7577 ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
7578
7579 // When vectorizing the main loop, its trip-count check is placed in a new
7580 // block, whereas the overall trip-count check is placed in the VPlan entry
7581 // block. When vectorizing the epilogue loop, its trip-count check is placed
7582 // in the VPlan entry block.
7583 if (!ForEpilogue)
7584 introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock);
7585 return TCCheckBlock;
7586}
7587
7588//===--------------------------------------------------------------------===//
7589// EpilogueVectorizerEpilogueLoop
7590//===--------------------------------------------------------------------===//
7591
7592/// This function is partially responsible for generating the control flow
7593/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7594BasicBlock *
7595EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7596 createVectorLoopSkeleton(Prefix: "vec.epilog.");
7597
7598   // Now compare the remaining count; if there aren't enough iterations to
7599   // execute the vectorized epilogue, skip to the scalar part.
7600 LoopVectorPreHeader->setName("vec.epilog.ph");
7601 BasicBlock *VecEpilogueIterationCountCheck =
7602 SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->begin(), DT, LI,
7603 MSSAU: nullptr, BBName: "vec.epilog.iter.check", Before: true);
7604 emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader,
7605 Insert: VecEpilogueIterationCountCheck);
7606 AdditionalBypassBlock = VecEpilogueIterationCountCheck;
7607
7608 // Adjust the control flow taking the state info from the main loop
7609 // vectorization into account.
7610 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7611 "expected this to be saved from the previous pass.");
7612 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7613 From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader);
7614
7615 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7616 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7617
7618 if (EPI.SCEVSafetyCheck)
7619 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7620 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7621 if (EPI.MemSafetyCheck)
7622 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7623 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7624
7625 DT->changeImmediateDominator(BB: LoopScalarPreHeader,
7626 NewBB: EPI.EpilogueIterationCountCheck);
7627
7628 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7629 // reductions which merge control-flow from the latch block and the middle
7630 // block. Update the incoming values here and move the Phi into the preheader.
7631 SmallVector<PHINode *, 4> PhisInBlock(
7632 llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));
7633
7634 for (PHINode *Phi : PhisInBlock) {
7635 Phi->moveBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt());
7636 Phi->replaceIncomingBlockWith(
7637 Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
7638 New: VecEpilogueIterationCountCheck);
7639
7640 // If the phi doesn't have an incoming value from the
7641 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7642 // value and also those from other check blocks. This is needed for
7643 // reduction phis only.
7644 if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
7645 return EPI.EpilogueIterationCountCheck == IncB;
7646 }))
7647 continue;
7648 Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
7649 if (EPI.SCEVSafetyCheck)
7650 Phi->removeIncomingValue(BB: EPI.SCEVSafetyCheck);
7651 if (EPI.MemSafetyCheck)
7652 Phi->removeIncomingValue(BB: EPI.MemSafetyCheck);
7653 }
7654
7655 replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
7656 return LoopVectorPreHeader;
7657}
7658
7659BasicBlock *
7660EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7661 BasicBlock *Bypass, BasicBlock *Insert) {
7662
7663 assert(EPI.TripCount &&
7664 "Expected trip count to have been saved in the first pass.");
7665 Value *TC = EPI.TripCount;
7666 IRBuilder<> Builder(Insert->getTerminator());
7667 Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining");
7668
7669   // Generate code to check if the remaining iteration count is less than the
7670   // VF * UF of the vector epilogue loop.
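  // When a scalar epilogue is required, use ULE so that a remaining count equal
  // to the epilogue step still branches to the scalar loop, leaving at least
  // one iteration for it; otherwise ULT is sufficient.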
7671 auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector())
7672 ? ICmpInst::ICMP_ULE
7673 : ICmpInst::ICMP_ULT;
7674
7675 Value *CheckMinIters =
7676 Builder.CreateICmp(P, LHS: Count,
7677 RHS: createStepForVF(B&: Builder, Ty: Count->getType(),
7678 VF: EPI.EpilogueVF, Step: EPI.EpilogueUF),
7679 Name: "min.epilog.iters.check");
7680
7681 BranchInst &BI =
7682 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
7683 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) {
7684 // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
7685 // think the MainLoopStep is correct.
7686 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7687 unsigned EpilogueLoopStep =
7688 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7689 // We assume the remaining `Count` is equally distributed in
7690 // [0, MainLoopStep)
7691 // So the probability for `Count < EpilogueLoopStep` should be
7692 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
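    // E.g., MainLoopStep = 16 and EpilogueLoopStep = 4 give an estimated skip
    // probability of 4/16, i.e. branch weights {4, 12}.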
7693 unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep);
7694 const uint32_t Weights[] = {EstimatedSkipCount,
7695 MainLoopStep - EstimatedSkipCount};
7696 setBranchWeights(I&: BI, Weights, /*IsExpected=*/false);
7697 }
7698 ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI);
7699
7700 // A new entry block has been created for the epilogue VPlan. Hook it in, as
7701 // otherwise we would try to modify the entry to the main vector loop.
7702 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: Insert);
7703 VPBasicBlock *OldEntry = Plan.getEntry();
7704 VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
7705 Plan.setEntry(NewEntry);
7706 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7707
7708 return Insert;
7709}
7710
7711void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7712 LLVM_DEBUG({
7713 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7714 << "Epilogue Loop VF:" << EPI.EpilogueVF
7715 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7716 });
7717}
7718
7719void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7720 DEBUG_WITH_TYPE(VerboseDebug, {
7721 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7722 });
7723}
7724
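// Try to build a widened load/store recipe for \p I; for consecutive accesses
// this also creates the recipe computing the (possibly reversed) vector
// pointer. Returns nullptr if the access will not be widened for the VFs in
// \p Range.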
7725VPWidenMemoryRecipe *
7726VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7727 VFRange &Range) {
7728 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7729 "Must be called with either a load or store");
7730
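  // The recipe is only created if, across the clamped VF range, the cost model
  // did not decide to scalarize the access; interleaved, consecutive (possibly
  // reversed) and gather/scatter accesses all qualify.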
7731 auto WillWiden = [&](ElementCount VF) -> bool {
7732 LoopVectorizationCostModel::InstWidening Decision =
7733 CM.getWideningDecision(I, VF);
7734 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7735 "CM decision should be taken at this point.");
7736 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7737 return true;
7738 if (CM.isScalarAfterVectorization(I, VF) ||
7739 CM.isProfitableToScalarize(I, VF))
7740 return false;
7741 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7742 };
7743
7744 if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
7745 return nullptr;
7746
7747 VPValue *Mask = nullptr;
7748 if (Legal->isMaskRequired(I))
7749 Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
7750
7751 // Determine if the pointer operand of the access is either consecutive or
7752 // reverse consecutive.
7753 LoopVectorizationCostModel::InstWidening Decision =
7754 CM.getWideningDecision(I, VF: Range.Start);
7755 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
7756 bool Consecutive =
7757 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
7758
7759 VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1];
7760 if (Consecutive) {
7761 auto *GEP = dyn_cast<GetElementPtrInst>(
7762 Val: Ptr->getUnderlyingValue()->stripPointerCasts());
7763 VPSingleDefRecipe *VectorPtr;
7764 if (Reverse) {
7765       // When folding the tail, we may compute an address that would not be
7766       // computed in the original scalar loop, and it may not be inbounds. Drop
7767       // the inbounds flag in that case.
7768 GEPNoWrapFlags Flags =
7769 (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
7770 ? GEPNoWrapFlags::none()
7771 : GEPNoWrapFlags::inBounds();
7772 VectorPtr =
7773 new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
7774 /*Stride*/ -1, Flags, I->getDebugLoc());
7775 } else {
7776 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7777 GEP ? GEP->getNoWrapFlags()
7778 : GEPNoWrapFlags::none(),
7779 I->getDebugLoc());
7780 }
7781 Builder.insert(R: VectorPtr);
7782 Ptr = VectorPtr;
7783 }
7784 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I))
7785 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7786 VPIRMetadata(*Load, LVer), I->getDebugLoc());
7787
7788 StoreInst *Store = cast<StoreInst>(Val: I);
7789 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7790 Reverse, VPIRMetadata(*Store, LVer),
7791 I->getDebugLoc());
7792}
7793
7794 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7795/// insert a recipe to expand the step for the induction recipe.
7796static VPWidenIntOrFpInductionRecipe *
7797createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
7798 VPValue *Start, const InductionDescriptor &IndDesc,
7799 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7800 assert(IndDesc.getStartValue() ==
7801 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7802 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7803 "step must be loop invariant");
7804
7805 VPValue *Step =
7806 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE);
7807 if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) {
7808 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7809 IndDesc, TruncI,
7810 TruncI->getDebugLoc());
7811 }
7812 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7813 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
7814 IndDesc, Phi->getDebugLoc());
7815}
7816
7817VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7818 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
7819
7820 // Check if this is an integer or fp induction. If so, build the recipe that
7821 // produces its scalar and vector values.
7822 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7823 return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan,
7824 SE&: *PSE.getSE(), OrigLoop&: *OrigLoop);
7825
7826 // Check if this is pointer induction. If so, build the recipe for it.
7827 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7828 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(),
7829 SE&: *PSE.getSE());
7830 return new VPWidenPointerInductionRecipe(
7831 Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
7832 LoopVectorizationPlanner::getDecisionAndClampRange(
7833 Predicate: [&](ElementCount VF) {
7834 return CM.isScalarAfterVectorization(I: Phi, VF);
7835 },
7836 Range),
7837 Phi->getDebugLoc());
7838 }
7839 return nullptr;
7840}
7841
7842VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7843 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
7844 // Optimize the special case where the source is a constant integer
7845 // induction variable. Notice that we can only optimize the 'trunc' case
7846 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7847 // (c) other casts depend on pointer size.
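// Illustrative example: for an i64 induction %i that is only used as
// 'trunc i64 %i to i32', the truncate itself can be widened as an i32
// induction instead of truncating a wide i64 vector IV on every iteration.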
7848
7849 // Determine whether \p K is a truncation based on an induction variable that
7850 // can be optimized.
7851 auto IsOptimizableIVTruncate =
7852 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7853 return [=](ElementCount VF) -> bool {
7854 return CM.isOptimizableIVTruncate(I: K, VF);
7855 };
7856 };
7857
7858 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7859 Predicate: IsOptimizableIVTruncate(I), Range)) {
7860
7861 auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0));
7862 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
7863 VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue());
7864 return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(),
7865 OrigLoop&: *OrigLoop);
7866 }
7867 return nullptr;
7868}
7869
7870VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
7871 ArrayRef<VPValue *> Operands,
7872 VFRange &Range) {
7873 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7874 Predicate: [this, CI](ElementCount VF) {
7875 return CM.isScalarWithPredication(I: CI, VF);
7876 },
7877 Range);
7878
7879 if (IsPredicated)
7880 return nullptr;
7881
7882 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7883 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7884 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7885 ID == Intrinsic::pseudoprobe ||
7886 ID == Intrinsic::experimental_noalias_scope_decl))
7887 return nullptr;
7888
7889 SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size()));
7890
7891 // Is it beneficial to perform intrinsic call compared to lib call?
7892 bool ShouldUseVectorIntrinsic =
7893 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
7894 Predicate: [&](ElementCount VF) -> bool {
7895 return CM.getCallWideningDecision(CI, VF).Kind ==
7896 LoopVectorizationCostModel::CM_IntrinsicCall;
7897 },
7898 Range);
7899 if (ShouldUseVectorIntrinsic)
7900 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
7901 CI->getDebugLoc());
7902
7903 Function *Variant = nullptr;
7904 std::optional<unsigned> MaskPos;
7905 // Is it better to call a vectorized version of the function than to
7906 // scalarize the call?
7907 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7908 Predicate: [&](ElementCount VF) -> bool {
7909 // The following case may be scalarized depending on the VF.
7910 // The flag shows whether we can use an ordinary call to a vector variant
7911 // of the function for the vectorized version of the instruction.
7912
7913 // If we've found a variant at a previous VF, then stop looking. A
7914 // vectorized variant of a function expects input in a certain shape
7915 // -- basically the number of input registers, the number of lanes
7916 // per register, and whether there's a mask required.
7917 // We store a pointer to the variant in the VPWidenCallRecipe, so
7918 // once we have an appropriate variant it's only valid for that VF.
7919 // This will force a different vplan to be generated for each VF that
7920 // finds a valid variant.
7921 if (Variant)
7922 return false;
7923 LoopVectorizationCostModel::CallWideningDecision Decision =
7924 CM.getCallWideningDecision(CI, VF);
7925 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
7926 Variant = Decision.Variant;
7927 MaskPos = Decision.MaskPos;
7928 return true;
7929 }
7930
7931 return false;
7932 },
7933 Range);
7934 if (ShouldUseVectorCall) {
7935 if (MaskPos.has_value()) {
7936 // We have 2 cases that would require a mask:
7937 // 1) The block needs to be predicated, either due to a conditional
7938 // in the scalar loop or use of an active lane mask with
7939 // tail-folding, and we use the appropriate mask for the block.
7940 // 2) No mask is required for the block, but the only available
7941 // vector variant at this VF requires a mask, so we synthesize an
7942 // all-true mask.
7943 VPValue *Mask = nullptr;
7944 if (Legal->isMaskRequired(I: CI))
7945 Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
7946 else
7947 Mask = Plan.getOrAddLiveIn(
7948 V: ConstantInt::getTrue(Ty: IntegerType::getInt1Ty(C&: CI->getContext())));
7949
7950 Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
7951 }
7952
7953 Ops.push_back(Elt: Operands.back());
7954 return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
7955 }
7956
7957 return nullptr;
7958}
7959
7960bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7961 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7962 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7963 // Instruction should be widened, unless it is scalar after vectorization,
7964 // scalarization is profitable or it is predicated.
7965 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7966 return CM.isScalarAfterVectorization(I, VF) ||
7967 CM.isProfitableToScalarize(I, VF) ||
7968 CM.isScalarWithPredication(I, VF);
7969 };
7970 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7971 Range);
7972}
7973
7974VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
7975 ArrayRef<VPValue *> Operands) {
7976 switch (I->getOpcode()) {
7977 default:
7978 return nullptr;
7979 case Instruction::SDiv:
7980 case Instruction::UDiv:
7981 case Instruction::SRem:
7982 case Instruction::URem: {
7983 // If not provably safe, use a select to form a safe divisor before widening the
7984 // div/rem operation itself. Otherwise fall through to general handling below.
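// Minimal sketch of the select below: for a predicated 'udiv %x, %y', the
// divisor becomes 'select(block-mask, %y, 1)', so masked-off lanes divide
// by 1 and cannot trap.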
7985 if (CM.isPredicatedInst(I)) {
7986 SmallVector<VPValue *> Ops(Operands);
7987 VPValue *Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
7988 VPValue *One =
7989 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false));
7990 auto *SafeRHS = Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: I->getDebugLoc());
7991 Ops[1] = SafeRHS;
7992 return new VPWidenRecipe(*I, Ops);
7993 }
7994 [[fallthrough]];
7995 }
7996 case Instruction::Add:
7997 case Instruction::And:
7998 case Instruction::AShr:
7999 case Instruction::FAdd:
8000 case Instruction::FCmp:
8001 case Instruction::FDiv:
8002 case Instruction::FMul:
8003 case Instruction::FNeg:
8004 case Instruction::FRem:
8005 case Instruction::FSub:
8006 case Instruction::ICmp:
8007 case Instruction::LShr:
8008 case Instruction::Mul:
8009 case Instruction::Or:
8010 case Instruction::Select:
8011 case Instruction::Shl:
8012 case Instruction::Sub:
8013 case Instruction::Xor:
8014 case Instruction::Freeze: {
8015 SmallVector<VPValue *> NewOps(Operands);
8016 if (Instruction::isBinaryOp(Opcode: I->getOpcode())) {
8017 // The legacy cost model uses SCEV to check if some of the operands are
8018 // constants. To match the legacy cost model's behavior, use SCEV to try
8019 // to replace operands with constants.
8020 ScalarEvolution &SE = *PSE.getSE();
8021 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
8022 if (!Op->isLiveIn())
8023 return Op;
8024 Value *V = Op->getUnderlyingValue();
8025 if (isa<Constant>(Val: V) || !SE.isSCEVable(Ty: V->getType()))
8026 return Op;
8027 auto *C = dyn_cast<SCEVConstant>(Val: SE.getSCEV(V));
8028 if (!C)
8029 return Op;
8030 return Plan.getOrAddLiveIn(V: C->getValue());
8031 };
8032 // For Mul, the legacy cost model checks both operands.
8033 if (I->getOpcode() == Instruction::Mul)
8034 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8035 // For other binops, the legacy cost model only checks the second operand.
8036 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8037 }
8038 return new VPWidenRecipe(*I, NewOps);
8039 }
8040 case Instruction::ExtractValue: {
8041 SmallVector<VPValue *> NewOps(Operands);
8042 Type *I32Ty = IntegerType::getInt32Ty(C&: I->getContext());
8043 auto *EVI = cast<ExtractValueInst>(Val: I);
8044 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
8045 unsigned Idx = EVI->getIndices()[0];
8046 NewOps.push_back(Elt: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: Idx, IsSigned: false)));
8047 return new VPWidenRecipe(*I, NewOps);
8048 }
8049 };
8050}
8051
8052VPHistogramRecipe *
8053VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
8054 ArrayRef<VPValue *> Operands) {
8055 // FIXME: Support other operations.
8056 unsigned Opcode = HI->Update->getOpcode();
8057 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8058 "Histogram update operation must be an Add or Sub");
8059
8060 SmallVector<VPValue *, 3> HGramOps;
8061 // Bucket address.
8062 HGramOps.push_back(Elt: Operands[1]);
8063 // Increment value.
8064 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
8065
8066 // In case of predicated execution (due to tail-folding, or conditional
8067 // execution, or both), pass the relevant mask.
8068 if (Legal->isMaskRequired(I: HI->Store))
8069 HGramOps.push_back(Elt: getBlockInMask(VPBB: Builder.getInsertBlock()));
8070
8071 return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
8072}
8073
8074VPReplicateRecipe *
8075VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
8076 VFRange &Range) {
8077 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8078 Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8079 Range);
8080
8081 bool IsPredicated = CM.isPredicatedInst(I);
8082
8083 // Even if the instruction is not marked as uniform, there are certain
8084 // intrinsic calls that can be effectively treated as such, so we check for
8085 // them here. Conservatively, we only do this for scalable vectors, since
8086 // for fixed-width VFs we can always fall back on full scalarization.
8087 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
8088 switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
8089 case Intrinsic::assume:
8090 case Intrinsic::lifetime_start:
8091 case Intrinsic::lifetime_end:
8092 // For scalable vectors if one of the operands is variant then we still
8093 // want to mark as uniform, which will generate one instruction for just
8094 // the first lane of the vector. We can't scalarize the call in the same
8095 // way as for fixed-width vectors because we don't know how many lanes
8096 // there are.
8097 //
8098 // The reasons for doing it this way for scalable vectors are:
8099 // 1. For the assume intrinsic, generating the instruction for the first
8100 // lane is still better than not generating any at all. For
8101 // example, the input may be a splat across all lanes.
8102 // 2. For the lifetime start/end intrinsics the pointer operand only
8103 // does anything useful when the input comes from a stack object,
8104 // which suggests it should always be uniform. For non-stack objects
8105 // the effect is to poison the object, which still allows us to
8106 // remove the call.
8107 IsUniform = true;
8108 break;
8109 default:
8110 break;
8111 }
8112 }
8113 VPValue *BlockInMask = nullptr;
8114 if (!IsPredicated) {
8115 // Finalize the recipe for Instr, first if it is not predicated.
8116 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8117 } else {
8118 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8119 // Instructions marked for predication are replicated and a mask operand is
8120 // added initially. Masked replicate recipes will later be placed under an
8121 // if-then construct to prevent side-effects. Generate recipes to compute
8122 // the block mask for this region.
8123 BlockInMask = getBlockInMask(VPBB: Builder.getInsertBlock());
8124 }
8125
8126 // Note that there is some custom logic to mark some intrinsics as uniform
8127 // manually above for scalable vectors, which this assert needs to account for
8128 // as well.
8129 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8130 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8131 "Should not predicate a uniform recipe");
8132 auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
8133 VPIRMetadata(*I, LVer));
8134 return Recipe;
8135}
8136
8137/// Find all possible partial reductions in the loop and track all of those that
8138/// are valid so recipes can be formed later.
8139void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
8140 // Find all possible partial reductions.
8141 SmallVector<std::pair<PartialReductionChain, unsigned>>
8142 PartialReductionChains;
8143 for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
8144 getScaledReductions(PHI: Phi, RdxExitInstr: RdxDesc.getLoopExitInstr(), Range,
8145 Chains&: PartialReductionChains);
8146 }
8147
8148 // A partial reduction is invalid if any of its extends are used by
8149 // something that isn't another partial reduction. This is because the
8150 // extends are intended to be lowered along with the reduction itself.
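// Illustrative example: in 'sum += sext(a[i]) * sext(b[i])', both sexts must
// only feed the multiply forming the partial reduction; if either extend has
// another user, the chain is not recorded as a scaled reduction.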
8151
8152 // Build up a set of partial reduction ops for efficient use checking.
8153 SmallSet<User *, 4> PartialReductionOps;
8154 for (const auto &[PartialRdx, _] : PartialReductionChains)
8155 PartialReductionOps.insert(Ptr: PartialRdx.ExtendUser);
8156
8157 auto ExtendIsOnlyUsedByPartialReductions =
8158 [&PartialReductionOps](Instruction *Extend) {
8159 return all_of(Range: Extend->users(), P: [&](const User *U) {
8160 return PartialReductionOps.contains(Ptr: U);
8161 });
8162 };
8163
8164 // Check if each use of a chain's two extends is a partial reduction
8165 // and only add those that don't have non-partial reduction users.
8166 for (auto Pair : PartialReductionChains) {
8167 PartialReductionChain Chain = Pair.first;
8168 if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
8169 (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
8170 ScaledReductionMap.try_emplace(Key: Chain.Reduction, Args&: Pair.second);
8171 }
8172}
8173
8174bool VPRecipeBuilder::getScaledReductions(
8175 Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
8176 SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
8177 if (!CM.TheLoop->contains(Inst: RdxExitInstr))
8178 return false;
8179
8180 auto *Update = dyn_cast<BinaryOperator>(Val: RdxExitInstr);
8181 if (!Update)
8182 return false;
8183
8184 Value *Op = Update->getOperand(i_nocapture: 0);
8185 Value *PhiOp = Update->getOperand(i_nocapture: 1);
8186 if (Op == PHI)
8187 std::swap(a&: Op, b&: PhiOp);
8188
8189 // Try and get a scaled reduction from the first non-phi operand.
8190 // If one is found, we use the discovered reduction instruction in
8191 // place of the accumulator for costing.
8192 if (auto *OpInst = dyn_cast<Instruction>(Val: Op)) {
8193 if (getScaledReductions(PHI, RdxExitInstr: OpInst, Range, Chains)) {
8194 PHI = Chains.rbegin()->first.Reduction;
8195
8196 Op = Update->getOperand(i_nocapture: 0);
8197 PhiOp = Update->getOperand(i_nocapture: 1);
8198 if (Op == PHI)
8199 std::swap(a&: Op, b&: PhiOp);
8200 }
8201 }
8202 if (PhiOp != PHI)
8203 return false;
8204
8205 using namespace llvm::PatternMatch;
8206
8207 // If the update is a binary operator, check both of its operands to see if
8208 // they are extends. Otherwise, see if the update comes directly from an
8209 // extend.
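// Illustrative examples: 'acc += sext(a) * sext(b)' is the binary-operator
// case (two extends feeding a multiply), while 'acc += sext(a)' is the case
// where the update comes directly from a single extend.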
8210 Instruction *Exts[2] = {nullptr};
8211 BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Val: Op);
8212 std::optional<unsigned> BinOpc;
8213 Type *ExtOpTypes[2] = {nullptr};
8214
8215 auto CollectExtInfo = [&Exts,
8216 &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
8217 unsigned I = 0;
8218 for (Value *OpI : Ops) {
8219 Value *ExtOp;
8220 if (!match(V: OpI, P: m_ZExtOrSExt(Op: m_Value(V&: ExtOp))))
8221 return false;
8222 Exts[I] = cast<Instruction>(Val: OpI);
8223 ExtOpTypes[I] = ExtOp->getType();
8224 I++;
8225 }
8226 return true;
8227 };
8228
8229 if (ExtendUser) {
8230 if (!ExtendUser->hasOneUse())
8231 return false;
8232
8233 // Use the side effect of match to replace ExtendUser only if the pattern
8234 // matches; we don't care at this point whether it actually matched.
8235 match(V: ExtendUser, P: m_Neg(V: m_BinOp(I&: ExtendUser)));
8236
8237 SmallVector<Value *> Ops(ExtendUser->operands());
8238 if (!CollectExtInfo(Ops))
8239 return false;
8240
8241 BinOpc = std::make_optional(t: ExtendUser->getOpcode());
8242 } else if (match(V: Update, P: m_Add(L: m_Value(), R: m_Value()))) {
8243 // We already know the operands for Update are Op and PhiOp.
8244 SmallVector<Value *> Ops({Op});
8245 if (!CollectExtInfo(Ops))
8246 return false;
8247
8248 ExtendUser = Update;
8249 BinOpc = std::nullopt;
8250 } else
8251 return false;
8252
8253 TTI::PartialReductionExtendKind OpAExtend =
8254 TTI::getPartialReductionExtendKind(I: Exts[0]);
8255 TTI::PartialReductionExtendKind OpBExtend =
8256 Exts[1] ? TTI::getPartialReductionExtendKind(I: Exts[1]) : TTI::PR_None;
8257 PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);
8258
8259 TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
8260 TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
8261 if (!PHISize.hasKnownScalarFactor(RHS: ASize))
8262 return false;
8263 unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(RHS: ASize);
8264
8265 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8266 Predicate: [&](ElementCount VF) {
8267 InstructionCost Cost = TTI->getPartialReductionCost(
8268 Opcode: Update->getOpcode(), InputTypeA: ExtOpTypes[0], InputTypeB: ExtOpTypes[1],
8269 AccumType: PHI->getType(), VF, OpAExtend, OpBExtend, BinOp: BinOpc, CostKind: CM.CostKind);
8270 return Cost.isValid();
8271 },
8272 Range)) {
8273 Chains.emplace_back(Args&: Chain, Args&: TargetScaleFactor);
8274 return true;
8275 }
8276
8277 return false;
8278}
8279
8280VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
8281 VFRange &Range) {
8282 // First, check for specific widening recipes that deal with inductions, Phi
8283 // nodes, calls and memory operations.
8284 VPRecipeBase *Recipe;
8285 Instruction *Instr = R->getUnderlyingInstr();
8286 SmallVector<VPValue *, 4> Operands(R->operands());
8287 if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(Val: R)) {
8288 VPBasicBlock *Parent = PhiR->getParent();
8289 [[maybe_unused]] VPRegionBlock *LoopRegionOf =
8290 Parent->getEnclosingLoopRegion();
8291 assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
8292 "Non-header phis should have been handled during predication");
8293 auto *Phi = cast<PHINode>(Val: R->getUnderlyingInstr());
8294 assert(Operands.size() == 2 && "Must have 2 operands for header phis");
8295 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8296 return Recipe;
8297
8298 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8299 assert((Legal->isReductionVariable(Phi) ||
8300 Legal->isFixedOrderRecurrence(Phi)) &&
8301 "can only widen reductions and fixed-order recurrences here");
8302 VPValue *StartV = Operands[0];
8303 if (Legal->isReductionVariable(PN: Phi)) {
8304 const RecurrenceDescriptor &RdxDesc =
8305 Legal->getReductionVars().find(Key: Phi)->second;
8306 assert(RdxDesc.getRecurrenceStartValue() ==
8307 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8308
8309 // If the PHI is used by a partial reduction, set the scale factor.
8310 unsigned ScaleFactor =
8311 getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr()).value_or(u: 1);
8312 PhiRecipe = new VPReductionPHIRecipe(
8313 Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
8314 CM.useOrderedReductions(RdxDesc), ScaleFactor);
8315 } else {
8316 // TODO: Currently fixed-order recurrences are modeled as chains of
8317 // first-order recurrences. If there are no users of the intermediate
8318 // recurrences in the chain, the fixed order recurrence should be modeled
8319 // directly, enabling more efficient codegen.
8320 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8321 }
8322 // Add backedge value.
8323 PhiRecipe->addOperand(Operand: Operands[1]);
8324 return PhiRecipe;
8325 }
8326
8327 if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate(
8328 I: cast<TruncInst>(Val: Instr), Operands, Range)))
8329 return Recipe;
8330
8331 // All widen recipes below deal only with VF > 1.
8332 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8333 Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
8334 return nullptr;
8335
8336 if (auto *CI = dyn_cast<CallInst>(Val: Instr))
8337 return tryToWidenCall(CI, Operands, Range);
8338
8339 if (StoreInst *SI = dyn_cast<StoreInst>(Val: Instr))
8340 if (auto HistInfo = Legal->getHistogramInfo(I: SI))
8341 return tryToWidenHistogram(HI: *HistInfo, Operands);
8342
8343 if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr))
8344 return tryToWidenMemory(I: Instr, Operands, Range);
8345
8346 if (std::optional<unsigned> ScaleFactor = getScalingForReduction(ExitInst: Instr))
8347 return tryToCreatePartialReduction(Reduction: Instr, Operands, ScaleFactor: ScaleFactor.value());
8348
8349 if (!shouldWiden(I: Instr, Range))
8350 return nullptr;
8351
8352 if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Instr))
8353 return new VPWidenGEPRecipe(GEP, Operands);
8354
8355 if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) {
8356 return new VPWidenSelectRecipe(*SI, Operands);
8357 }
8358
8359 if (auto *CI = dyn_cast<CastInst>(Val: Instr)) {
8360 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8361 *CI);
8362 }
8363
8364 return tryToWiden(I: Instr, Operands);
8365}
8366
8367VPRecipeBase *
8368VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
8369 ArrayRef<VPValue *> Operands,
8370 unsigned ScaleFactor) {
8371 assert(Operands.size() == 2 &&
8372 "Unexpected number of operands for partial reduction");
8373
8374 VPValue *BinOp = Operands[0];
8375 VPValue *Accumulator = Operands[1];
8376 VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
8377 if (isa<VPReductionPHIRecipe>(Val: BinOpRecipe) ||
8378 isa<VPPartialReductionRecipe>(Val: BinOpRecipe))
8379 std::swap(a&: BinOp, b&: Accumulator);
8380
8381 unsigned ReductionOpcode = Reduction->getOpcode();
8382 if (ReductionOpcode == Instruction::Sub) {
8383 auto *const Zero = ConstantInt::get(Ty: Reduction->getType(), V: 0);
8384 SmallVector<VPValue *, 2> Ops;
8385 Ops.push_back(Elt: Plan.getOrAddLiveIn(V: Zero));
8386 Ops.push_back(Elt: BinOp);
8387 BinOp = new VPWidenRecipe(*Reduction, Ops);
8388 Builder.insert(R: BinOp->getDefiningRecipe());
8389 ReductionOpcode = Instruction::Add;
8390 }
8391
8392 VPValue *Cond = nullptr;
8393 if (CM.blockNeedsPredicationForAnyReason(BB: Reduction->getParent())) {
8394 assert((ReductionOpcode == Instruction::Add ||
8395 ReductionOpcode == Instruction::Sub) &&
8396 "Expected an ADD or SUB operation for predicated partial "
8397 "reductions (because the neutral element in the mask is zero)!");
8398 Cond = getBlockInMask(VPBB: Builder.getInsertBlock());
8399 VPValue *Zero =
8400 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: Reduction->getType(), V: 0));
8401 BinOp = Builder.createSelect(Cond, TrueVal: BinOp, FalseVal: Zero, DL: Reduction->getDebugLoc());
8402 }
8403 return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
8404 ScaleFactor, Reduction);
8405}
8406
8407void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8408 ElementCount MaxVF) {
8409 if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
8410 return;
8411
8412 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8413
8414 const LoopAccessInfo *LAI = Legal->getLAI();
8415 LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
8416 OrigLoop, LI, DT, PSE.getSE());
8417 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
8418 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
8419 // Only use noalias metadata when using memory checks guaranteeing no
8420 // overlap across all iterations.
8421 LVer.prepareNoAliasMetadata();
8422 }
8423
8424 auto MaxVFTimes2 = MaxVF * 2;
8425 auto VPlan0 = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI);
8426 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
8427 VFRange SubRange = {VF, MaxVFTimes2};
8428 if (auto Plan = tryToBuildVPlanWithVPRecipes(
8429 InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
8430 bool HasScalarVF = Plan->hasScalarVFOnly();
8431 // Now optimize the initial VPlan.
8432 if (!HasScalarVF)
8433 VPlanTransforms::runPass(Fn: VPlanTransforms::truncateToMinimalBitwidths,
8434 Plan&: *Plan, Args: CM.getMinimalBitwidths());
8435 VPlanTransforms::runPass(Fn: VPlanTransforms::optimize, Plan&: *Plan);
8436 // TODO: try to put it close to addActiveLaneMask().
8437 // Discard the plan if it is not EVL-compatible
8438 if (CM.foldTailWithEVL() && !HasScalarVF &&
8439 !VPlanTransforms::runPass(Transform: VPlanTransforms::tryAddExplicitVectorLength,
8440 Plan&: *Plan, Args: CM.getMaxSafeElements()))
8441 break;
8442 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8443 VPlans.push_back(Elt: std::move(Plan));
8444 }
8445 VF = SubRange.End;
8446 }
8447}
8448
8449/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
8450/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
8451/// the end value of the induction.
8452static VPInstruction *addResumePhiRecipeForInduction(
8453 VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
8454 VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
8455 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
8456 // Truncated wide inductions resume from the last lane of their vector value
8457 // in the last vector iteration which is handled elsewhere.
8458 if (WideIntOrFp && WideIntOrFp->getTruncInst())
8459 return nullptr;
8460
8461 VPValue *Start = WideIV->getStartValue();
8462 VPValue *Step = WideIV->getStepValue();
8463 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
8464 VPValue *EndValue = VectorTC;
8465 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8466 EndValue = VectorPHBuilder.createDerivedIV(
8467 Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
8468 Start, Current: VectorTC, Step);
8469 }
8470
8471 // EndValue is derived from the vector trip count (which has the same type as
8472 // the widest induction) and thus may be wider than the induction here.
8473 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV);
8474 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) {
8475 EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
8476 ResultTy: ScalarTypeOfWideIV,
8477 DL: WideIV->getDebugLoc());
8478 }
8479
8480 auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
8481 IncomingValues: {EndValue, Start}, DL: WideIV->getDebugLoc(), Name: "bc.resume.val");
8482 return ResumePhiRecipe;
8483}
8484
8485/// Create resume phis in the scalar preheader for first-order recurrences,
8486/// reductions and inductions, and update the VPIRInstructions wrapping the
8487/// original phis in the scalar header. End values for inductions are added to
8488/// \p IVEndValues.
8489static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
8490 DenseMap<VPValue *, VPValue *> &IVEndValues) {
8491 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
8492 auto *ScalarPH = Plan.getScalarPreheader();
8493 auto *MiddleVPBB = cast<VPBasicBlock>(Val: ScalarPH->getPredecessors()[0]);
8494 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8495 VPBuilder VectorPHBuilder(
8496 cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor()));
8497 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8498 VPBuilder ScalarPHBuilder(ScalarPH);
8499 for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
8500 auto *ScalarPhiIRI = cast<VPIRPhi>(Val: &ScalarPhiR);
8501
8502 // TODO: Extract final value from induction recipe initially, optimize to
8503 // pre-computed end value together in optimizeInductionExitUsers.
8504 auto *VectorPhiR =
8505 cast<VPHeaderPHIRecipe>(Val: Builder.getRecipe(I: &ScalarPhiIRI->getIRPhi()));
8506 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(Val: VectorPhiR)) {
8507 if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
8508 WideIV: WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
8509 VectorTC: &Plan.getVectorTripCount())) {
8510 assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
8511 IVEndValues[WideIVR] = ResumePhi->getOperand(N: 0);
8512 ScalarPhiIRI->addOperand(Operand: ResumePhi);
8513 continue;
8514 }
8515 // TODO: Also handle truncated inductions here. Computing end-values
8516 // separately should be done as VPlan-to-VPlan optimization, after
8517 // legalizing all resume values to use the last lane from the loop.
8518 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
8519 "should only skip truncated wide inductions");
8520 continue;
8521 }
8522
8523 // The backedge value provides the value to resume coming out of a loop,
8524 // which for FORs is a vector whose last element needs to be extracted. The
8525 // start value provides the value if the loop is bypassed.
8526 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(Val: VectorPhiR);
8527 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
8528 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
8529 "Cannot handle loops with uncountable early exits");
8530 if (IsFOR)
8531 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
8532 Opcode: VPInstruction::ExtractLastElement, Operands: {ResumeFromVectorLoop}, Inst: {},
8533 Name: "vector.recur.extract");
8534 StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
8535 auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
8536 IncomingValues: {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, DL: {}, Name);
8537 ScalarPhiIRI->addOperand(Operand: ResumePhiR);
8538 }
8539}
8540
8541// Collect VPIRInstructions for phis in the exit block reached from the latch only.
8542static SetVector<VPIRInstruction *> collectUsersInLatchExitBlock(VPlan &Plan) {
8543 SetVector<VPIRInstruction *> ExitUsersToFix;
8544 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
8545
8546 if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock())
8547 continue;
8548
8549 for (VPRecipeBase &R : ExitVPBB->phis()) {
8550 auto *ExitIRI = cast<VPIRPhi>(Val: &R);
8551 assert(ExitIRI->getNumOperands() == 1 && "must have a single operand");
8552 VPValue *V = ExitIRI->getOperand(N: 0);
8553 if (V->isLiveIn())
8554 continue;
8555 assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
8556 "Only recipes defined inside a region should need fixing.");
8557 ExitUsersToFix.insert(X: ExitIRI);
8558 }
8559 }
8560 return ExitUsersToFix;
8561}
8562
8563// Add exit values to \p Plan. Extracts are added for each entry in \p
8564// ExitUsersToFix if needed and their operands are updated.
8565static void
8566addUsersInExitBlocks(VPlan &Plan,
8567 const SetVector<VPIRInstruction *> &ExitUsersToFix) {
8568 if (ExitUsersToFix.empty())
8569 return;
8570
8571 auto *MiddleVPBB = Plan.getMiddleBlock();
8572 VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8573
8574 // Introduce extract for exiting values and update the VPIRInstructions
8575 // modeling the corresponding LCSSA phis.
8576 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
8577 assert(ExitIRI->getNumOperands() == 1 &&
8578 ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
8579 "exit values from early exits must be fixed when branch to "
8580 "early-exit is added");
8581 ExitIRI->extractLastLaneOfFirstOperand(Builder&: B);
8582 }
8583}
8584
8585/// Handle users in the exit block for first-order recurrences in the original
8586/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
8587/// users in the original exit block using the VPIRInstruction wrapping the
8588/// LCSSA phi.
8589static void addExitUsersForFirstOrderRecurrences(
8590 VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix, VFRange &Range) {
8591 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8592 auto *ScalarPHVPBB = Plan.getScalarPreheader();
8593 auto *MiddleVPBB = Plan.getMiddleBlock();
8594 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8595 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
8596
8597 auto IsScalableOne = [](ElementCount VF) -> bool {
8598 return VF == ElementCount::getScalable(MinVal: 1);
8599 };
8600
8601 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8602 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
8603 if (!FOR)
8604 continue;
8605
8606 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
8607 "Cannot handle loops with uncountable early exits");
8608
8609 // This is the second phase of vectorizing first-order recurrences, creating
8610 // extracts for users outside the loop. An overview of the transformation is
8611 // described below. Suppose we have the following loop, with a use after
8612 // the loop of the last value of a[i-1],
8613 //
8614 // for (int i = 0; i < n; ++i) {
8615 // t = a[i - 1];
8616 // b[i] = a[i] - t;
8617 // }
8618 // use t;
8619 //
8620 // There is a first-order recurrence on "a". For this loop, the shorthand
8621 // scalar IR looks like:
8622 //
8623 // scalar.ph:
8624 // s.init = a[-1]
8625 // br scalar.body
8626 //
8627 // scalar.body:
8628 // i = phi [0, scalar.ph], [i+1, scalar.body]
8629 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
8630 // s2 = a[i]
8631 // b[i] = s2 - s1
8632 // br cond, scalar.body, exit.block
8633 //
8634 // exit.block:
8635 // use = lcssa.phi [s1, scalar.body]
8636 //
8637 // In this example, s1 is a recurrence because its value depends on the
8638 // previous iteration. In the first phase of vectorization, we created a
8639 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
8640 // for users in the scalar preheader and exit block.
8641 //
8642 // vector.ph:
8643 // v_init = vector(..., ..., ..., a[-1])
8644 // br vector.body
8645 //
8646 // vector.body
8647 // i = phi [0, vector.ph], [i+4, vector.body]
8648 // v1 = phi [v_init, vector.ph], [v2, vector.body]
8649 // v2 = a[i, i+1, i+2, i+3]
8650 // b[i] = v2 - v1
8651 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
8652 // b[i, i+1, i+2, i+3] = v2 - v1
8653 // br cond, vector.body, middle.block
8654 //
8655 // middle.block:
8656 // vector.recur.extract.for.phi = v2(2)
8657 // vector.recur.extract = v2(3)
8658 // br cond, scalar.ph, exit.block
8659 //
8660 // scalar.ph:
8661 // scalar.recur.init = phi [vector.recur.extract, middle.block],
8662 // [s.init, otherwise]
8663 // br scalar.body
8664 //
8665 // scalar.body:
8666 // i = phi [0, scalar.ph], [i+1, scalar.body]
8667 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
8668 // s2 = a[i]
8669 // b[i] = s2 - s1
8670 // br cond, scalar.body, exit.block
8671 //
8672 // exit.block:
8673 // lo = lcssa.phi [s1, scalar.body],
8674 // [vector.recur.extract.for.phi, middle.block]
8675 //
8676 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
8677 // Extract the penultimate value of the recurrence and use it as operand for
8678 // the VPIRInstruction modeling the phi.
8679 for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
8680 if (ExitIRI->getOperand(N: 0) != FOR)
8681 continue;
8682 // For VF vscale x 1, if vscale = 1, we are unable to extract the
8683 // penultimate value of the recurrence. Instead, we rely on function
8684 // addUsersInExitBlocks to extract the last element from the result of
8685 // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the
8686 // recurrence phi in ExitUsersToFix.
8687 // TODO: Consider vscale_range info and UF.
8688 if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
8689 Range))
8690 return;
8691 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
8692 Opcode: VPInstruction::ExtractPenultimateElement, Operands: {FOR->getBackedgeValue()},
8693 Inst: {}, Name: "vector.recur.extract.for.phi");
8694 ExitIRI->setOperand(I: 0, New: PenultimateElement);
8695 ExitUsersToFix.remove(X: ExitIRI);
8696 }
8697 }
8698}
8699
8700VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8701 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8702
8703 using namespace llvm::VPlanPatternMatch;
8704 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8705
8706 // ---------------------------------------------------------------------------
8707 // Build initial VPlan: Scan the body of the loop in a topological order to
8708 // visit each basic block after having visited its predecessor basic blocks.
8709 // ---------------------------------------------------------------------------
8710
8711 // Create initial VPlan skeleton, having a basic block for the pre-header
8712 // which contains SCEV expansions that need to happen before the CFG is
8713 // modified; a basic block for the vector pre-header, followed by a region for
8714 // the vector loop, followed by the middle basic block. The skeleton vector
8715 // loop region contains a header and latch basic blocks.
8716
8717 bool RequiresScalarEpilogueCheck =
8718 LoopVectorizationPlanner::getDecisionAndClampRange(
8719 Predicate: [this](ElementCount VF) {
8720 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
8721 },
8722 Range);
8723 VPlanTransforms::prepareForVectorization(
8724 Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
8725 TailFolded: CM.foldTailByMasking(), TheLoop: OrigLoop,
8726 IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()),
8727 HasUncountableExit: Legal->hasUncountableEarlyExit(), Range);
8728 VPlanTransforms::createLoopRegions(Plan&: *Plan);
8729
8730 // Don't use getDecisionAndClampRange here, because we don't know the UF,
8731 // so it is better to be conservative here rather than to split this up
8732 // into different VPlans.
8733 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8734 bool IVUpdateMayOverflow = false;
8735 for (ElementCount VF : Range)
8736 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8737
8738 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8739 // Use NUW for the induction increment if we proved that it won't overflow in
8740 // the vector loop or when not folding the tail. In the latter case, we know
8741 // that the canonical induction increment will not overflow as the vector trip
8742 // count is >= increment and a multiple of the increment.
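// Illustrative numbers: with a vector trip count of 12 and an increment of
// VF * UF = 4, the canonical IV takes the values 0, 4, 8 and then exactly 12,
// so the increment cannot wrap.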
8743 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8744 if (!HasNUW) {
8745 auto *IVInc = Plan->getVectorLoopRegion()
8746 ->getExitingBasicBlock()
8747 ->getTerminator()
8748 ->getOperand(N: 0);
8749 assert(match(IVInc, m_VPInstruction<Instruction::Add>(
8750 m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
8751 "Did not find the canonical IV increment");
8752 cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
8753 }
8754
8755 // ---------------------------------------------------------------------------
8756 // Pre-construction: record ingredients whose recipes we'll need to further
8757 // process after constructing the initial VPlan.
8758 // ---------------------------------------------------------------------------
8759
8760 // For each interleave group which is relevant for this (possibly trimmed)
8761 // Range, add it to the set of groups to be later applied to the VPlan and add
8762 // placeholders for its members' Recipes which we'll be replacing with a
8763 // single VPInterleaveRecipe.
8764 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8765 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8766 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8767 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8768 LoopVectorizationCostModel::CM_Interleave);
8769 // For scalable vectors, the interleave factors must be <= 8 since we
8770 // require the (de)interleaveN intrinsics instead of shufflevectors.
8771 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8772 "Unsupported interleave factor for scalable vectors");
8773 return Result;
8774 };
8775 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
8776 continue;
8777 InterleaveGroups.insert(Ptr: IG);
8778 }
8779
8780 // ---------------------------------------------------------------------------
8781 // Predicate and linearize the top-level loop region.
8782 // ---------------------------------------------------------------------------
8783 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8784 Plan&: *Plan, FoldTail: CM.foldTailByMasking());
8785
8786 // ---------------------------------------------------------------------------
8787 // Construct wide recipes and apply predication for original scalar
8788 // VPInstructions in the loop.
8789 // ---------------------------------------------------------------------------
8790 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8791 Builder, BlockMaskCache, LVer);
8792 RecipeBuilder.collectScaledReductions(Range);
8793
8794 // Scan the body of the loop in a topological order to visit each basic block
8795 // after having visited its predecessor basic blocks.
8796 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8797 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8798 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8799 HeaderVPBB);
8800
8801 auto *MiddleVPBB = Plan->getMiddleBlock();
8802 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8803 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8804 // temporarily to update created block masks.
8805 DenseMap<VPValue *, VPValue *> Old2New;
8806 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
8807 // Convert input VPInstructions to widened recipes.
8808 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
8809 auto *SingleDef = cast<VPSingleDefRecipe>(Val: &R);
8810 auto *UnderlyingValue = SingleDef->getUnderlyingValue();
8811 // Skip recipes that do not need transforming, including canonical IV,
8812 // wide canonical IV and VPInstructions without underlying values. The
8813 // latter are added above for masking.
8814 // FIXME: Migrate code relying on the underlying instruction from VPlan0
8815 // to construct recipes below to not use the underlying instruction.
8816 if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
8817 Val: &R) ||
8818 (isa<VPInstruction>(Val: &R) && !UnderlyingValue))
8819 continue;
8820
8821 // FIXME: VPlan0, which models a copy of the original scalar loop, should
8822 // not use VPWidenPHIRecipe to model the phis.
8823 assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
8824 UnderlyingValue && "unsupported recipe");
8825
8826 // TODO: Gradually replace uses of underlying instruction by analyses on
8827 // VPlan.
8828 Instruction *Instr = cast<Instruction>(Val: UnderlyingValue);
8829 Builder.setInsertPoint(SingleDef);
8830
8831 // Stores with an invariant address inside the loop will be deleted, and a
8832 // uniform store recipe will be created in the middle block for the final
8833 // invariant store of the reduction.
8834 StoreInst *SI;
8835 if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
8836 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
8837 // Only create recipe for the final invariant store of the reduction.
8838 if (Legal->isInvariantStoreOfReduction(SI)) {
8839 auto *Recipe =
8840 new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
8841 nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
8842 Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
8843 }
8844 R.eraseFromParent();
8845 continue;
8846 }
8847
8848 VPRecipeBase *Recipe =
8849 RecipeBuilder.tryToCreateWidenRecipe(R: SingleDef, Range);
8850 if (!Recipe) {
8851 SmallVector<VPValue *, 4> Operands(R.operands());
8852 Recipe = RecipeBuilder.handleReplication(I: Instr, Operands, Range);
8853 }
8854
8855 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8856 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
8857 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8858 // moved to the phi section in the header.
8859 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8860 } else {
8861 Builder.insert(R: Recipe);
8862 }
8863 if (Recipe->getNumDefinedValues() == 1) {
8864 SingleDef->replaceAllUsesWith(New: Recipe->getVPSingleValue());
8865 Old2New[SingleDef] = Recipe->getVPSingleValue();
8866 } else {
8867 assert(Recipe->getNumDefinedValues() == 0 &&
8868 "Unexpected multidef recipe");
8869 R.eraseFromParent();
8870 }
8871 }
8872 }
8873
8874 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8875 // TODO: Include the masks as operands in the predicated VPlan directly
8876 // to remove the need to keep a map of masks beyond the predication
8877 // transform.
8878 RecipeBuilder.updateBlockMaskCache(Old2New);
8879 for (const auto &[Old, _] : Old2New)
8880 Old->getDefiningRecipe()->eraseFromParent();
8881
8882 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8883 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8884 "entry block must be set to a VPRegionBlock having a non-empty entry "
8885 "VPBasicBlock");
8886
8887 // Update wide induction increments to use the same step as the corresponding
8888 // wide induction. This enables detecting induction increments directly in
8889 // VPlan and removes redundant splats.
8890 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
8891 auto *IVInc = cast<Instruction>(
8892 Val: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
8893 if (IVInc->getOperand(i: 0) != Phi || IVInc->getOpcode() != Instruction::Add)
8894 continue;
8895 VPWidenInductionRecipe *WideIV =
8896 cast<VPWidenInductionRecipe>(Val: RecipeBuilder.getRecipe(I: Phi));
8897 VPRecipeBase *R = RecipeBuilder.getRecipe(I: IVInc);
8898 R->setOperand(I: 1, New: WideIV->getStepValue());
8899 }
8900
8901 DenseMap<VPValue *, VPValue *> IVEndValues;
8902 addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues);
8903 SetVector<VPIRInstruction *> ExitUsersToFix =
8904 collectUsersInLatchExitBlock(Plan&: *Plan);
8905 addExitUsersForFirstOrderRecurrences(Plan&: *Plan, ExitUsersToFix, Range);
8906 addUsersInExitBlocks(Plan&: *Plan, ExitUsersToFix);
8907
8908 // ---------------------------------------------------------------------------
8909 // Transform initial VPlan: Apply previously taken decisions, in order, to
8910 // bring the VPlan to its final state.
8911 // ---------------------------------------------------------------------------
8912
8913 // Adjust the recipes for any inloop reductions.
8914 adjustRecipesForReductions(Plan, RecipeBuilder, MinVF: Range.Start);
8915
8916 // Transform recipes to abstract recipes if it is legal and beneficial and
8917 // clamp the range for better cost estimation.
8918 // TODO: Enable the following transform when the EVL versions of
8919 // extended-reduction and mulacc-reduction are implemented.
8920 if (!CM.foldTailWithEVL()) {
8921 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
8922 CM.CostKind);
8923 VPlanTransforms::runPass(Fn: VPlanTransforms::convertToAbstractRecipes, Plan&: *Plan,
8924 Args&: CostCtx, Args&: Range);
8925 }
8926
8927 for (ElementCount VF : Range)
8928 Plan->addVF(VF);
8929 Plan->setName("Initial VPlan");
8930
8931 // Interleave memory: for each Interleave Group we marked earlier as relevant
8932 // for this VPlan, replace the Recipes widening its memory instructions with a
8933 // single VPInterleaveRecipe at its insertion point.
8934 VPlanTransforms::runPass(Fn: VPlanTransforms::createInterleaveGroups, Plan&: *Plan,
8935 Args: InterleaveGroups, Args&: RecipeBuilder,
8936 Args: CM.isScalarEpilogueAllowed());
8937
8938 // Replace VPValues for known constant strides guaranteed by predicate scalar
8939 // evolution.
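// Note: only uses inside the vector loop region (or in its preheader) are
// rewritten below, since the constant-stride assumption only holds once the
// runtime checks generated for the SCEV predicates have passed.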
8940 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
8941 auto *R = cast<VPRecipeBase>(Val: &U);
8942 return R->getParent()->getParent() ||
8943 R->getParent() ==
8944 Plan->getVectorLoopRegion()->getSinglePredecessor();
8945 };
8946 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8947 auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
8948 auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV));
8949 // Only handle constant strides for now.
8950 if (!ScevStride)
8951 continue;
8952
8953 auto *CI = Plan->getOrAddLiveIn(
8954 V: ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt()));
8955 if (VPValue *StrideVPV = Plan->getLiveIn(V: StrideV))
8956 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
8957
8958 // The versioned value may not be used in the loop directly but through a
8959 // sext/zext. Add new live-ins in those cases.
8960 for (Value *U : StrideV->users()) {
8961 if (!isa<SExtInst, ZExtInst>(Val: U))
8962 continue;
8963 VPValue *StrideVPV = Plan->getLiveIn(V: U);
8964 if (!StrideVPV)
8965 continue;
8966 unsigned BW = U->getType()->getScalarSizeInBits();
8967 APInt C = isa<SExtInst>(Val: U) ? ScevStride->getAPInt().sext(width: BW)
8968 : ScevStride->getAPInt().zext(width: BW);
8969 VPValue *CI = Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: U->getType(), V: C));
8970 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
8971 }
8972 }
8973
8974 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8975 return Legal->blockNeedsPredication(BB);
8976 };
8977 VPlanTransforms::runPass(Fn: VPlanTransforms::dropPoisonGeneratingRecipes, Plan&: *Plan,
8978 Args: BlockNeedsPredication);
8979
8980 // Sink users of fixed-order recurrence past the recipe defining the previous
8981 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8982 if (!VPlanTransforms::runPass(Transform: VPlanTransforms::adjustFixedOrderRecurrences,
8983 Plan&: *Plan, Args&: Builder))
8984 return nullptr;
8985
8986 if (useActiveLaneMask(Style)) {
8987 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8988 // TailFoldingStyle is visible there.
8989 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8990 bool WithoutRuntimeCheck =
8991 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8992 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow,
8993 DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck);
8994 }
8995 VPlanTransforms::optimizeInductionExitUsers(Plan&: *Plan, EndValues&: IVEndValues);
8996
8997 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8998 return Plan;
8999}
9000
9001VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
9002 // Outer loop handling: outer loops may require CFG and instruction-level
9003 // transformations before even evaluating whether vectorization is profitable.
9004 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9005 // the vectorization pipeline.
9006 assert(!OrigLoop->isInnermost());
9007 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9008
9009 auto Plan = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI);
9010 VPlanTransforms::prepareForVectorization(
9011 Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck: true, TailFolded: false, TheLoop: OrigLoop,
9012 IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), HasUncountableExit: false,
9013 Range);
9014 VPlanTransforms::createLoopRegions(Plan&: *Plan);
9015
9016 for (ElementCount VF : Range)
9017 Plan->addVF(VF);
9018
9019 if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
9020 Plan,
9021 GetIntOrFpInductionDescriptor: [this](PHINode *P) {
9022 return Legal->getIntOrFpInductionDescriptor(Phi: P);
9023 },
9024 SE&: *PSE.getSE(), TLI: *TLI))
9025 return nullptr;
9026
9027 // Collect mapping of IR header phis to header phi recipes, to be used in
9028 // addScalarResumePhis.
9029 DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
9030 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
9031 Builder, BlockMaskCache, nullptr /*LVer*/);
9032 for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9033 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
9034 continue;
9035 auto *HeaderR = cast<VPHeaderPHIRecipe>(Val: &R);
9036 RecipeBuilder.setRecipe(I: HeaderR->getUnderlyingInstr(), R: HeaderR);
9037 }
9038 DenseMap<VPValue *, VPValue *> IVEndValues;
9039 // TODO: IVEndValues are not used yet in the native path, to optimize exit
9040 // values.
9041 addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues);
9042
9043 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
9044 return Plan;
9045}
9046
9047// Adjust the recipes for reductions. For in-loop reductions, the chain of
9048// instructions leading from the loop exit instr to the phi needs to be converted
9049// to reductions, with one operand being vector and the other being the scalar
9050// reduction chain. For other reductions, a select is introduced between the phi
9051// and users outside the vector region when folding the tail.
9052//
9053// A ComputeReductionResult recipe is added to the middle block, also for
9054// in-loop reductions which compute their result in-loop, because generating
9055// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
9056//
9057// Adjust AnyOf reductions; replace the reduction phi for the selected value
9058// with a boolean reduction phi node to check if the condition is true in any
9059// iteration. The final value is selected by the final ComputeReductionResult.
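//
// Conceptual sketch for an in-loop add reduction 'red += a[i]': the reduction
// value stays scalar inside the vector loop; on each vector iteration the
// widened 'a' values are reduced and added to the scalar chain, and the final
// result is produced by ComputeReductionResult in the middle block.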
9060void LoopVectorizationPlanner::adjustRecipesForReductions(
9061 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
9062 using namespace VPlanPatternMatch;
9063 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9064 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
9065 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
9066 SmallVector<VPRecipeBase *> ToDelete;
9067
9068 for (VPRecipeBase &R : Header->phis()) {
9069 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
9070 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
9071 continue;
9072
9073 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9074 RecurKind Kind = RdxDesc.getRecurrenceKind();
9075 assert(
9076 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
9077 !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
9078 "AnyOf and FindIV reductions are not allowed for in-loop reductions");
9079
9080 // Collect the chain of "link" recipes for the reduction starting at PhiR.
9081 SetVector<VPSingleDefRecipe *> Worklist;
9082 Worklist.insert(X: PhiR);
9083 for (unsigned I = 0; I != Worklist.size(); ++I) {
9084 VPSingleDefRecipe *Cur = Worklist[I];
9085 for (VPUser *U : Cur->users()) {
9086 auto *UserRecipe = cast<VPSingleDefRecipe>(Val: U);
9087 if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9088 assert((UserRecipe->getParent() == MiddleVPBB ||
9089 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9090 "U must be either in the loop region, the middle block or the "
9091 "scalar preheader.");
9092 continue;
9093 }
9094 Worklist.insert(X: UserRecipe);
9095 }
9096 }
9097
9098 // Visit operation "Links" along the reduction chain top-down starting from
9099 // the phi until LoopExitValue. We keep track of the previous item
9100 // (PreviousLink) to tell which of the two operands of a Link will remain
9101 // scalar and which will be reduced. For minmax by select(cmp), the Link will
9102 // be the select instruction. Blend recipes of in-loop reduction phis will
9103 // get folded to their non-phi operand, as the reduction recipe handles the
9104 // condition directly.
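// Illustrative sketch (assumed source): for a conditional sum such as
//   if (c[i]) s += a[i];
// the chain is phi -> add -> blend of the add result and the phi under the
// block mask. The blend is folded to its non-phi operand (the add), and the
// block mask instead becomes the CondOp of the VPReductionRecipe created for
// the add below.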
9105 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
9106 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
9107 if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink)) {
9108 assert(Blend->getNumIncomingValues() == 2 &&
9109 "Blend must have 2 incoming values");
9110 if (Blend->getIncomingValue(Idx: 0) == PhiR) {
9111 Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1));
9112 } else {
9113 assert(Blend->getIncomingValue(1) == PhiR &&
9114 "PhiR must be an operand of the blend");
9115 Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0));
9116 }
9117 continue;
9118 }
9119
9120 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9121
9122 // Index of the first operand which holds a non-mask vector operand.
9123 unsigned IndexOfFirstOperand;
9124 // Recognize a call to the llvm.fmuladd intrinsic.
9125 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9126 VPValue *VecOp;
9127 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
9128 if (IsFMulAdd) {
9129 assert(
9130 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
9131 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9132 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9133 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9134 CurrentLink->getOperand(2) == PreviousLink &&
9135 "expected a call where the previous link is the added operand");
9136
9137 // If the instruction is a call to the llvm.fmuladd intrinsic then we
9138 // need to create an fmul recipe (multiplying the first two operands of
9139 // the fmuladd together) to use as the vector operand for the fadd
9140 // reduction.
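// Illustrative sketch (assumed input): %x = call float @llvm.fmuladd(%a, %b,
// %acc), with %acc being the previous link, becomes %mul = fmul %a, %b
// followed by a reduction that adds %mul into the chain value %acc.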
9141 VPInstruction *FMulRecipe = new VPInstruction(
9142 Instruction::FMul,
9143 {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)},
9144 CurrentLinkI->getFastMathFlags());
9145 LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator());
9146 VecOp = FMulRecipe;
9147 } else {
9148 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9149 if (isa<VPWidenRecipe>(Val: CurrentLink)) {
9150 assert(isa<CmpInst>(CurrentLinkI) &&
9151 "need to have the compare of the select");
9152 continue;
9153 }
9154 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9155 "must be a select recipe");
9156 IndexOfFirstOperand = 1;
9157 } else {
9158 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
9159 "Expected to replace a VPWidenSC");
9160 IndexOfFirstOperand = 0;
9161 }
9162 // Note that for non-commutable operands (cmp-selects), the semantics of
9163 // the cmp-select are captured in the recurrence kind.
9164 unsigned VecOpId =
9165 CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink
9166 ? IndexOfFirstOperand + 1
9167 : IndexOfFirstOperand;
9168 VecOp = CurrentLink->getOperand(N: VecOpId);
9169 assert(VecOp != PreviousLink &&
9170 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9171 (VecOpId - IndexOfFirstOperand)) ==
9172 PreviousLink &&
9173 "PreviousLink must be the operand other than VecOp");
9174 }
9175
9176 VPValue *CondOp = nullptr;
9177 if (CM.blockNeedsPredicationForAnyReason(BB: CurrentLinkI->getParent()))
9178 CondOp = RecipeBuilder.getBlockInMask(VPBB: CurrentLink->getParent());
9179
9180 // Non-FP RdxDescs will have all fast math flags set, so clear them.
9181 FastMathFlags FMFs = isa<FPMathOperator>(Val: CurrentLinkI)
9182 ? RdxDesc.getFastMathFlags()
9183 : FastMathFlags();
9184 auto *RedRecipe = new VPReductionRecipe(
9185 Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
9186 CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
9187 // Append the recipe to the end of the VPBasicBlock because we need to
9188 // ensure that it comes after all of its inputs, including CondOp.
9189 // Delete CurrentLink as it will be invalid if its operand is replaced
9190 // with a reduction defined at the bottom of the block in the next link.
9191 if (LinkVPBB->getNumSuccessors() == 0)
9192 RedRecipe->insertBefore(InsertPos: &*std::prev(x: std::prev(x: LinkVPBB->end())));
9193 else
9194 LinkVPBB->appendRecipe(Recipe: RedRecipe);
9195
9196 CurrentLink->replaceAllUsesWith(New: RedRecipe);
9197 ToDelete.push_back(Elt: CurrentLink);
9198 PreviousLink = RedRecipe;
9199 }
9200 }
9201 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
9202 Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
9203 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
9204 for (VPRecipeBase &R :
9205 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9206 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
9207 if (!PhiR)
9208 continue;
9209
9210 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
9211 Type *PhiTy = PhiR->getUnderlyingValue()->getType();
9212 // If the tail is folded by masking, introduce selects between the phi
9213 // and the users outside the vector region of each reduction, at the
9214 // beginning of the dedicated latch block.
9215 auto *OrigExitingVPV = PhiR->getBackedgeValue();
9216 auto *NewExitingVPV = PhiR->getBackedgeValue();
9217 // Don't output selects for partial reductions because they have an output
9218 // with fewer lanes than the VF. So the operands of the select would have
9219 // different numbers of lanes. Partial reductions mask the input instead.
9220 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
9221 !isa<VPPartialReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe())) {
9222 VPValue *Cond = RecipeBuilder.getBlockInMask(VPBB: PhiR->getParent());
9223 std::optional<FastMathFlags> FMFs =
9224 PhiTy->isFloatingPointTy()
9225 ? std::make_optional(t: RdxDesc.getFastMathFlags())
9226 : std::nullopt;
9227 NewExitingVPV =
9228 Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", FMFs);
9229 OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
9230 return isa<VPInstruction>(Val: &U) &&
9231 (cast<VPInstruction>(Val: &U)->getOpcode() ==
9232 VPInstruction::ComputeAnyOfResult ||
9233 cast<VPInstruction>(Val: &U)->getOpcode() ==
9234 VPInstruction::ComputeReductionResult ||
9235 cast<VPInstruction>(Val: &U)->getOpcode() ==
9236 VPInstruction::ComputeFindIVResult);
9237 });
9238 if (CM.usePredicatedReductionSelect())
9239 PhiR->setOperand(I: 1, New: NewExitingVPV);
9240 }
9241
9242 // We want code in the middle block to appear to execute at the location of
9243 // the scalar loop's latch terminator because: (a) it is all compiler
9244 // generated, (b) these instructions are always executed after evaluating
9245 // the latch conditional branch, and (c) other passes may add new
9246 // predecessors which terminate on this line. This is the easiest way to
9247 // ensure we don't accidentally cause an extra step back into the loop while
9248 // debugging.
9249 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9250
9251 // TODO: At the moment ComputeReductionResult also drives creation of the
9252 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9253 // even for in-loop reductions, until the reduction resume value handling is
9254 // also modeled in VPlan.
9255 VPInstruction *FinalReductionResult;
9256 VPBuilder::InsertPointGuard Guard(Builder);
9257 Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
9258 if (RecurrenceDescriptor::isFindIVRecurrenceKind(
9259 Kind: RdxDesc.getRecurrenceKind())) {
9260 VPValue *Start = PhiR->getStartValue();
9261 VPValue *Sentinel = Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue());
9262 FinalReductionResult =
9263 Builder.createNaryOp(Opcode: VPInstruction::ComputeFindIVResult,
9264 Operands: {PhiR, Start, Sentinel, NewExitingVPV}, DL: ExitDL);
9265 } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9266 Kind: RdxDesc.getRecurrenceKind())) {
9267 VPValue *Start = PhiR->getStartValue();
9268 FinalReductionResult =
9269 Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
9270 Operands: {PhiR, Start, NewExitingVPV}, DL: ExitDL);
9271 } else {
9272 VPIRFlags Flags = RecurrenceDescriptor::isFloatingPointRecurrenceKind(
9273 Kind: RdxDesc.getRecurrenceKind())
9274 ? VPIRFlags(RdxDesc.getFastMathFlags())
9275 : VPIRFlags();
9276 FinalReductionResult =
9277 Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
9278 Operands: {PhiR, NewExitingVPV}, Flags, DL: ExitDL);
9279 }
9280 // If the vector reduction can be performed in a smaller type, we truncate
9281 // then extend the loop exit value to enable InstCombine to evaluate the
9282 // entire expression in the smaller type.
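// For example (sketch): an add reduction performed on i32 phis whose
// recurrence type is i8 gets its exiting value truncated to i8 and then
// sign- or zero-extended back to i32, depending on RdxDesc.isSigned().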
9283 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9284 !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9285 Kind: RdxDesc.getRecurrenceKind())) {
9286 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9287 assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(
9288 RdxDesc.getRecurrenceKind()) &&
9289 "Unexpected truncated min-max recurrence!");
9290 Type *RdxTy = RdxDesc.getRecurrenceType();
9291 auto *Trunc =
9292 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9293 Instruction::CastOps ExtendOpc =
9294 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
9295 auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
9296 Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe());
9297 Extnd->insertAfter(InsertPos: Trunc);
9298 if (PhiR->getOperand(N: 1) == NewExitingVPV)
9299 PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());
9300
9301 // Update ComputeReductionResult with the truncated exiting value and
9302 // extend its result.
9303 FinalReductionResult->setOperand(I: 1, New: Trunc);
9304 FinalReductionResult =
9305 Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
9306 }
9307
9308 // Update all users outside the vector region. Also replace redundant
9309 // ExtractLastElement.
9310 for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
9311 auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
9312 if (FinalReductionResult == U || Parent->getParent())
9313 continue;
9314 U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);
9315 if (match(U, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
9316 Op0: m_VPValue())))
9317 cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
9318 }
9319
9320 // Adjust AnyOf reductions; replace the reduction phi for the selected value
9321 // with a boolean reduction phi node to check if the condition is true in
9322 // any iteration. The final value is selected by the final
9323 // ComputeReductionResult.
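// Illustrative sketch (assumed source): for
//   r = (a[i] > 3) ? 42 : r;
// the select feeding the reduction phi is replaced by an 'or' of a boolean
// phi (started at false) with the compare (negated if the phi was the true
// operand), and the final ComputeAnyOfResult in the middle block derives the
// scalar result from the reduced boolean and the start value.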
9324 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
9325 Kind: RdxDesc.getRecurrenceKind())) {
9326 auto *Select = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
9327 return isa<VPWidenSelectRecipe>(Val: U) ||
9328 (isa<VPReplicateRecipe>(Val: U) &&
9329 cast<VPReplicateRecipe>(Val: U)->getUnderlyingInstr()->getOpcode() ==
9330 Instruction::Select);
9331 }));
9332 VPValue *Cmp = Select->getOperand(N: 0);
9333 // If the compare is checking the reduction PHI node, adjust it to check
9334 // the start value.
9335 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
9336 CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
9337 Builder.setInsertPoint(Select);
9338
9339 // If the true value of the select is the reduction phi, the new value is
9340 // selected if the negated condition is true in any iteration.
9341 if (Select->getOperand(N: 1) == PhiR)
9342 Cmp = Builder.createNot(Operand: Cmp);
9343 VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
9344 Select->getVPSingleValue()->replaceAllUsesWith(New: Or);
9345 // Delete Select now that it has invalid types.
9346 ToDelete.push_back(Elt: Select);
9347
9348 // Convert the reduction phi to operate on bools.
9349 PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: ConstantInt::getFalse(
9350 Context&: OrigLoop->getHeader()->getContext())));
9351 continue;
9352 }
9353
9354 if (RecurrenceDescriptor::isFindIVRecurrenceKind(
9355 Kind: RdxDesc.getRecurrenceKind())) {
9356 // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
9357 // the sentinel value after generating the ResumePhi recipe, which uses
9358 // the original start value.
9359 PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue()));
9360 }
9361 RecurKind RK = RdxDesc.getRecurrenceKind();
9362 if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
9363 !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
9364 !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))) {
9365 VPBuilder PHBuilder(Plan->getVectorPreheader());
9366 VPValue *Iden = Plan->getOrAddLiveIn(
9367 V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: RdxDesc.getFastMathFlags()));
9368 // If the PHI is used by a partial reduction, set the scale factor.
9369 unsigned ScaleFactor =
9370 RecipeBuilder.getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr())
9371 .value_or(u: 1);
9372 Type *I32Ty = IntegerType::getInt32Ty(C&: PhiTy->getContext());
9373 auto *ScaleFactorVPV =
9374 Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: ScaleFactor));
9375 VPValue *StartV = PHBuilder.createNaryOp(
9376 Opcode: VPInstruction::ReductionStartVector,
9377 Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV},
9378 Flags: PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
9379 : FastMathFlags());
9380 PhiR->setOperand(I: 0, New: StartV);
9381 }
9382 }
9383 for (VPRecipeBase *R : ToDelete)
9384 R->eraseFromParent();
9385
9386 VPlanTransforms::runPass(Fn: VPlanTransforms::clearReductionWrapFlags, Plan&: *Plan);
9387}
9388
9389void VPDerivedIVRecipe::execute(VPTransformState &State) {
9390 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9391
9392 // Fast-math-flags propagate from the original induction instruction.
9393 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9394 if (FPBinOp)
9395 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9396
9397 Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
9398 Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
9399 Value *DerivedIV = emitTransformedIndex(
9400 B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
9401 InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
9402 DerivedIV->setName(Name);
9403 // If index is the vector trip count, the concrete value will only be set in
9404 // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9405 // TODO: Remove the special case for the vector trip count once it is computed
9406 // in VPlan and can be used during VPlan simplification.
9407 assert((DerivedIV != Index ||
9408 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9409 "IV didn't need transforming?");
9410 State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
9411}
9412
9413// Determine how to lower the scalar epilogue, which depends on 1) optimising
9414// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9415// predication, and 4) a TTI hook that analyses whether the loop is suitable
9416// for predication.
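//
// For example (sketch of the precedence below): if the loop is not being
// optimized for size and PreferPredicateOverEpilogue is explicitly set to
// PreferPredicateTy::PredicateOrDontVectorize, step 2 returns
// CM_ScalarEpilogueNotAllowedUsePredicate without consulting the loop hints
// or the TTI hook.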
9417static ScalarEpilogueLowering getScalarEpilogueLowering(
9418 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9419 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9420 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9421 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9422 // don't look at hints or options, and don't request a scalar epilogue.
9423 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9424 // LoopAccessInfo (due to code dependency and not being able to reliably get
9425 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9426 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9427 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9428 // back to the old way and vectorize with versioning when forced. See D81345.)
9429 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
9430 QueryType: PGSOQueryType::IRPass) &&
9431 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9432 return CM_ScalarEpilogueNotAllowedOptSize;
9433
9434 // 2) If set, obey the directives
9435 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9436 switch (PreferPredicateOverEpilogue) {
9437 case PreferPredicateTy::ScalarEpilogue:
9438 return CM_ScalarEpilogueAllowed;
9439 case PreferPredicateTy::PredicateElseScalarEpilogue:
9440 return CM_ScalarEpilogueNotNeededUsePredicate;
9441 case PreferPredicateTy::PredicateOrDontVectorize:
9442 return CM_ScalarEpilogueNotAllowedUsePredicate;
9443 };
9444 }
9445
9446 // 3) If set, obey the hints
9447 switch (Hints.getPredicate()) {
9448 case LoopVectorizeHints::FK_Enabled:
9449 return CM_ScalarEpilogueNotNeededUsePredicate;
9450 case LoopVectorizeHints::FK_Disabled:
9451 return CM_ScalarEpilogueAllowed;
9452 };
9453
9454 // 4) if the TTI hook indicates this is profitable, request predication.
9455 TailFoldingInfo TFI(TLI, &LVL, IAI);
9456 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
9457 return CM_ScalarEpilogueNotNeededUsePredicate;
9458
9459 return CM_ScalarEpilogueAllowed;
9460}
9461
9462// Process the loop in the VPlan-native vectorization path. This path builds
9463 // VPlan upfront in the vectorization pipeline, which allows applying
9464// VPlan-to-VPlan transformations from the very beginning without modifying the
9465// input LLVM IR.
9466static bool processLoopInVPlanNativePath(
9467 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9468 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9469 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9470 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9471 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9472 LoopVectorizationRequirements &Requirements) {
9473
9474 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
9475 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9476 return false;
9477 }
9478 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9479 Function *F = L->getHeader()->getParent();
9480 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9481
9482 ScalarEpilogueLowering SEL =
9483 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI);
9484
9485 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9486 &Hints, IAI, PSI, BFI);
9487 // Use the planner for outer loop vectorization.
9488 // TODO: CM is not used at this point inside the planner. Turn CM into an
9489 // optional argument if we don't need it in the future.
9490 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9491 ORE);
9492
9493 // Get user vectorization factor.
9494 ElementCount UserVF = Hints.getWidth();
9495
9496 CM.collectElementTypesForWidening();
9497
9498 // Plan how to best vectorize, return the best VF and its cost.
9499 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9500
9501 // If we are stress testing VPlan builds, do not attempt to generate vector
9502 // code. Masked vector code generation support will follow soon.
9503 // Also, do not attempt to vectorize if no vector code will be produced.
9504 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9505 return false;
9506
9507 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9508
9509 {
9510 bool AddBranchWeights =
9511 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
9512 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
9513 AddBranchWeights, CM.CostKind);
9514 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9515 VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
9516 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9517 << L->getHeader()->getParent()->getName() << "\"\n");
9518 LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
9519 }
9520
9521 reportVectorization(ORE, TheLoop: L, VF, IC: 1);
9522
9523 // Mark the loop as already vectorized to avoid vectorizing again.
9524 Hints.setAlreadyVectorized();
9525 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9526 return true;
9527}
9528
9529// Emit a remark if there are stores to floats that required a floating point
9530 // extension. If the vectorized loop was generated with double precision there
9531// will be a performance penalty from the conversion overhead and the change in
9532// the vector width.
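//
// Illustrative sketch (assumed IR): a float store whose value is computed via
// a double round-trip, e.g.
//   %e = fpext float %a to double
//   %d = fmul double %e, %b
//   %t = fptrunc double %d to float
//   store float %t, ptr %p
// triggers the remark, because the upward walk from the store reaches the
// fpext.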
9533static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9534 SmallVector<Instruction *, 4> Worklist;
9535 for (BasicBlock *BB : L->getBlocks()) {
9536 for (Instruction &Inst : *BB) {
9537 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
9538 if (S->getValueOperand()->getType()->isFloatTy())
9539 Worklist.push_back(Elt: S);
9540 }
9541 }
9542 }
9543
9544 // Traverse the floating point stores upwards, searching for floating point
9545 // conversions.
9546 SmallPtrSet<const Instruction *, 4> Visited;
9547 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9548 while (!Worklist.empty()) {
9549 auto *I = Worklist.pop_back_val();
9550 if (!L->contains(Inst: I))
9551 continue;
9552 if (!Visited.insert(Ptr: I).second)
9553 continue;
9554
9555 // Emit a remark if the floating point store required a floating
9556 // point conversion.
9557 // TODO: More work could be done to identify the root cause such as a
9558 // constant or a function return type and point the user to it.
9559 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
9560 ORE->emit(RemarkBuilder: [&]() {
9561 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9562 I->getDebugLoc(), L->getHeader())
9563 << "floating point conversion changes vector width. "
9564 << "Mixed floating point precision requires an up/down "
9565 << "cast that will negatively impact performance.";
9566 });
9567
9568 for (Use &Op : I->operands())
9569 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
9570 Worklist.push_back(Elt: OpI);
9571 }
9572}
9573
9574/// For loops with uncountable early exits, find the cost of doing work when
9575/// exiting the loop early, such as calculating the final exit values of
9576/// variables used outside the loop.
9577/// TODO: This is currently overly pessimistic because the loop may not take
9578/// the early exit, but better to keep this conservative for now. In future,
9579/// it might be possible to relax this by using branch probabilities.
9580static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
9581 VPlan &Plan, ElementCount VF) {
9582 InstructionCost Cost = 0;
9583 for (auto *ExitVPBB : Plan.getExitBlocks()) {
9584 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9585 // If the predecessor is not the middle.block, then it must be the
9586 // vector.early.exit block, which may contain work to calculate the exit
9587 // values of variables used outside the loop.
9588 if (PredVPBB != Plan.getMiddleBlock()) {
9589 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9590 << PredVPBB->getName() << ":\n");
9591 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
9592 }
9593 }
9594 }
9595 return Cost;
9596}
9597
9598/// This function determines whether or not it's still profitable to vectorize
9599/// the loop given the extra work we have to do outside of the loop:
9600/// 1. Perform the runtime checks before entering the loop to ensure it's safe
9601/// to vectorize.
9602/// 2. In the case of loops with uncountable early exits, we may have to do
9603/// extra work when exiting the loop early, such as calculating the final
9604/// exit values of variables used outside the loop.
9605static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9606 VectorizationFactor &VF, Loop *L,
9607 PredicatedScalarEvolution &PSE,
9608 VPCostContext &CostCtx, VPlan &Plan,
9609 ScalarEpilogueLowering SEL,
9610 std::optional<unsigned> VScale) {
9611 InstructionCost TotalCost = Checks.getCost();
9612 if (!TotalCost.isValid())
9613 return false;
9614
9615 // Add on the cost of any work required in the vector early exit block, if
9616 // one exists.
9617 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
9618
9619 // When only interleaving, the scalar and vector costs will be equal, which in
9620 // turn would lead to a divide by 0. Fall back to a hard threshold.
9621 if (VF.Width.isScalar()) {
9622 // TODO: Should we rename VectorizeMemoryCheckThreshold?
9623 if (TotalCost > VectorizeMemoryCheckThreshold) {
9624 LLVM_DEBUG(
9625 dbgs()
9626 << "LV: Interleaving only is not profitable due to runtime checks\n");
9627 return false;
9628 }
9629 return true;
9630 }
9631
9632 // The scalar cost should only be 0 when vectorizing with a user specified
9633 // VF/IC. In those cases, runtime checks should always be generated.
9634 uint64_t ScalarC = VF.ScalarCost.getValue();
9635 if (ScalarC == 0)
9636 return true;
9637
9638 // First, compute the minimum iteration count required so that the vector
9639 // loop outperforms the scalar loop.
9640 // The total cost of the scalar loop is
9641 // ScalarC * TC
9642 // where
9643 // * TC is the actual trip count of the loop.
9644 // * ScalarC is the cost of a single scalar iteration.
9645 //
9646 // The total cost of the vector loop is
9647 // RtC + VecC * (TC / VF) + EpiC
9648 // where
9649 // * RtC is the cost of the generated runtime checks plus the cost of
9650 // performing any additional work in the vector.early.exit block for loops
9651 // with uncountable early exits.
9652 // * VecC is the cost of a single vector iteration.
9653 // * TC is the actual trip count of the loop
9654 // * VF is the vectorization factor
9655 // * EpiC is the cost of the generated epilogue, including the cost
9656 // of the remaining scalar operations.
9657 //
9658 // Vectorization is profitable once the total vector cost is less than the
9659 // total scalar cost:
9660 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9661 //
9662 // Now we can compute the minimum required trip count TC as
9663 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9664 //
9665 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9666 // the computations use integer arithmetic with the divisions rounded up,
9667 // hence we get an upper estimate of the TC.
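// Worked example with hypothetical costs: ScalarC = 4, VecC = 10, RtC = 28
// and an estimated runtime VF of 4 give
//   MinTC1 = ceil(28 * 4 / (4 * 4 - 10)) = ceil(112 / 6) = 19.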
9668 unsigned IntVF = getEstimatedRuntimeVF(VF: VF.Width, VScale);
9669 uint64_t RtC = TotalCost.getValue();
9670 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9671 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div);
9672
9673 // Second, compute a minimum iteration count so that the cost of the
9674 // runtime checks is only a fraction of the total scalar loop cost. This
9675 // adds a loop-dependent bound on the overhead incurred if the runtime
9676 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9677 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9678 // cost, compute
9679 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
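// Continuing the hypothetical example above (RtC = 28, ScalarC = 4, X = 10):
//   MinTC2 = ceil(28 * 10 / 4) = 70, which dominates MinTC1 = 19 here.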
9680 uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC);
9681
9682 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9683 // epilogue is allowed, choose the next closest multiple of VF. This should
9684 // partly compensate for ignoring the epilogue cost.
9685 uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
9686 if (SEL == CM_ScalarEpilogueAllowed)
9687 MinTC = alignTo(Value: MinTC, Align: IntVF);
9688 VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);
9689
9690 LLVM_DEBUG(
9691 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9692 << VF.MinProfitableTripCount << "\n");
9693
9694 // Skip vectorization if the expected trip count is less than the minimum
9695 // required trip count.
9696 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9697 if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
9698 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9699 "trip count < minimum profitable VF ("
9700 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9701 << ")\n");
9702
9703 return false;
9704 }
9705 }
9706 return true;
9707}
9708
9709LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9710 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9711 !EnableLoopInterleaving),
9712 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9713 !EnableLoopVectorization) {}
9714
9715/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9716/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9717/// don't have a corresponding wide induction in \p EpiPlan.
9718static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9719 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9720 // will need their resume-values computed in the main vector loop. Others
9721 // can be removed from the main VPlan.
9722 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9723 for (VPRecipeBase &R :
9724 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9725 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
9726 continue;
9727 EpiWidenedPhis.insert(
9728 Ptr: cast<PHINode>(Val: R.getVPSingleValue()->getUnderlyingValue()));
9729 }
9730 for (VPRecipeBase &R :
9731 make_early_inc_range(Range: MainPlan.getScalarHeader()->phis())) {
9732 auto *VPIRInst = cast<VPIRPhi>(Val: &R);
9733 if (EpiWidenedPhis.contains(Ptr: &VPIRInst->getIRPhi()))
9734 continue;
9735 // There is no corresponding wide induction in the epilogue plan that would
9736 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9737 // together with the corresponding ResumePhi. The resume values for the
9738 // scalar loop will be created during execution of EpiPlan.
9739 VPRecipeBase *ResumePhi = VPIRInst->getOperand(N: 0)->getDefiningRecipe();
9740 VPIRInst->eraseFromParent();
9741 ResumePhi->eraseFromParent();
9742 }
9743 VPlanTransforms::runPass(Fn: VPlanTransforms::removeDeadRecipes, Plan&: MainPlan);
9744
9745 using namespace VPlanPatternMatch;
9746 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9747 // introduce multiple uses of undef/poison. If the reduction start value may
9748 // be undef or poison it needs to be frozen and the frozen start has to be
9749 // used when computing the reduction result. We also need to use the frozen
9750 // value in the resume phi generated by the main vector loop, as this is also
9751 // used to compute the reduction result after the epilogue vector loop.
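// Illustrative sketch (assumed names): for a FindLastIV reduction whose start
// value %start may be poison, a '%fr = freeze %start' VPInstruction is
// created in the plan entry block, ComputeFindIVResult is switched to use
// %fr, and in the main plan the resume phis are updated to use %fr as well.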
9752 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9753 bool UpdateResumePhis) {
9754 VPBuilder Builder(Plan.getEntry());
9755 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9756 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9757 if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9758 continue;
9759 VPValue *OrigStart = VPI->getOperand(N: 1);
9760 if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
9761 continue;
9762 VPInstruction *Freeze =
9763 Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, Inst: {}, Name: "fr");
9764 VPI->setOperand(I: 1, New: Freeze);
9765 if (UpdateResumePhis)
9766 OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
9767 return Freeze != &U && isa<VPPhi>(Val: &U);
9768 });
9769 }
9770 };
9771 AddFreezeForFindLastIVReductions(MainPlan, true);
9772 AddFreezeForFindLastIVReductions(EpiPlan, false);
9773
9774 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9775 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9776 // If there is a suitable resume value for the canonical induction in the
9777 // scalar (which will become vector) epilogue loop we are done. Otherwise
9778 // create it below.
9779 if (any_of(Range&: *MainScalarPH, P: [VectorTC](VPRecipeBase &R) {
9780 return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Op0: m_Specific(VPV: VectorTC),
9781 Op1: m_SpecificInt(V: 0)));
9782 }))
9783 return;
9784 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9785 ScalarPHBuilder.createScalarPhi(
9786 IncomingValues: {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, DL: {},
9787 Name: "vec.epilog.resume.val");
9788}
9789
9790/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9791/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
9792static void
9793preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
9794 const SCEV2ValueTy &ExpandedSCEVs,
9795 const EpilogueLoopVectorizationInfo &EPI) {
9796 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9797 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9798 Header->setName("vec.epilog.vector.body");
9799
9800 DenseMap<Value *, Value *> ToFrozen;
9801 // Ensure that the start values for all header phi recipes are updated before
9802 // vectorizing the epilogue loop.
9803 for (VPRecipeBase &R : Header->phis()) {
9804 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(Val: &R)) {
9805 // When vectorizing the epilogue loop, the canonical induction start
9806 // value needs to be changed from zero to the value after the main
9807 // vector loop. Find the resume value created during execution of the main
9808 // VPlan.
9809 // FIXME: Improve modeling for canonical IV start values in the epilogue
9810 // loop.
9811 using namespace llvm::PatternMatch;
9812 Type *IdxTy = IV->getScalarType();
9813 PHINode *EPResumeVal = find_singleton<PHINode>(
9814 Range: L->getLoopPreheader()->phis(),
9815 P: [&EPI, IdxTy](PHINode &P, bool) -> PHINode * {
9816 if (P.getType() == IdxTy &&
9817 match(
9818 V: P.getIncomingValueForBlock(BB: EPI.MainLoopIterationCountCheck),
9819 P: m_SpecificInt(V: 0)) &&
9820 all_of(Range: P.incoming_values(), P: [&EPI](Value *Inc) {
9821 return Inc == EPI.VectorTripCount ||
9822 match(V: Inc, P: m_SpecificInt(V: 0));
9823 }))
9824 return &P;
9825 return nullptr;
9826 });
9827 assert(EPResumeVal && "must have a resume value for the canonical IV");
9828 VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
9829 assert(all_of(IV->users(),
9830 [](const VPUser *U) {
9831 return isa<VPScalarIVStepsRecipe>(U) ||
9832 isa<VPDerivedIVRecipe>(U) ||
9833 cast<VPRecipeBase>(U)->isScalarCast() ||
9834 cast<VPInstruction>(U)->getOpcode() ==
9835 Instruction::Add;
9836 }) &&
9837 "the canonical IV should only be used by its increment or "
9838 "ScalarIVSteps when resetting the start value");
9839 IV->setOperand(I: 0, New: VPV);
9840 continue;
9841 }
9842
9843 Value *ResumeV = nullptr;
9844 // TODO: Move setting of resume values to prepareToExecute.
9845 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
9846 auto *RdxResult =
9847 cast<VPInstruction>(Val: *find_if(Range: ReductionPhi->users(), P: [](VPUser *U) {
9848 auto *VPI = dyn_cast<VPInstruction>(Val: U);
9849 return VPI &&
9850 (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9851 VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9852 VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9853 }));
9854 ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
9855 ->getIncomingValueForBlock(BB: L->getLoopPreheader());
9856 const RecurrenceDescriptor &RdxDesc =
9857 ReductionPhi->getRecurrenceDescriptor();
9858 RecurKind RK = RdxDesc.getRecurrenceKind();
9859 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) {
9860 Value *StartV = RdxResult->getOperand(N: 1)->getLiveInIRValue();
9861 assert(RdxDesc.getRecurrenceStartValue() == StartV &&
9862 "start value from ComputeAnyOfResult must match");
9863
9864 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9865 // start value; compare the final value from the main vector loop
9866 // to the start value.
9867 BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
9868 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9869 ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
9870 } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK)) {
9871 Value *StartV = getStartValueFromReductionResult(RdxResult);
9872 assert(RdxDesc.getRecurrenceStartValue() == StartV &&
9873 "start value from ComputeFinIVResult must match");
9874
9875 ToFrozen[StartV] = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
9876 BB: EPI.MainLoopIterationCountCheck);
9877
9878 // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9879 // an adjustment to the resume value. The resume value is adjusted to
9880 // the sentinel value when the final value from the main vector loop
9881 // equals the start value. This ensures correctness when the start value
9882 // might not be less than the minimum value of a monotonically
9883 // increasing induction variable.
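// Sketch of the adjustment below:
//   resume = (mainLoopFinal == frozenStart) ? sentinel : mainLoopFinal.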
9884 BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
9885 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9886 Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: ToFrozen[StartV]);
9887 Value *Sentinel = RdxResult->getOperand(N: 2)->getLiveInIRValue();
9888 ResumeV = Builder.CreateSelect(C: Cmp, True: Sentinel, False: ResumeV);
9889 } else {
9890 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9891 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
9892 if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
9893 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9894 "unexpected start value");
9895 VPI->setOperand(I: 0, New: StartVal);
9896 continue;
9897 }
9898 }
9899 } else {
9900 // Retrieve the induction resume values for wide inductions from
9901 // their original phi nodes in the scalar loop.
9902 PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
9903 // Hook up to the PHINode generated by a ResumePhi recipe of the main
9904 // loop VPlan, which feeds the scalar loop.
9905 ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
9906 }
9907 assert(ResumeV && "Must have a resume value");
9908 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9909 cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
9910 }
9911
9912 // For some VPValues in the epilogue plan we must re-use the generated IR
9913 // values from the main plan. Replace them with live-in VPValues.
9914 // TODO: This is a workaround needed for epilogue vectorization and it
9915 // should be removed once induction resume value creation is done
9916 // directly in VPlan.
9917 for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
9918 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9919 // epilogue plan. This ensures all users use the same frozen value.
9920 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9921 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9922 VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
9923 V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
9924 continue;
9925 }
9926
9927 // Re-use the trip count and steps expanded for the main loop, as
9928 // skeleton creation needs them as values that dominate both the scalar
9929 // and vector epilogue loops.
9930 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
9931 if (!ExpandR)
9932 continue;
9933 VPValue *ExpandedVal =
9934 Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
9935 ExpandR->replaceAllUsesWith(New: ExpandedVal);
9936 if (Plan.getTripCount() == ExpandR)
9937 Plan.resetTripCount(NewTripCount: ExpandedVal);
9938 ExpandR->eraseFromParent();
9939 }
9940}
9941
9942// Generate bypass values from the additional bypass block. Note that when the
9943 // vectorized epilogue is skipped due to the iteration count check, the
9944// resume value for the induction variable comes from the trip count of the
9945// main vector loop, passed as the second argument.
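//
// Illustrative sketch: for a non-primary add induction i = start + k * step,
// the bypass value is computed from the main loop's vector trip count as
// start + MainVectorTripCount * step (via emitTransformedIndex); the primary
// induction simply resumes at MainVectorTripCount.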
9946static Value *createInductionAdditionalBypassValues(
9947 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9948 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9949 Instruction *OldInduction) {
9950 Value *Step = getExpandedStep(ID: II, ExpandedSCEVs);
9951 // For the primary induction the additional bypass end value is known.
9952 // Otherwise it is computed.
9953 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9954 if (OrigPhi != OldInduction) {
9955 auto *BinOp = II.getInductionBinOp();
9956 // Fast-math-flags propagate from the original induction instruction.
9957 if (isa_and_nonnull<FPMathOperator>(Val: BinOp))
9958 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9959
9960 // Compute the end value for the additional bypass.
9961 EndValueFromAdditionalBypass =
9962 emitTransformedIndex(B&: BypassBuilder, Index: MainVectorTripCount,
9963 StartValue: II.getStartValue(), Step, InductionKind: II.getKind(), InductionBinOp: BinOp);
9964 EndValueFromAdditionalBypass->setName("ind.end");
9965 }
9966 return EndValueFromAdditionalBypass;
9967}
9968
9969bool LoopVectorizePass::processLoop(Loop *L) {
9970 assert((EnableVPlanNativePath || L->isInnermost()) &&
9971 "VPlan-native path is not enabled. Only process inner loops.");
9972
9973 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9974 << L->getHeader()->getParent()->getName() << "' from "
9975 << L->getLocStr() << "\n");
9976
9977 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9978
9979 LLVM_DEBUG(
9980 dbgs() << "LV: Loop hints:"
9981 << " force="
9982 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9983 ? "disabled"
9984 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9985 ? "enabled"
9986 : "?"))
9987 << " width=" << Hints.getWidth()
9988 << " interleave=" << Hints.getInterleave() << "\n");
9989
9990 // Function containing loop
9991 Function *F = L->getHeader()->getParent();
9992
9993 // Looking at the diagnostic output is the only way to determine if a loop
9994 // was vectorized (other than looking at the IR or machine code), so it
9995 // is important to generate an optimization remark for each loop. Most of
9996 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9997 // generated as OptimizationRemark and OptimizationRemarkMissed are less
9998 // verbose; they report vectorized loops and unvectorized loops that may
9999 // benefit from vectorization, respectively.
10000
10001 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10002 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10003 return false;
10004 }
10005
10006 PredicatedScalarEvolution PSE(*SE, *L);
10007
10008 // Check if it is legal to vectorize the loop.
10009 LoopVectorizationRequirements Requirements;
10010 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
10011 &Requirements, &Hints, DB, AC, BFI, PSI);
10012 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
10013 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10014 Hints.emitRemarkWithHints();
10015 return false;
10016 }
10017
10018 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
10019 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
10020 "early exit is not enabled",
10021 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
10022 return false;
10023 }
10024
10025 // Entrance to the VPlan-native vectorization path. Outer loops are processed
10026 // here. They may require CFG and instruction level transformations before
10027 // even evaluating whether vectorization is profitable. Since we cannot modify
10028 // the incoming IR, we need to build VPlan upfront in the vectorization
10029 // pipeline.
10030 if (!L->isInnermost())
10031 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
10032 ORE, BFI, PSI, Hints, Requirements);
10033
10034 assert(L->isInnermost() && "Inner loop expected.");
10035
10036 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10037 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10038
10039 // If an override option has been passed in for interleaved accesses, use it.
10040 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10041 UseInterleaved = EnableInterleavedMemAccesses;
10042
10043 // Analyze interleaved memory accesses.
10044 if (UseInterleaved)
10045 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
10046
10047 if (LVL.hasUncountableEarlyExit()) {
10048 BasicBlock *LoopLatch = L->getLoopLatch();
10049 if (IAI.requiresScalarEpilogue() ||
10050 any_of(Range: LVL.getCountableExitingBlocks(),
10051 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
10052 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
10053 "requiring a scalar epilogue is unsupported",
10054 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
10055 return false;
10056 }
10057 }
10058
10059 // Check the function attributes and profiles to find out if this function
10060 // should be optimized for size.
10061 ScalarEpilogueLowering SEL =
10062 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI);
10063
10064 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10065 // count by optimizing for size, to minimize overheads.
10066 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10067 if (ExpectedTC && ExpectedTC->isFixed() &&
10068 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
10069 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10070 << "This loop is worth vectorizing only if no scalar "
10071 << "iteration overheads are incurred.");
10072 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10073 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10074 else {
10075 LLVM_DEBUG(dbgs() << "\n");
10076 // Predicated tail-folded loops are efficient even when the loop
10077 // iteration count is low. However, setting the epilogue policy to
10078 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10079 // with runtime checks. It's more effective to let
10080 // `isOutsideLoopWorkProfitable` determine if vectorization is
10081 // beneficial for the loop.
10082 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10083 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10084 }
10085 }
10086
10087 // Check the function attributes to see if implicit floats or vectors are
10088 // allowed.
10089 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
10090 reportVectorizationFailure(
10091 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
10092 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
10093 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
10094 Hints.emitRemarkWithHints();
10095 return false;
10096 }
10097
10098 // Check if the target supports potentially unsafe FP vectorization.
10099 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10100 // for the target we're vectorizing for, to make sure none of the
10101 // additional fp-math flags can help.
10102 if (Hints.isPotentiallyUnsafe() &&
10103 TTI->isFPVectorizationPotentiallyUnsafe()) {
10104 reportVectorizationFailure(
10105 DebugMsg: "Potentially unsafe FP op prevents vectorization",
10106 OREMsg: "loop not vectorized due to unsafe FP support.",
10107 ORETag: "UnsafeFP", ORE, TheLoop: L);
10108 Hints.emitRemarkWithHints();
10109 return false;
10110 }
10111
10112 bool AllowOrderedReductions;
10113 // If the flag is set, use that instead and override the TTI behaviour.
10114 if (ForceOrderedReductions.getNumOccurrences() > 0)
10115 AllowOrderedReductions = ForceOrderedReductions;
10116 else
10117 AllowOrderedReductions = TTI->enableOrderedReductions();
10118 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
10119 ORE->emit(RemarkBuilder: [&]() {
10120 auto *ExactFPMathInst = Requirements.getExactFPInst();
10121 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10122 ExactFPMathInst->getDebugLoc(),
10123 ExactFPMathInst->getParent())
10124 << "loop not vectorized: cannot prove it is safe to reorder "
10125 "floating-point operations";
10126 });
10127 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10128 "reorder floating-point operations\n");
10129 Hints.emitRemarkWithHints();
10130 return false;
10131 }
10132
10133 // Use the cost model.
10134 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10135 F, &Hints, IAI, PSI, BFI);
10136 // Use the planner for vectorization.
10137 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10138 ORE);
10139
10140 // Get user vectorization factor and interleave count.
10141 ElementCount UserVF = Hints.getWidth();
10142 unsigned UserIC = Hints.getInterleave();
10143 if (LVL.hasUncountableEarlyExit() && UserIC != 1 &&
10144 !VectorizerParams::isInterleaveForced()) {
10145 UserIC = 1;
10146 reportVectorizationInfo(Msg: "Interleaving not supported for loops "
10147 "with uncountable early exits",
10148 ORETag: "InterleaveEarlyExitDisabled", ORE, TheLoop: L);
10149 }
10150
10151 // Plan how to best vectorize.
10152 LVP.plan(UserVF, UserIC);
10153 VectorizationFactor VF = LVP.computeBestVF();
10154 unsigned IC = 1;
10155
10156 if (ORE->allowExtraAnalysis(LV_NAME))
10157 LVP.emitInvalidCostRemarks(ORE);
10158
10159 bool AddBranchWeights =
10160 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
10161 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10162 AddBranchWeights, CM.CostKind);
10163 if (LVP.hasPlanWithVF(VF: VF.Width)) {
10164 // Select the interleave count.
10165 IC = CM.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
10166
10167 unsigned SelectedIC = std::max(a: IC, b: UserIC);
10168 // Optimistically generate runtime checks if they are needed. Drop them if
10169 // they turn out to not be profitable.
10170 if (VF.Width.isVector() || SelectedIC > 1)
10171 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC);
10172
10173 // Check if it is profitable to vectorize with runtime checks.
10174 bool ForceVectorization =
10175 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10176 VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10177 CM, CM.CostKind);
10178 if (!ForceVectorization &&
10179 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
10180 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
10181 VScale: CM.getVScaleForTuning())) {
10182 ORE->emit(RemarkBuilder: [&]() {
10183 return OptimizationRemarkAnalysisAliasing(
10184 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10185 L->getHeader())
10186 << "loop not vectorized: cannot prove it is safe to reorder "
10187 "memory operations";
10188 });
10189 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10190 Hints.emitRemarkWithHints();
10191 return false;
10192 }
10193 }
10194
10195 // Identify the diagnostic messages that should be produced.
10196 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10197 bool VectorizeLoop = true, InterleaveLoop = true;
10198 if (VF.Width.isScalar()) {
10199 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10200 VecDiagMsg = {
10201 "VectorizationNotBeneficial",
10202 "the cost-model indicates that vectorization is not beneficial"};
10203 VectorizeLoop = false;
10204 }
10205
10206 if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
10207 // Tell the user interleaving was avoided up-front, despite being explicitly
10208 // requested.
10209 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10210 "interleaving should be avoided up front\n");
10211 IntDiagMsg = {"InterleavingAvoided",
10212 "Ignoring UserIC, because interleaving was avoided up front"};
10213 InterleaveLoop = false;
10214 } else if (IC == 1 && UserIC <= 1) {
10215 // Tell the user interleaving is not beneficial.
10216 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10217 IntDiagMsg = {
10218 "InterleavingNotBeneficial",
10219 "the cost-model indicates that interleaving is not beneficial"};
10220 InterleaveLoop = false;
10221 if (UserIC == 1) {
10222 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10223 IntDiagMsg.second +=
10224 " and is explicitly disabled or interleave count is set to 1";
10225 }
10226 } else if (IC > 1 && UserIC == 1) {
10227 // Tell the user interleaving is beneficial, but it is explicitly disabled.
10228 LLVM_DEBUG(
10229 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10230 IntDiagMsg = {"InterleavingBeneficialButDisabled",
10231 "the cost-model indicates that interleaving is beneficial "
10232 "but is explicitly disabled or interleave count is set to 1"};
10233 InterleaveLoop = false;
10234 }
10235
10236 // If there is a histogram in the loop, do not just interleave without
10237 // vectorizing. The order of operations will be incorrect without the
10238 // histogram intrinsics, which are only used for recipes with VF > 1.
10239 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10240 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10241 << "to histogram operations.\n");
10242 IntDiagMsg = {
10243 "HistogramPreventsScalarInterleaving",
10244 "Unable to interleave without vectorization due to constraints on "
10245 "the order of histogram operations"};
10246 InterleaveLoop = false;
10247 }
10248
10249 // Override IC if user provided an interleave count.
10250 IC = UserIC > 0 ? UserIC : IC;
10251
10252 // Emit diagnostic messages, if any.
10253 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10254 if (!VectorizeLoop && !InterleaveLoop) {
10255 // Do not vectorize or interleave the loop.
10256 ORE->emit(RemarkBuilder: [&]() {
10257 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10258 L->getStartLoc(), L->getHeader())
10259 << VecDiagMsg.second;
10260 });
10261 ORE->emit(RemarkBuilder: [&]() {
10262 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10263 L->getStartLoc(), L->getHeader())
10264 << IntDiagMsg.second;
10265 });
10266 return false;
10267 }
10268
10269 if (!VectorizeLoop && InterleaveLoop) {
10270 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10271 ORE->emit(RemarkBuilder: [&]() {
10272 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10273 L->getStartLoc(), L->getHeader())
10274 << VecDiagMsg.second;
10275 });
10276 } else if (VectorizeLoop && !InterleaveLoop) {
10277 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10278 << ") in " << L->getLocStr() << '\n');
10279 ORE->emit(RemarkBuilder: [&]() {
10280 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10281 L->getStartLoc(), L->getHeader())
10282 << IntDiagMsg.second;
10283 });
10284 } else if (VectorizeLoop && InterleaveLoop) {
10285 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10286 << ") in " << L->getLocStr() << '\n');
10287 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10288 }
10289
10290 bool DisableRuntimeUnroll = false;
10291 MDNode *OrigLoopID = L->getLoopID();
10292 {
10293 using namespace ore;
10294 if (!VectorizeLoop) {
10295 assert(IC > 1 && "interleave count should not be 1 or 0");
10296 // If we decided that it is not profitable to vectorize the loop, then
10297 // interleave it.
      VPlan &BestPlan = LVP.getPlanFor(VF.Width);
      InnerLoopVectorizer Unroller(
          L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1),
          ElementCount::getFixed(1), IC, &CM, BFI, PSI, Checks, BestPlan);

      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
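      // For example, with IC == 4 the remark above reads:
      //   "interleaved loop (interleaved count: 4)"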
    } else {
      // If we decided to vectorize the loop, then do it.

      VPlan &BestPlan = LVP.getPlanFor(VF.Width);
      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          LVP.selectEpilogueVectorizationFactor(VF.Width, IC);
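      // Epilogue vectorization: when the cost model selects a vector VF for
      // the remainder iterations, the loop is vectorized in two passes: the
      // main loop with VF.Width, then the leftover iterations with the
      // smaller EpilogueVF.Width.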
      if (EpilogueVF.Width.isVector()) {
        std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());

        // The first pass vectorizes the main loop and creates a scalar
        // epilogue to be vectorized by executing the plan (potentially with a
        // different factor) again shortly afterwards.
        VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
        BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
        preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
                                          BestEpiPlan);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &CM, BFI, PSI, Checks,
                                           *BestMainPlan);
        auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
                                             *BestMainPlan, MainILV, DT, false);
        ++LoopsVectorized;

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &CM, BFI, PSI,
                                                 Checks, BestEpiPlan);
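        // Reuse the trip count already computed for the main vector loop
        // rather than expanding it a second time for the epilogue.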
        EpilogILV.setTripCount(MainILV.getTripCount());
        preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);

        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT, true);

        // Fix induction resume values from the additional bypass block.
        BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
        IRBuilder<> BypassBuilder(BypassBlock,
                                  BypassBlock->getFirstInsertionPt());
        BasicBlock *PH = L->getLoopPreheader();
        for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
          auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
          Value *V = createInductionAdditionalBypassValues(
              IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
              LVL.getPrimaryInduction());
          // TODO: Directly add as extra operand to the VPResumePHI recipe.
          Inc->setIncomingValueForBlock(BypassBlock, V);
        }
        ++LoopsEpilogueVectorized;

        if (!Checks.hasChecks())
          DisableRuntimeUnroll = true;
      } else {
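        // No vector epilogue was selected: vectorize the loop in a single
        // pass with the chosen VF and interleave count.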
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                               VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
                               Checks, BestPlan);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks about strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
        if (!Checks.hasChecks())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      reportVectorization(ORE, L, VF, IC);
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
         "DT not preserved correctly");

  std::optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
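  // If the original loop carried followup metadata for the remainder loop,
  // transfer it; otherwise mark the loop as already vectorized (and, if no
  // runtime checks were generated, as not worth runtime unrolling) so that it
  // is not transformed again.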
  if (RemainderLoopID) {
    L->setLoopID(*RemainderLoopID);
  } else {
    if (DisableRuntimeUnroll)
      addRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}

LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
  // Don't attempt vectorization if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(ElementCount::getFixed(1)) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
  for (const auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);

    if (Changed) {
      LAIs->clear();

#ifndef NDEBUG
      if (VerifySCEV)
        SE->verify();
#endif
    }
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}

PreservedAnalyses LoopVectorizePass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  LI = &AM.getResult<LoopAnalysis>(F);
  // If there are no loops in the function, return before computing other
  // expensive analyses.
  if (LI->empty())
    return PreservedAnalyses::all();
  SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  TTI = &AM.getResult<TargetIRAnalysis>(F);
  DT = &AM.getResult<DominatorTreeAnalysis>(F);
  TLI = &AM.getResult<TargetLibraryAnalysis>(F);
  AC = &AM.getResult<AssumptionAnalysis>(F);
  DB = &AM.getResult<DemandedBitsAnalysis>(F);
  ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  LAIs = &AM.getResult<LoopAccessAnalysis>(F);

  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  BFI = nullptr;
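  // BlockFrequencyInfo is only requested when a profile summary is available;
  // together with PSI it is used for profile-based decisions later on.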
  if (PSI && PSI->hasProfileSummary())
    BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
  LoopVectorizeResult Result = runImpl(F);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

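  // With assignment tracking enabled, the transformations above can leave
  // redundant debug records behind; strip them before returning.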
  if (isAssignmentTrackingEnabled(*F.getParent())) {
    for (auto &BB : F)
      RemoveRedundantDbgInstrs(&BB);
  }

  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<ScalarEvolutionAnalysis>();
  PA.preserve<LoopAccessAnalysis>();

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
}

void LoopVectorizePass::printPipeline(
    raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
  static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
      OS, MapClassName2PassName);

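  // With the default flags this prints, e.g.
  //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>
  // (assuming the pass is registered under the name "loop-vectorize"; the
  // name itself is printed by the base printPipeline call above).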
  OS << '<';
  OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
  OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
  OS << '>';
}