1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
97#include "llvm/Analysis/TargetLibraryInfo.h"
98#include "llvm/Analysis/TargetTransformInfo.h"
99#include "llvm/Analysis/ValueTracking.h"
100#include "llvm/Analysis/VectorUtils.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/DiagnosticInfo.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
117#include "llvm/IR/IntrinsicInst.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/ProfDataUtils.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
131#include "llvm/Support/CommandLine.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/NativeFormatting.h"
137#include "llvm/Support/raw_ostream.h"
138#include "llvm/Transforms/Utils/BasicBlockUtils.h"
139#include "llvm/Transforms/Utils/InjectTLIMappings.h"
140#include "llvm/Transforms/Utils/Local.h"
141#include "llvm/Transforms/Utils/LoopSimplify.h"
142#include "llvm/Transforms/Utils/LoopUtils.h"
143#include "llvm/Transforms/Utils/LoopVersioning.h"
144#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
145#include "llvm/Transforms/Utils/SizeOpts.h"
146#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
172STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
173
/// Master switch for vectorizing the scalar remainder (epilogue) loop that is
/// left over after the main vector loop.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// Testing knob: when a value greater than 1 is given, forces that VF for all
/// applicable epilogue loops instead of letting the cost model pick one.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Minimum main-loop VF for which epilogue vectorization is considered. Note
/// there is no cl::init here, so the option is unset (zero) unless given on
/// the command line.
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// Upper bound on the number of runtime memory checks the vectorizer may emit
/// before giving up on vectorizing a loop.
static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));
200
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists all
// options. I.e., the vectorizer will try to fold the tail-loop (epilogue) into
// the vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,          // Do not tail-fold; emit a scalar epilogue.
    PredicateElseScalarEpilogue, // Tail-fold; scalar epilogue on failure.
    PredicateOrDontVectorize     // Tail-fold; give up entirely on failure.
  };
} // namespace PreferPredicateTy

/// Selects the tail-folding/predication strategy (see enum above).
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(Val: PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
231
/// Overrides the tail-folding style chosen by the target/cost model; the
/// enumerator descriptions below explain each choice.
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(Val: TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
                   "data-and-control-without-rt-check",
                   "Similar to data-and-control, but remove the runtime check"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fallback to data-without-lane-mask.")));

/// Allow wide lane masks to be used for control flow in tail-folded loops.
cl::opt<bool> llvm::EnableWideActiveLaneMask(
    "enable-wide-lane-mask", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable use of wide lane masks when used for control flow in "
             "tail-folded loops"));
257
/// When set, consider larger VFs so that the widest type in the loop fills a
/// full vector register (smaller types then occupy multiple registers).
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Master switch for vectorizing interleaved memory accesses.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

/// Testing override for the target's reported number of scalar registers.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

/// Testing override for the target's reported number of vector registers.
static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

/// Testing override for the target's max interleave factor on scalar loops.
static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

/// Testing override for the target's max interleave factor on vector loops.
static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

/// Testing override: give every instruction this constant cost (0 = disabled).
cl::opt<unsigned> llvm::ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));
296
/// Testing knob: pretend the target supports scalable vectors.
static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Loops with an estimated cost below this threshold count as "small" for
/// interleaving decisions.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(Val: 20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// Use block-frequency/PGO data to be conservative in cold code and more
/// aggressive in hot code.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// Register-pressure heuristic: count the induction variable only once when
/// interleaving (rather than once per interleaved copy).
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Allow stores under a condition to be vectorized via if-predication.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Cap on the interleave count for scalar reductions in nested loops.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));
337
/// Force in-loop (as opposed to out-of-loop/tail) vector reductions even when
/// the target would not prefer them.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

/// Allow vectorizing loops that require in-order (strict) FP reductions.
static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

/// Prefer predicating the reduction operation itself over selecting the
/// result after the loop.
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

/// Master switch for the VPlan-native path (outer-loop vectorization).
cl::opt<bool> llvm::EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));
358
359cl::opt<bool>
360 llvm::VerifyEachVPlan("vplan-verify-each",
361#ifdef EXPENSIVE_CHECKS
362 cl::init(true),
363#else
364 cl::init(Val: false),
365#endif
366 cl::Hidden,
367 cl::desc("Verfiy VPlans after VPlan transforms."));
368
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Debug-build only: dump every VPlan after every VPlan transformation.
cl::opt<bool> llvm::VPlanPrintAfterAll(
    "vplan-print-after-all", cl::init(false), cl::Hidden,
    cl::desc("Print VPlans after all VPlan transformations."));

/// Debug-build only: dump VPlans after the transformations whose names match
/// one of the given regular expressions.
cl::list<std::string> llvm::VPlanPrintAfterPasses(
    "vplan-print-after", cl::Hidden,
    cl::desc("Print VPlans after specified VPlan transformations (regexp)."));

/// Debug-build only: restrict the dumps above to the vector loop region.
cl::opt<bool> llvm::VPlanPrintVectorRegionScope(
    "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
    cl::desc("Limit VPlan printing to vector loop region in "
             "`-vplan-print-after*` if the plan has one."));
#endif
383
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

/// Master switch for loop interleaving in the vectorization passes.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
/// Master switch for running loop vectorization at all.
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(Val: true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// Tri-state override for widening div/rem via a safe divisor instead of
/// deferring to the cost model.
static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

/// Consider wider VFs when they allow calls to be lowered to vector variants.
static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

/// Master switch for vectorizing loops with uncountable early exits.
static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

/// When set, VFs whose register pressure is too high are discarded.
static cl::opt<bool> ConsiderRegPressure(
    "vectorizer-consider-reg-pressure", cl::init(Val: false), cl::Hidden,
    cl::desc("Discard VFs if their register pressure is too high."));

// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
424
425/// A helper function that returns true if the given type is irregular. The
426/// type is irregular if its allocated size doesn't equal the store size of an
427/// element of the corresponding vector type.
428static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
429 // Determine if an array of N elements of type Ty is "bitcast compatible"
430 // with a <N x Ty> vector.
431 // This is only true if there is no padding between the array elements.
432 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
433}
434
/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
/// ElementCount to include loops whose trip count is a function of vscale.
/// Returns a fixed count of 0 when no constant (fixed or scalable) trip count
/// can be determined.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
                                              const Loop *L) {
  // Fixed constant trip count: use SCEV's answer directly.
  if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
    return ElementCount::getFixed(MinVal: ExpectedTC);

  const SCEV *BTC = SE->getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(Val: BTC))
    return ElementCount::getFixed(MinVal: 0);

  const SCEV *ExitCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
  // Trip count is exactly vscale.
  if (isa<SCEVVScale>(Val: ExitCount))
    return ElementCount::getScalable(MinVal: 1);

  // Trip count is (Scale * vscale): only usable when the multiply cannot wrap
  // and the scale fits in 32 bits.
  const APInt *Scale;
  if (match(S: ExitCount, P: m_scev_Mul(Op0: m_scev_APInt(C&: Scale), Op1: m_SCEVVScale())))
    if (cast<SCEVMulExpr>(Val: ExitCount)->hasNoUnsignedWrap())
      if (Scale->getActiveBits() <= 32)
        return ElementCount::getScalable(MinVal: Scale->getZExtValue());

  return ElementCount::getFixed(MinVal: 0);
}
458
459/// Returns "best known" trip count, which is either a valid positive trip count
460/// or std::nullopt when an estimate cannot be made (including when the trip
461/// count would overflow), for the specified loop \p L as defined by the
462/// following procedure:
463/// 1) Returns exact trip count if it is known.
464/// 2) Returns expected trip count according to profile data if any.
465/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
466/// 4) Returns std::nullopt if all of the above failed.
467static std::optional<ElementCount>
468getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
469 bool CanUseConstantMax = true) {
470 // Check if exact trip count is known.
471 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
472 return ExpectedTC;
473
474 // Check if there is an expected trip count available from profile data.
475 if (LoopVectorizeWithBlockFrequency)
476 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
477 return ElementCount::getFixed(MinVal: *EstimatedTC);
478
479 if (!CanUseConstantMax)
480 return std::nullopt;
481
482 // Check if upper bound estimate is known.
483 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
484 return ElementCount::getFixed(MinVal: ExpectedTC);
485
486 return std::nullopt;
487}
488
namespace {
// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

// Shorthand for a map from SCEV expressions to IR values.
using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace
495
496namespace llvm {
497
// Out-of-line definition of the analysis key for ShouldRunExtraVectorPasses.
AnalysisKey ShouldRunExtraVectorPasses::Key;
499
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  /// \p VecWidth is the VF to vectorize with and \p UnrollFactor the
  /// interleave (unroll) factor; \p Plan is the VPlan used to drive
  /// vectorization.
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      ElementCount VecWidth, unsigned UnrollFactor,
                      LoopVectorizationCostModel *CM,
                      GeneratedRTChecks &RTChecks, VPlan &Plan)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
        Cost(CM), RTChecks(RTChecks), Plan(Plan),
        VectorPHVPBB(cast<VPBasicBlock>(
            Val: Plan.getVectorLoopRegion()->getSinglePredecessor())) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Creates a basic block for the scalar preheader. Both
  /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
  /// the method to create additional blocks and checks needed for epilogue
  /// vectorization.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Fix the vectorized code, taking care of header phi's, and more.
  void fixVectorizedLoop(VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// Create and return a new IR basic block for the scalar preheader whose name
  /// is prefixed with \p Prefix.
  BasicBlock *createScalarPreheader(StringRef Prefix);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// The VPlan driving this vectorization.
  VPlan &Plan;

  /// The vector preheader block of \p Plan, used as target for check blocks
  /// introduced during skeleton creation.
  VPBasicBlock *VectorPHVPBB;
};
611
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// VF chosen for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
  /// Unroll (interleave) factor of the main vector loop.
  unsigned MainLoopUF = 0;
  /// VF chosen for the epilogue vector loop.
  ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
  /// Unroll factor of the epilogue loop; asserted to be 1 in the constructor.
  unsigned EpilogueUF = 0;
  /// Iteration-count check blocks created during skeleton creation.
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  /// Original loop trip count and the main loop's vector trip count.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;
  /// The VPlan to execute for the epilogue loop.
  VPlan &EpiloguePlan;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF,
                                VPlan &EpiloguePlan)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
        EpiloguePlan(EpiloguePlan) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
635
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
      EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
      GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
      ElementCount MinProfitableTripCount, unsigned UnrollFactor)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
                            UnrollFactor, CM, Checks, Plan),
        EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;

protected:
  /// Minimum trip count required for the vector loop to be profitable.
  ElementCount MinProfitableTripCount;
};
666
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  /// Note: the main-loop VF doubles as the minimum profitable trip count
  /// passed to the base class.
  EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                             LoopInfo *LI, DominatorTree *DT,
                             const TargetTransformInfo *TTI,
                             AssumptionCache *AC,
                             EpilogueLoopVectorizationInfo &EPI,
                             LoopVectorizationCostModel *CM,
                             GeneratedRTChecks &Check, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                       Check, Plan, EPI.MainLoopVF,
                                       EPI.MainLoopVF, EPI.MainLoopUF) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of VPlan execution).
  BasicBlock *createVectorizedLoopSkeleton() final;

protected:
  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
  /// vector preheader and its predecessor, also connecting the new block to the
  /// scalar preheader.
  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);

  /// Create a check to see if the main vector loop should be executed.
  Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF,
                                   unsigned UF) const;

  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass,
                                      bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
704
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  /// Note: the epilogue VF doubles as the minimum profitable trip count
  /// passed to the base class.
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationCostModel *CM,
                                 GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                       Checks, Plan, EPI.EpilogueVF,
                                       EPI.EpilogueVF, EPI.EpilogueUF) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
  BasicBlock *createVectorizedLoopSkeleton() final;

protected:
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
728} // end namespace llvm
729
730/// Look for a meaningful debug location on the instruction or its operands.
731static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
732 if (!I)
733 return DebugLoc::getUnknown();
734
735 DebugLoc Empty;
736 if (I->getDebugLoc() != Empty)
737 return I->getDebugLoc();
738
739 for (Use &Op : I->operands()) {
740 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
741 if (OpInst->getDebugLoc() != Empty)
742 return OpInst->getDebugLoc();
743 }
744
745 return I->getDebugLoc();
746}
747
748/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
749/// is passed, the message relates to that particular instruction.
750#ifndef NDEBUG
751static void debugVectorizationMessage(const StringRef Prefix,
752 const StringRef DebugMsg,
753 Instruction *I) {
754 dbgs() << "LV: " << Prefix << DebugMsg;
755 if (I != nullptr)
756 dbgs() << " " << *I;
757 else
758 dbgs() << '.';
759 dbgs() << '\n';
760}
761#endif
762
763/// Create an analysis remark that explains why vectorization failed
764///
765/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
766/// RemarkName is the identifier for the remark. If \p I is passed it is an
767/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
768/// the location of the remark. If \p DL is passed, use it as debug location for
769/// the remark. \return the remark object that can be streamed to.
770static OptimizationRemarkAnalysis
771createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
772 Instruction *I, DebugLoc DL = {}) {
773 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
774 // If debug location is attached to the instruction, use it. Otherwise if DL
775 // was not provided, use the loop's.
776 if (I && I->getDebugLoc())
777 DL = I->getDebugLoc();
778 else if (!DL)
779 DL = TheLoop->getStartLoc();
780
781 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
782}
783
784namespace llvm {
785
786/// Return a value for Step multiplied by VF.
787Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
788 int64_t Step) {
789 assert(Ty->isIntegerTy() && "Expected an integer step");
790 ElementCount VFxStep = VF.multiplyCoefficientBy(RHS: Step);
791 assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
792 if (VF.isScalable() && isPowerOf2_64(Value: Step)) {
793 return B.CreateShl(
794 LHS: B.CreateVScale(Ty),
795 RHS: ConstantInt::get(Ty, V: Log2_64(Value: VFxStep.getKnownMinValue())), Name: "", HasNUW: true);
796 }
797 return B.CreateElementCount(Ty, EC: VFxStep);
798}
799
800/// Return the runtime value for VF.
801Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
802 return B.CreateElementCount(Ty, EC: VF);
803}
804
805void reportVectorizationFailure(const StringRef DebugMsg,
806 const StringRef OREMsg, const StringRef ORETag,
807 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
808 Instruction *I) {
809 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
810 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
811 ORE->emit(
812 OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
813 << "loop not vectorized: " << OREMsg);
814}
815
816/// Reports an informative message: print \p Msg for debugging purposes as well
817/// as an optimization remark. Uses either \p I as location of the remark, or
818/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
819/// remark. If \p DL is passed, use it as debug location for the remark.
820static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
821 OptimizationRemarkEmitter *ORE,
822 Loop *TheLoop, Instruction *I = nullptr,
823 DebugLoc DL = {}) {
824 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
825 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
826 ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
827 I, DL)
828 << Msg);
829}
830
831/// Report successful vectorization of the loop. In case an outer loop is
832/// vectorized, prepend "outer" to the vectorization remark.
833static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
834 VectorizationFactor VF, unsigned IC) {
835 LLVM_DEBUG(debugVectorizationMessage(
836 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
837 nullptr));
838 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
839 ORE->emit(RemarkBuilder: [&]() {
840 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
841 TheLoop->getHeader())
842 << "vectorized " << LoopType << "loop (vectorization width: "
843 << ore::NV("VectorizationFactor", VF.Width)
844 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
845 });
846}
847
848} // end namespace llvm
849
850namespace llvm {
851
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
874
875/// LoopVectorizationCostModel - estimates the expected speedups due to
876/// vectorization.
877/// In many cases vectorization is not profitable. This can happen because of
878/// a number of reasons. In this class we mainly attempt to predict the
879/// expected speedup/slowdowns due to the supported instruction set. We use the
880/// TargetTransformInfo to query the different backends for the cost of
881/// different operations.
882class LoopVectorizationCostModel {
883 friend class LoopVectorizationPlanner;
884
885public:
  /// Construct the cost model for loop \p L with the given analyses and the
  /// scalar-epilogue lowering decision \p SEL.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE,
                             std::function<BlockFrequencyInfo &()> GetBFI,
                             const Function *F, const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI, bool OptForSize)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
        TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
        OptForSize(OptForSize) {
    // Only query vscale when the target (or a force flag) claims scalable
    // vector support; VScaleForTuning stays unset otherwise.
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    // Cost for code size when the function has minsize, otherwise for
    // reciprocal throughput.
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
  }
904
905 /// \return An upper bound for the vectorization factors (both fixed and
906 /// scalable). If the factors are 0, vectorization and interleaving should be
907 /// avoided up front.
908 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
909
910 /// \return True if runtime checks are required for vectorization, and false
911 /// otherwise.
912 bool runtimeChecksRequired();
913
  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen, i.e. its
  /// expected cost is valid.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    // Populate widening decisions, uniforms and scalars for UserVF before
    // costing it.
    collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
    return expectedCost(VF: UserVF).isValid();
  }
920
921 /// \return True if maximizing vector bandwidth is enabled by the target or
922 /// user options, for the given register kind.
923 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
924
925 /// \return True if register pressure should be considered for the given VF.
926 bool shouldConsiderRegPressureForVF(ElementCount VF);
927
928 /// \return The size (in bits) of the smallest and widest types in the code
929 /// that needs to be vectorized. We ignore values that remain scalar such as
930 /// 64 bit loop indices.
931 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
932
933 /// Memory access instruction may be vectorized in more than one way.
934 /// Form of instruction after vectorization depends on cost.
935 /// This function takes cost-based decisions for Load/Store instructions
936 /// and collects them in a map. This decisions map is used for building
937 /// the lists of loop-uniform and loop-scalar instructions.
938 /// The calculated cost is saved with widening decision in order to
939 /// avoid redundant calculations.
940 void setCostBasedWideningDecision(ElementCount VF);
941
942 /// A call may be vectorized in different ways depending on whether we have
943 /// vectorized variants available and whether the target supports masking.
944 /// This function analyzes all calls in the function at the supplied VF,
945 /// makes a decision based on the costs of available options, and stores that
946 /// decision in a map for use in planning and plan execution.
947 void setVectorizedCallDecision(ElementCount VF);
948
949 /// Collect values we want to ignore in the cost model.
950 void collectValuesToIgnore();
951
952 /// Collect all element types in the loop for which widening is needed.
953 void collectElementTypesForWidening();
954
955 /// Split reductions into those that happen in the loop, and those that happen
956 /// outside. In loop reductions are collected into InLoopReductions.
957 void collectInLoopReductions();
958
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    // A reordering hint overrides the descriptor's ordered flag.
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }
966
967 /// \returns The smallest bitwidth each instruction can be represented with.
968 /// The vector equivalents of these instructions should be truncated to this
969 /// type.
970 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
971 return MinBWs;
972 }
973
  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // InstsToScalarize is filled per-VF by collectInstsToScalarize(); it must
    // have been computed for this VF before querying.
    auto Scalars = InstsToScalarize.find(Key: VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(Key: I);
  }
988
  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(Val: I))
      return false;

    // At VF = 1 every instruction is trivially uniform.
    if (VF.isScalar())
      return true;

    // Uniforms is populated per-VF by collectLoopUniforms().
    auto UniformsPerVF = Uniforms.find(Val: VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(Ptr: I);
  }
1008
  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // At VF = 1 everything stays scalar by definition.
    if (VF.isScalar())
      return true;

    // Scalars is populated per-VF by collectLoopScalars().
    auto ScalarsPerVF = Scalars.find(Val: VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(Ptr: I);
  }
1022
  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    // Truncs must truncate at most to their destination type.
    if (isa_and_nonnull<TruncInst>(Val: I) && MinBWs.contains(Key: I) &&
        I->getType()->getScalarSizeInBits() < MinBWs.lookup(Key: I))
      return false;
    // Truncation only applies to widened (vector) code with a recorded
    // minimal bitwidth, and not to instructions that end up scalarized.
    return VF.isVector() && MinBWs.contains(Key: I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1034
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision recorded for this instruction/VF pair.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Member of an interleaved-access group.
    CM_GatherScatter, // Lowered as a masked gather/scatter.
    CM_Scalarize,     // Scalarized (replicated) access.
    CM_VectorCall,    // Call lowered to a vectorized library routine.
    CM_IntrinsicCall  // Call lowered to a vector intrinsic.
  };
1046
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Overwrites any previous decision for the same (I, VF) pair.
    WideningDecisions[{I, VF}] = {W, Cost};
  }
1054
1055 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1056 /// interleaving group \p Grp and vector width \p VF.
1057 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1058 ElementCount VF, InstWidening W,
1059 InstructionCost Cost) {
1060 assert(VF.isVector() && "Expected VF >=2");
1061 /// Broadcast this decicion to all instructions inside the group.
1062 /// When interleaving, the cost will only be assigned one instruction, the
1063 /// insert position. For other cases, add the appropriate fraction of the
1064 /// total cost to each instruction. This ensures accurate costs are used,
1065 /// even if the insert position instruction is not used.
1066 InstructionCost InsertPosCost = Cost;
1067 InstructionCost OtherMemberCost = 0;
1068 if (W != CM_Interleave)
1069 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1070 ;
1071 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1072 if (auto *I = Grp->getMember(Index: Idx)) {
1073 if (Grp->getInsertPos() == I)
1074 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1075 else
1076 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1077 }
1078 }
1079 }
1080
1081 /// Return the cost model decision for the given instruction \p I and vector
1082 /// width \p VF. Return CM_Unknown if this instruction did not pass
1083 /// through the cost modeling.
1084 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1085 assert(VF.isVector() && "Expected VF to be a vector VF");
1086 assert(
1087 TheLoop->isInnermost() &&
1088 "cost-model should not be used for outer loops (in VPlan-native path)");
1089
1090 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1091 auto Itr = WideningDecisions.find(Val: InstOnVF);
1092 if (Itr == WideningDecisions.end())
1093 return CM_Unknown;
1094 return Itr->second.first;
1095 }
1096
  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    // A decision (and hence a cost) must already have been recorded via
    // setWideningDecision for this instruction/VF pair.
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }
1106
  /// Lowering decision taken for a call: the widening kind, the vector
  /// variant or intrinsic to use, the mask operand position (if any), and
  /// the modeled cost.
  struct CallWideningDecision {
    InstWidening Kind;
    Function *Variant;
    Intrinsic::ID IID;
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  /// Record the call-lowering decision for \p CI at vector width \p VF.
  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost};
  }

  /// Return the recorded decision for \p CI at \p VF; a CM_Unknown decision
  /// is returned when nothing was recorded.
  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    auto I = CallWideningDecisions.find(Val: {CI, VF});
    if (I == CallWideningDecisions.end())
      return {.Kind: CM_Unknown, .Variant: nullptr, .IID: Intrinsic::not_intrinsic, .MaskPos: std::nullopt, .Cost: 0};
    return I->second;
  }
1131
1132 /// Return True if instruction \p I is an optimizable truncate whose operand
1133 /// is an induction variable. Such a truncate will be removed by adding a new
1134 /// induction variable with the destination type.
1135 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1136 // If the instruction is not a truncate, return false.
1137 auto *Trunc = dyn_cast<TruncInst>(Val: I);
1138 if (!Trunc)
1139 return false;
1140
1141 // Get the source and destination types of the truncate.
1142 Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
1143 Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);
1144
1145 // If the truncate is free for the given types, return false. Replacing a
1146 // free truncate with an induction variable would add an induction variable
1147 // update instruction to each iteration of the loop. We exclude from this
1148 // check the primary induction variable since it will need an update
1149 // instruction regardless.
1150 Value *Op = Trunc->getOperand(i_nocapture: 0);
1151 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
1152 return false;
1153
1154 // If the truncated value is not an induction variable, return false.
1155 return Legal->isInductionPhi(V: Op);
1156 }
1157
1158 /// Collects the instructions to scalarize for each predicated instruction in
1159 /// the loop.
1160 void collectInstsToScalarize(ElementCount VF);
1161
  /// Collect values that will not be widened, including Uniforms, Scalars, and
  /// Instructions to Scalarize for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
    // Do the analysis once; Uniforms doubles as the "already analyzed for
    // this VF" marker.
    if (VF.isScalar() || Uniforms.contains(Val: VF))
      return;
    // Widening decisions are recorded before the collections below, which
    // are expected to query them for this VF.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
    collectInstsToScalarize(VF);
  }
1179
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
                          unsigned AddressSpace) const {
    // Masked stores only apply to consecutive accesses; non-consecutive
    // accesses are handled by isLegalGatherOrScatter instead.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
                         unsigned AddressSpace) const {
    // Mirror of isLegalMaskedStore for loads.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
  }
1195
1196 /// Returns true if the target machine can represent \p V as a masked gather
1197 /// or scatter operation.
1198 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1199 bool LI = isa<LoadInst>(Val: V);
1200 bool SI = isa<StoreInst>(Val: V);
1201 if (!LI && !SI)
1202 return false;
1203 auto *Ty = getLoadStoreType(I: V);
1204 Align Align = getLoadStoreAlignment(I: V);
1205 if (VF.isVector())
1206 Ty = VectorType::get(ElementType: Ty, EC: VF);
1207 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1208 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1209 }
1210
  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    // Every reduction discovered by legality analysis must be vectorizable
    // at this VF.
    return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }
1219
1220 /// Given costs for both strategies, return true if the scalar predication
1221 /// lowering should be used for div/rem. This incorporates an override
1222 /// option so it is not simply a cost comparison.
1223 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1224 InstructionCost SafeDivisorCost) const {
1225 switch (ForceSafeDivisor) {
1226 case cl::BOU_UNSET:
1227 return ScalarCost < SafeDivisorCost;
1228 case cl::BOU_TRUE:
1229 return false;
1230 case cl::BOU_FALSE:
1231 return true;
1232 }
1233 llvm_unreachable("impossible case value");
1234 }
1235
1236 /// Returns true if \p I is an instruction which requires predication and
1237 /// for which our chosen predication strategy is scalarization (i.e. we
1238 /// don't have an alternate strategy such as masking available).
1239 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1240 bool isScalarWithPredication(Instruction *I, ElementCount VF);
1241
1242 /// Returns true if \p I is an instruction that needs to be predicated
1243 /// at runtime. The result is independent of the predication mechanism.
1244 /// Superset of instructions that return true for isScalarWithPredication.
1245 bool isPredicatedInst(Instruction *I) const;
1246
1247 /// A helper function that returns how much we should divide the cost of a
1248 /// predicated block by. Typically this is the reciprocal of the block
1249 /// probability, i.e. if we return X we are assuming the predicated block will
1250 /// execute once for every X iterations of the loop header so the block should
1251 /// only contribute 1/X of its cost to the total cost calculation, but when
1252 /// optimizing for code size it will just be 1 as code size costs don't depend
1253 /// on execution probabilities.
1254 ///
1255 /// Note that if a block wasn't originally predicated but was predicated due
1256 /// to tail folding, the divisor will still be 1 because it will execute for
1257 /// every iteration of the loop header.
1258 inline uint64_t
1259 getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1260 const BasicBlock *BB);
1261
1262 /// Returns true if an artificially high cost for emulated masked memrefs
1263 /// should be used.
1264 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1265
1266 /// Return the costs for our two available strategies for lowering a
1267 /// div/rem operation which requires speculating at least one lane.
1268 /// First result is for scalarization (will be invalid for scalable
1269 /// vectors); second is for the safe-divisor strategy.
1270 std::pair<InstructionCost, InstructionCost>
1271 getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1272
1273 /// Returns true if \p I is a memory instruction with consecutive memory
1274 /// access that can be widened.
1275 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1276
1277 /// Returns true if \p I is a memory instruction in an interleaved-group
1278 /// of memory accesses that can be vectorized with wide vector loads/stores
1279 /// and shuffles.
1280 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1281
1282 /// Check if \p Instr belongs to any interleaved access group.
1283 bool isAccessInterleaved(Instruction *Instr) const {
1284 return InterleaveInfo.isInterleaved(Instr);
1285 }
1286
1287 /// Get the interleaved access group that \p Instr belongs to.
1288 const InterleaveGroup<Instruction> *
1289 getInterleavedAccessGroup(Instruction *Instr) const {
1290 return InterleaveInfo.getInterleaveGroup(Instr);
1291 }
1292
  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    // If scalar epilogues are disallowed outright, one cannot be required.
    if (!isScalarEpilogueAllowed()) {
      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
      return false;
    }
    // If we might exit from anywhere but the latch and early exit vectorization
    // is disabled, we must run the exiting iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
                           "from latch block\n");
      return true;
    }
    // Interleaved accesses may also force the last iteration(s) to be scalar.
    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                           "interleaved group requires scalar epilogue\n");
      return true;
    }
    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
    return false;
  }
1316
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if tail-folding is preferred over a scalar epilogue.
  bool preferPredicatedLoop() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
           ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
  }
1328
1329 /// Returns the TailFoldingStyle that is best for the current loop.
1330 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1331 if (!ChosenTailFoldingStyle)
1332 return TailFoldingStyle::None;
1333 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1334 : ChosenTailFoldingStyle->second;
1335 }
1336
  /// Selects and saves TailFoldingStyle for 2 options - if IV update may
  /// overflow or not.
  /// \param IsScalableVF true if scalable vector factors enabled.
  /// \param UserIC User specific interleave count.
  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
    assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
    // If the tail cannot be folded at all, no style applies.
    if (!Legal->canFoldTailByMasking()) {
      ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
      return;
    }

    // Default to TTI preference, but allow command line override.
    ChosenTailFoldingStyle = {
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
    if (ForceTailFoldingStyle.getNumOccurrences())
      ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
                                ForceTailFoldingStyle.getValue()};

    // The remainder of this function only validates/overrides EVL styles.
    if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
        ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
      return;
    // Override EVL styles if needed.
    // FIXME: Investigate opportunity for fixed vector factor.
    bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
    if (EVLIsLegal)
      return;
    // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
    // if it's allowed, or DataWithoutLaneMask otherwise.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
      ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
    else
      ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
                                TailFoldingStyle::DataWithoutLaneMask};

    LLVM_DEBUG(
        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
                  "not try to generate VP Intrinsics "
               << (UserIC > 1
                       ? "since interleave count specified is greater than 1.\n"
                       : "due to non-interleaving reasons.\n"));
  }
1381
  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const {
    // TODO: check if it is possible to check for None style independent of
    // IVUpdateMayOverflow flag in getTailFoldingStyle.
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }
1388
1389 /// Returns true if the use of wide lane masks is requested and the loop is
1390 /// using tail-folding with a lane mask for control flow.
1391 bool useWideActiveLaneMask() const {
1392 if (!EnableWideActiveLaneMask)
1393 return false;
1394
1395 TailFoldingStyle TF = getTailFoldingStyle();
1396 return TF == TailFoldingStyle::DataAndControlFlow ||
1397 TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
1398 }
1399
  /// Return maximum safe number of elements to be processed per vector
  /// iteration, which do not prevent store-load forwarding and are safe with
  /// regard to the memory dependencies. Required for EVL-based VPlans to
  /// correctly calculate AVL (application vector length) as min(remaining AVL,
  /// MaxSafeElements).
  /// TODO: need to consider adjusting cost model to use this value as a
  /// vectorization factor for EVL-based vectorization.
  std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }

  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Returns true if VP intrinsics with explicit vector length support should
  /// be generated in the tail folded loop.
  bool foldTailWithEVL() const {
    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Ptr: Phi);
  }

  /// Returns the set of in-loop reduction PHIs collected by
  /// collectInLoopReductions().
  const SmallPtrSetImpl<PHINode *> &getInLoopReductions() const {
    return InLoopReductions;
  }
1431
  /// Returns true if the predicated reduction select should be used to set the
  /// incoming value for the reduction phi.
  bool usePredicatedReductionSelect() const {
    // Force to use predicated reduction select since the EVL of the
    // second-to-last iteration might not be VF*UF.
    if (foldTailWithEVL())
      return true;
    // Otherwise honor the command-line flag or the target preference.
    return PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect();
  }
1442
1443 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1444 /// with factor VF. Return the cost of the instruction, including
1445 /// scalarization overhead if it's needed.
1446 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1447
1448 /// Estimate cost of a call instruction CI if it were vectorized with factor
1449 /// VF. Return the cost of the instruction, including scalarization overhead
1450 /// if it's needed.
1451 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1452
1453 /// Invalidates decisions already taken by the cost model.
1454 void invalidateCostModelingDecisions() {
1455 WideningDecisions.clear();
1456 CallWideningDecisions.clear();
1457 Uniforms.clear();
1458 Scalars.clear();
1459 }
1460
  /// Returns the expected execution cost of the loop when vectorized with
  /// factor \p VF. The unit of the cost does not matter because we use the
  /// 'cost' units to compare different vector widths. The cost that is
  /// returned is *not* normalized by the factor width.
  InstructionCost expectedCost(ElementCount VF);

  /// Returns true if the loop contains at least one store that needs to be
  /// predicated.
  bool hasPredStores() const { return NumPredStores > 0; }
1468
  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is an additional scaling factor (the interleave count) applied to
  /// VF before comparing to EpilogueVectorizationMinVF.
  bool isEpilogueVectorizationProfitable(const ElementCount VF,
                                         const unsigned IC) const;
1476
  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);

  /// Return the cost of instructions in an inloop reduction pattern, if \p I
  /// is part of that pattern; std::nullopt otherwise.
  std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
                                                         ElementCount VF,
                                                         Type *VectorTy) const;

  /// Returns true if \p Op should be considered invariant and if it is
  /// trivially hoistable.
  bool shouldConsiderInvariant(Value *Op);

  /// Return the value of vscale used for tuning the cost model.
  std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1493
private:
  /// Number of predicated stores found in the loop; queried via
  /// hasPredStores().
  unsigned NumPredStores = 0;

  /// Used to store the value of vscale used for tuning the cost model. It is
  /// initialized during object construction.
  std::optional<unsigned> VScaleForTuning;
1500
1501 /// Initializes the value of vscale used for tuning the cost model. If
1502 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1503 /// return the value returned by the corresponding TTI method.
1504 void initializeVScaleForTuning() {
1505 const Function *Fn = TheLoop->getHeader()->getParent();
1506 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1507 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1508 auto Min = Attr.getVScaleRangeMin();
1509 auto Max = Attr.getVScaleRangeMax();
1510 if (Max && Min == Max) {
1511 VScaleForTuning = Max;
1512 return;
1513 }
1514 }
1515
1516 VScaleForTuning = TTI.getVScaleForTuning();
1517 }
1518
  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF, unsigned UserIC,
                                           bool FoldTailByMasking);

  /// If \p VF * \p UserIC > \p MaxTripCount, clamps VF to the next lower VF
  /// that results in VF * UserIC <= MaxTripCount.
  ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
                                     unsigned UserIC,
                                     bool FoldTailByMasking) const;

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF, unsigned UserIC,
                                       bool FoldTailByMasking);

  /// Checks if scalable vectorization is supported and enabled. Caches the
  /// result to avoid repeated debug dumps for repeated queries.
  bool isScalableVectorizationAllowed();

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;
1577
  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be
  /// truncated to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block, collected per vectorization factor.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// Control finally chosen tail folding style. The first element is used if
  /// the IV update may overflow, the second element - if it does not.
  std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
      ChosenTailFoldingStyle;

  /// True if scalable vectorization is supported and enabled; cached by
  /// isScalableVectorizationAllowed().
  std::optional<bool> IsScalableVectorizationAllowed;

  /// Maximum safe number of elements to be processed per vector iteration,
  /// which do not prevent store-load forwarding and are safe with regard to
  /// the memory dependencies. Required for EVL-based vectorization, where
  /// this value is used as the upper bound of the safe AVL.
  std::optional<unsigned> MaxSafeElements;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1641
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform
  /// nodes to the list if they are used by a load/store instruction that is
  /// marked as CM_Scalarize. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to
  /// an iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// Cached widening decisions and their costs, keyed by instruction and VF.
  DecisionList WideningDecisions;

  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  /// Cached widening decisions for call instructions, keyed by call and VF.
  CallDecisionList CallWideningDecisions;
1682
1683 /// Returns true if \p V is expected to be vectorized and it needs to be
1684 /// extracted.
1685 bool needsExtract(Value *V, ElementCount VF) const {
1686 Instruction *I = dyn_cast<Instruction>(Val: V);
1687 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1688 TheLoop->isLoopInvariant(V: I) ||
1689 getWideningDecision(I, VF) == CM_Scalarize ||
1690 (isa<CallInst>(Val: I) &&
1691 getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize))
1692 return false;
1693
1694 // Assume we can vectorize V (and hence we need extraction) if the
1695 // scalars are not computed yet. This can happen, because it is called
1696 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1697 // the scalars are collected. That should be a safe assumption in most
1698 // cases, because we check if the operands have vectorizable types
1699 // beforehand in LoopVectorizationLegality.
1700 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1701 };
1702
1703 /// Returns a range containing only operands needing to be extracted.
1704 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1705 ElementCount VF) const {
1706
1707 SmallPtrSet<const Value *, 4> UniqueOperands;
1708 SmallVector<Value *, 4> Res;
1709 for (Value *Op : Ops) {
1710 if (isa<Constant>(Val: Op) || !UniqueOperands.insert(Ptr: Op).second ||
1711 !needsExtract(V: Op, VF))
1712 continue;
1713 Res.push_back(Elt: Op);
1714 }
1715 return Res;
1716 }
1717
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
  /// unless necessary, e.g. when the loop isn't legal to vectorize or when
  /// there is no predication.
  std::function<BlockFrequencyInfo &()> GetBFI;
  /// The BlockFrequencyInfo returned from GetBFI.
  BlockFrequencyInfo *BFI = nullptr;
  /// Returns the BlockFrequencyInfo for the function if cached, otherwise
  /// fetches it via GetBFI. Avoids an indirect call to the std::function.
  BlockFrequencyInfo &getBFI() {
    if (!BFI)
      BFI = &GetBFI();
    return *BFI;
  }

  /// The function containing the loop under evaluation.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

  /// Whether this loop should be optimized for size based on function
  /// attribute or profile information.
  bool OptForSize;

  /// The highest VF possible for this loop, without using MaxBandwidth.
  FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
1787};
1788} // end namespace llvm
1789
1790namespace {
1791/// Helper struct to manage generating runtime checks for vectorization.
1792///
1793/// The runtime checks are created up-front in temporary blocks to allow better
1794/// estimating the cost and un-linked from the existing IR. After deciding to
1795/// vectorize, the checks are moved back. If deciding not to vectorize, the
1796/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr no SCEV checks have been generated.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr no memory runtime checks have been generated.
  Value *MemRuntimeCheckCond = nullptr;

  /// Dominator tree and loop info, updated when the temporary check blocks
  /// are split off in create() and unlinked again afterwards.
  DominatorTree *DT;
  LoopInfo *LI;

  /// Used to query the per-instruction cost of the generated checks.
  TargetTransformInfo *TTI;

  /// Separate expanders for the SCEV predicate checks and the memory runtime
  /// checks, so each set of expanded instructions can be cleaned up
  /// independently if it ends up unused.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  /// Set by create() when the number of runtime pointer checks exceeds
  /// VectorizeMemoryCheckThreshold; getCost() then reports an invalid cost.
  bool CostTooHigh = false;

  /// The loop enclosing the loop to vectorize, if any. Used by getCost() to
  /// discount memory checks that are invariant in the outer loop.
  Loop *OuterLoop = nullptr;

  PredicatedScalarEvolution &PSE;

  /// The kind of cost that we are calculating
  TTI::TargetCostKind CostKind;

public:
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    LoopInfo *LI, TargetTransformInfo *TTI,
                    TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI),
        SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        PSE(PSE), CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
              OptimizationRemarkEmitter &ORE) {

    // Hard cutoff to limit compile-time increase in case a very large number
    // of runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh) {
      // Mark runtime checks as never succeeding when they exceed the
      // threshold, and report why vectorization is abandoned.
      MemRuntimeCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      SCEVCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      ORE.emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: too many memory checks needed";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      return;
    }

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
                                  MSSAU: nullptr, BBName: "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
      if (isa<Constant>(Val: SCEVCheckCond)) {
        // Clean up directly after expanding the predicate to a constant, to
        // avoid further expansions re-using anything left over from SCEVExp.
        SCEVExpanderCleaner SCEVCleaner(SCEVExp);
        SCEVCleaner.cleanup();
      }
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // The memory check block is chained after the SCEV check block, if one
      // was created above.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
                                 BBName: "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        // Emit the cheaper pointer-difference checks when available; the
        // runtime VF is materialized lazily and reused across checks.
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
            GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
            Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    SCEVExp.eraseDeadInstructions(Root: SCEVCheckCond);

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(V: Preheader);

    // Move each check block's terminator back into the preheader and leave
    // the detached block terminated by a placeholder unreachable.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }

    // Remove the detached blocks from the analyses; they are re-added during
    // vector code generation if vectorization goes ahead.
    DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(BB: MemCheckBlock);
      LI->removeBlock(BB: MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(BB: SCEVCheckBlock);
      LI->removeBlock(BB: SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  /// Compute the total cost of the generated runtime checks by summing the
  /// per-instruction costs of both check blocks. Returns an invalid cost if
  /// the number of checks exceeded the threshold in create().
  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        // The placeholder terminator is not part of the real checks.
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        // The placeholder terminator is not part of the real checks.
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
        if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
            if (EstimatedTC->isFixed())
              BestTripCount = EstimatedTC->getFixedValue();

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
                                     b: (InstructionCost::CostType)1);

          if (BestTripCount > 1)
            LLVM_DEBUG(dbgs()
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A check block with predecessors was wired back into the function by
    // code generation, so its instructions must be kept.
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(I: &I))
          continue;
        SE.forgetValue(V: &I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }

  /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
  /// outside VPlan. Returns {nullptr, nullptr} if the check folded to
  /// known-false (i.e. it can never fail).
  std::pair<Value *, BasicBlock *> getSCEVChecks() const {
    using namespace llvm::PatternMatch;
    if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};

    return {SCEVCheckCond, SCEVCheckBlock};
  }

  /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
  /// outside VPlan. Returns {nullptr, nullptr} if the check folded to
  /// known-false (i.e. it can never fail).
  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
    using namespace llvm::PatternMatch;
    if (MemRuntimeCheckCond && match(V: MemRuntimeCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};
    return {MemRuntimeCheckCond, MemCheckBlock};
  }

  /// Return true if any runtime checks have been added
  bool hasChecks() const {
    return getSCEVChecks().first || getMemRuntimeChecks().first;
  }
};
2090} // namespace
2091
2092static bool useActiveLaneMask(TailFoldingStyle Style) {
2093 return Style == TailFoldingStyle::Data ||
2094 Style == TailFoldingStyle::DataAndControlFlow ||
2095 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2096}
2097
2098static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2099 return Style == TailFoldingStyle::DataAndControlFlow ||
2100 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2101}
2102
2103// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2104// vectorization. The loop needs to be annotated with #pragma omp simd
2105// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2106// vector length information is not provided, vectorization is not considered
2107// explicit. Interleave hints are not allowed either. These limitations will be
2108// relaxed in the future.
2109// Please, note that we are currently forced to abuse the pragma 'clang
2110// vectorize' semantics. This pragma provides *auto-vectorization hints*
2111// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2112// provides *explicit vectorization hints* (LV can bypass legal checks and
2113// assume that vectorization is legal). However, both hints are implemented
2114// using the same metadata (llvm.loop.vectorize, processed by
2115// LoopVectorizeHints). This will be fixed in the future when the native IR
2116// representation for pragma 'omp simd' is introduced.
static bool isExplicitVecOuterLoop(Loop *OuterLp,
                                   OptimizationRemarkEmitter *ORE) {
  assert(!OuterLp->isInnermost() && "This is not an outer loop");
  LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);

  // Only outer loops with an explicit vectorization hint are supported.
  // Unannotated outer loops are ignored.
  if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
    return false;

  // The hints may still forbid vectorization (e.g. width forced to 1).
  Function *Fn = OuterLp->getHeader()->getParent();
  if (!Hints.allowVectorization(F: Fn, L: OuterLp,
                                VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
    return false;
  }

  if (Hints.getInterleave() > 1) {
    // TODO: Interleave support is future work.
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
                         "outer loops.\n");
    // Tell the user why the loop was rejected.
    Hints.emitRemarkWithHints();
    return false;
  }

  return true;
}
2144
/// Recursively collect into \p V the loops in the nest rooted at \p L that
/// the vectorizer should consider: innermost loops and, in the VPlan-native
/// path (or under stress testing), outer loops with explicit vectorization
/// hints. Loops with irreducible control flow are skipped.
static void collectSupportedLoops(Loop &L, LoopInfo *LI,
                                  OptimizationRemarkEmitter *ORE,
                                  SmallVectorImpl<Loop *> &V) {
  // Collect inner loops and outer loops without irreducible control flow. For
  // now, only collect outer loops that have explicit vectorization hints. If
  // we are stress testing the VPlan H-CFG construction, we collect the
  // outermost loop of every loop nest.
  if (L.isInnermost() || VPlanBuildStressTest ||
      (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
    LoopBlocksRPO RPOT(&L);
    RPOT.perform(LI);
    if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
      V.push_back(Elt: &L);
      // TODO: Collect inner loops inside marked outer loops in case
      // vectorization fails for the outer loop. Do not invoke
      // 'containsIrreducibleCFG' again for inner loops when the outer loop is
      // already known to be reducible. We can use an inherited attribute for
      // that.
      return;
    }
  }
  // Recurse into the immediate subloops of L.
  for (Loop *InnerL : L)
    collectSupportedLoops(L&: *InnerL, LI, ORE, V);
}
2169
2170//===----------------------------------------------------------------------===//
2171// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2172// LoopVectorizationCostModel and LoopVectorizationPlanner.
2173//===----------------------------------------------------------------------===//
2174
/// Compute the value of the induction described by \p InductionKind at
/// iteration \p Index: i.e. \p StartValue advanced by \p Index steps of
/// \p Step. For FP inductions, \p InductionBinOp supplies the FAdd/FSub
/// opcode. Returns nullptr for IK_NoInduction.
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
Value *
llvm::emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                           Value *Step,
                           InductionDescriptor::InductionKind InductionKind,
                           const BinaryOperator *InductionBinOp) {
  using namespace llvm::PatternMatch;
  // Bring Index to the step's type: integer steps get a sext/trunc, FP steps
  // a signed int-to-FP conversion.
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (match(V: X, P: m_ZeroInt()))
      return Y;
    if (match(V: Y, P: m_ZeroInt()))
      return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (match(V: X, P: m_One()))
      return Y;
    if (match(V: Y, P: m_One()))
      return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step of -1: Start - Index avoids the multiply entirely.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    // Pointer inductions advance by Index * Step bytes from StartValue.
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    // Start +/- (Step * Index), preserving the original FAdd/FSub opcode.
    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2253
2254static std::optional<unsigned> getMaxVScale(const Function &F,
2255 const TargetTransformInfo &TTI) {
2256 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2257 return MaxVScale;
2258
2259 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2260 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2261
2262 return std::nullopt;
2263}
2264
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  // The widest induction type bounds the range of the vector loop IV.
  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = IdxTy->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // For scalable VFs, scale by the largest possible vscale; without a
      // bound on vscale we must conservatively assume overflow is possible.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // Headroom (UMax - TC) must strictly exceed the step VF * UF.
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2296
2297// Return whether we allow using masked interleave-groups (for dealing with
2298// strided loads/stores that reside in predicated blocks, or for dealing
2299// with gaps).
2300static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2301 // If an override option has been passed in for interleaved accesses, use it.
2302 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2303 return EnableMaskedInterleavedMemAccesses;
2304
2305 return TTI.enableMaskedInterleavedAccessVectorization();
2306}
2307
// Wrap the IR basic block \p CheckIRBB in a VPIRBasicBlock and splice it into
// the VPlan CFG between the vector preheader's predecessor and the vector
// preheader, also branching to the scalar preheader, then patch up the scalar
// preheader's phis for the new incoming edge.
void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan(
    BasicBlock *CheckIRBB) {
  // Note: The block with the minimum trip-count check is already connected
  // during earlier VPlan construction.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
  assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
  assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
  VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB);
  // Insert the check block on the PreVectorPH -> VectorPHVPBB edge, then give
  // it a second successor (the scalar preheader). swapSuccessors restores the
  // (ScalarPH first) successor order asserted above.
  VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPBB, BlockPtr: CheckVPIRBB);
  PreVectorPH = CheckVPIRBB;
  VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH);
  PreVectorPH->swapSuccessors();

  // We just connected a new block to the scalar preheader. Update all
  // VPPhis by adding an incoming value for it, replicating the last value.
  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
  for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) {
    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
           "must have incoming values for all operands");
    R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2));
  }
}
2332
// Emit (into \p VectorPH) and return the boolean condition guarding entry to
// the vector loop: true when the scalar loop must be taken instead. The
// condition may fold to a constant when SCEV can prove the comparison.
Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
    BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
                                                       : ICmpInst::ICMP_ULT;

  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = VectorPH;
  // InstSimplifyFolder lets trivially-foldable IR collapse as it is built.
  IRBuilder<InstSimplifyFolder> Builder(
      TCCheckBlock->getContext(),
      InstSimplifyFolder(TCCheckBlock->getDataLayout()));
  Builder.SetInsertPoint(TCCheckBlock->getTerminator());

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *Count = getTripCount();
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);

    Value *MinProfTC =
        Builder.CreateElementCount(Ty: CountTy, EC: MinProfitableTripCount);
    if (!VF.isScalable())
      return MinProfTC;
    // Scalable VF: UF * VF is only known at runtime, so take the runtime max.
    return Builder.CreateBinaryIntrinsic(
        ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
    } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P),
                                    LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
             !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
             Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
    Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
  }
  return CheckMinIters;
}
2403
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
/// predecessors and successors of VPBB, if any, are rewired to the new
/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
                                             BasicBlock *IRBB,
                                             VPlan *Plan = nullptr) {
  // An unreachable VPBB cannot reach its plan; the caller supplies it then.
  if (!Plan)
    Plan = VPBB->getPlan();
  VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
  // Move phis first, before any non-phi recipes the new block may carry, so
  // the phi-section invariant of the block is preserved.
  auto IP = IRVPBB->begin();
  for (auto &R : make_early_inc_range(Range: VPBB->phis()))
    R.moveBefore(BB&: *IRVPBB, I: IP);

  // Then append the remaining (non-phi) recipes at the end.
  for (auto &R :
       make_early_inc_range(Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end())))
    R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());

  VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  return IRVPBB;
}
2426
// Split a new scalar-preheader block (named \p Prefix + "scalar.ph") off the
// original loop preheader, updating DT and LI, and return it. The block
// remaining before the split becomes the vector preheader.
BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  assert(VectorPH && "Invalid loop structure");
  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
         "loops not exiting via the latch without required epilogue?");

  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
  // wrapping the newly created scalar preheader here at the moment, because the
  // Plan's scalar preheader may be unreachable at this point. Instead it is
  // replaced in executePlan.
  return SplitBlock(Old: VectorPH, SplitPt: VectorPH->getTerminator(), DT, LI, MSSAU: nullptr,
                    BBName: Twine(Prefix) + "scalar.ph");
}
2441
2442/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2443/// expansion results.
2444static Value *getExpandedStep(const InductionDescriptor &ID,
2445 const SCEV2ValueTy &ExpandedSCEVs) {
2446 const SCEV *Step = ID.getStep();
2447 if (auto *C = dyn_cast<SCEVConstant>(Val: Step))
2448 return C->getValue();
2449 if (auto *U = dyn_cast<SCEVUnknown>(Val: Step))
2450 return U->getValue();
2451 Value *V = ExpandedSCEVs.lookup(Val: Step);
2452 assert(V && "SCEV must be expanded at this point");
2453 return V;
2454}
2455
/// Knowing that loop \p L executes a single vector iteration, add instructions
/// that will get simplified and thus should not have any cost to \p
/// InstsToIgnore.
static void addFullyUnrolledInstructionsToIgnore(
    Loop *L, const LoopVectorizationLegality::InductionList &IL,
    SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
  // The latch compare feeds the (removed) backedge and folds away.
  auto *Cmp = L->getLatchCmpInst();
  if (Cmp)
    InstsToIgnore.insert(Ptr: Cmp);
  for (const auto &KV : IL) {
    // Extract the key by hand so that it can be used in the lambda below. Note
    // that captured structured bindings are a C++20 extension.
    const PHINode *IV = KV.first;

    // Get next iteration value of the induction variable.
    Instruction *IVInst =
        cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
    // The IV update is free only if it is consumed solely by the IV phi and
    // the latch compare (both of which disappear with the backedge).
    if (all_of(Range: IVInst->users(),
               P: [&](const User *U) { return U == IV || U == Cmp; }))
      InstsToIgnore.insert(Ptr: IVInst);
  }
}
2478
2479BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2480 // Create a new IR basic block for the scalar preheader.
2481 BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
2482 return ScalarPH->getSinglePredecessor();
2483}
2484
2485namespace {
2486
/// DenseMapInfo used by legacyCSE to key instructions by structural identity:
/// two handled instructions hash/compare equal iff they are identical
/// (same opcode and operands), enabling simple value numbering.
struct CSEDenseMapInfo {
  // Only these side-effect-free, address/shuffle-style instructions are CSE'd.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
           isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  // Hash on opcode and operand pointers, consistent with isIdenticalTo below.
  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(args: I->getOpcode(),
                        args: hash_combine_range(R: I->operand_values()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys must only compare equal to themselves; never dereference.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(I: RHS);
  }
};
2514
2515} // end anonymous namespace
2516
2517/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2518/// removal, in favor of the VPlan-based one.
2519static void legacyCSE(BasicBlock *BB) {
2520 // Perform simple cse.
2521 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2522 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2523 if (!CSEDenseMapInfo::canHandle(I: &In))
2524 continue;
2525
2526 // Check if we can replace this instruction with any of the
2527 // visited instructions.
2528 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2529 In.replaceAllUsesWith(V);
2530 In.eraseFromParent();
2531 continue;
2532 }
2533
2534 CSEMap[&In] = &In;
2535 }
2536}
2537
2538/// This function attempts to return a value that represents the ElementCount
2539/// at runtime. For fixed-width VFs we know this precisely at compile
2540/// time, but for scalable VFs we calculate it based on an estimate of the
2541/// vscale value.
2542static unsigned estimateElementCount(ElementCount VF,
2543 std::optional<unsigned> VScale) {
2544 unsigned EstimatedVF = VF.getKnownMinValue();
2545 if (VF.isScalable())
2546 if (VScale)
2547 EstimatedVF *= *VScale;
2548 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2549 return EstimatedVF;
2550}
2551
// Return the cost of call \p CI at vectorization factor \p VF. Vector VFs use
// the pre-computed widening decision; scalar VFs are costed here directly.
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return getCallWideningDecision(CI, VF).Cost;

  Type *RetTy = CI->getType();
  // fmuladd feeding a reduction may be covered by a cheaper fused pattern.
  if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
    if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
      return *RedCost;

  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(Elt: ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(a: ScalarCallCost, b: IntrinsicCost);
  }
  return ScalarCallCost;
}
2579
2580static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2581 if (VF.isScalar() || !canVectorizeTy(Ty))
2582 return Ty;
2583 return toVectorizedTy(Ty, EC: VF);
2584}
2585
// Cost the vector form of intrinsic call \p CI at factor \p VF by widening
// its return and parameter types and querying TTI.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
  // Fast-math flags can reduce the reported cost for FP intrinsics.
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  // Widen each declared parameter type to its vectorized counterpart.
  SmallVector<Type *> ParamTys;
  std::transform(first: FTy->param_begin(), last: FTy->param_end(),
                 result: std::back_inserter(x&: ParamTys),
                 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(Val: CI),
                                    InstructionCost::getInvalid(), TLI);
  return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
}
2608
// Post-process the generated vector code: wire up widened non-induction phis
// and clean up redundant instructions in the vector loop header.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  fixNonInductionPHIs(State);

  // Don't apply optimizations below when no (vector) loop remains, as they all
  // require one at the moment.
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
  if (!HeaderVPBB)
    return;

  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];

  // Remove redundant induction instructions.
  legacyCSE(BB: HeaderBB);
}
2625
// Fill in the incoming values of IR phis created for VPWidenPHIRecipes, which
// are left operand-less during initial code generation because their inputs
// may not exist yet.
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      // Translate each (VPValue, VPBB) incoming pair to its IR counterpart.
      for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
        NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
    }
  }
}
2641
// Compute Scalars[VF]: the set of instructions that will remain scalar (one
// copy per lane, or per part) when vectorizing with factor \p VF. Seeds the
// set from uniforms, scalar-used pointers and forced scalars, then grows it
// to a fixed point over GEP chains and induction variables.
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert_range(R&: Uniforms[VF]);
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a getelementptr
  // instruction contained in the loop.
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert_range(R&: Uniforms[VF]);

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar unless the operation is a gather or scatter.
  // The value operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // A pointer is scalar only if *every* observed use was scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    // Src is scalar if every in-loop user is already scalar or a scalar
    // memory use of Src itself.
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  IsScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert_range(R&: Worklist);
}
2830
// Return true if \p I needs predication and has no non-scalar (masked/widened)
// lowering at factor \p VF, i.e. it must be scalarized and predicated.
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                         ElementCount VF) {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    if (VF.isScalar())
      return true;
    // Calls are scalar-with-predication only when the widening decision
    // chose scalarization.
    return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    // Memory ops avoid scalarization when the target supports a masked
    // contiguous access or a masked gather/scatter for this type.
    auto *Ptr = getLoadStorePointerOperand(V: I);
    auto *Ty = getLoadStoreType(I);
    unsigned AS = getLoadStoreAddressSpace(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(ElementType: Ty, EC: VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                                TTI.isLegalMaskedGather(DataType: VTy, Alignment))
                            : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                                TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
2871
2872// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
// Return true if \p I must execute under a mask in the vectorized loop,
// either because its block was conditionally executed in the scalar loop or
// because tail-folding introduces inactive lanes whose side-effects matter.
// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // TODO: We can use the loop-preheader as context point here and get
  // context sensitive reasoning for isSafeToSpeculativelyExecute.
  // Speculatable instructions, memory/call ops that need no mask, and pure
  // control/placeholder instructions never require predication.
  if (isSafeToSpeculativelyExecute(I) ||
      (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) ||
      isa<BranchInst, SwitchInst, PHINode, AllocaInst>(Val: I))
    return false;

  // If the instruction was executed conditionally in the original scalar loop,
  // predication is needed with a mask whose lanes are all possibly inactive.
  if (Legal->blockNeedsPredication(BB: I->getParent()))
    return true;

  // If we're not folding the tail by masking, predication is unnecessary.
  if (!foldTailByMasking())
    return false;

  // All that remain are instructions with side-effects originally executed in
  // the loop unconditionally, but now execute under a tail-fold mask (only)
  // having at least one active lane (the first). If the side-effects of the
  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
  // - it will cause the same side-effects as when masked.
  switch(I->getOpcode()) {
  default:
    llvm_unreachable(
        "instruction should have been considered by earlier checks");
  case Instruction::Call:
    // Side-effects of a Call are assumed to be non-invariant, needing a
    // (fold-tail) mask.
    assert(Legal->isMaskRequired(I) &&
           "should have returned earlier for calls not needing a mask");
    return true;
  case Instruction::Load:
    // If the address is loop invariant no predication is needed.
    return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
  case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads), but also must prove the value being stored
    // is correct. The easiest form of the later is to require that all values
    // stored are the same.
    return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
             TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
  }
  case Instruction::UDiv:
  case Instruction::URem:
    // If the divisor is loop-invariant no predication is needed.
    return !Legal->isInvariant(V: I->getOperand(i: 1));
  case Instruction::SDiv:
  case Instruction::SRem:
    // Conservative for now, since masked-off lanes may be poison and could
    // trigger signed overflow.
    return true;
  }
}
2927
// Return the factor by which the cost of a predicated block \p BB should be
// divided, based on how much less frequently it executes than the loop
// header (per block-frequency info). Returns 1 for code-size costing or for
// blocks that were not originally predicated.
uint64_t LoopVectorizationCostModel::getPredBlockCostDivisor(
    TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
  if (CostKind == TTI::TCK_CodeSize)
    return 1;
  // If the block wasn't originally predicated then return early to avoid
  // computing BlockFrequencyInfo unnecessarily.
  if (!Legal->blockNeedsPredication(BB))
    return 1;

  uint64_t HeaderFreq =
      getBFI().getBlockFreq(BB: TheLoop->getHeader()).getFrequency();
  uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
  assert(HeaderFreq >= BBFreq &&
         "Header has smaller block freq than dominated BB?");
  // Rounded ratio of header executions per BB execution.
  return std::round(x: (double)HeaderFreq / BBFreq);
}
2944
2945std::pair<InstructionCost, InstructionCost>
2946LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
2947 ElementCount VF) {
2948 assert(I->getOpcode() == Instruction::UDiv ||
2949 I->getOpcode() == Instruction::SDiv ||
2950 I->getOpcode() == Instruction::SRem ||
2951 I->getOpcode() == Instruction::URem);
2952 assert(!isSafeToSpeculativelyExecute(I));
2953
2954 // Scalarization isn't legal for scalable vector types
2955 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2956 if (!VF.isScalable()) {
2957 // Get the scalarization cost and scale this amount by the probability of
2958 // executing the predicated block. If the instruction is not predicated,
2959 // we fall through to the next case.
2960 ScalarizationCost = 0;
2961
2962 // These instructions have a non-void type, so account for the phi nodes
2963 // that we will create. This cost is likely to be zero. The phi node
2964 // cost, if any, should be scaled by the block probability because it
2965 // models a copy at the end of each predicated block.
2966 ScalarizationCost +=
2967 VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
2968
2969 // The cost of the non-predicated instruction.
2970 ScalarizationCost +=
2971 VF.getFixedValue() *
2972 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);
2973
2974 // The cost of insertelement and extractelement instructions needed for
2975 // scalarization.
2976 ScalarizationCost += getScalarizationOverhead(I, VF);
2977
2978 // Scale the cost by the probability of executing the predicated blocks.
2979 // This assumes the predicated block for each vector lane is equally
2980 // likely.
2981 ScalarizationCost =
2982 ScalarizationCost / getPredBlockCostDivisor(CostKind, BB: I->getParent());
2983 }
2984
2985 InstructionCost SafeDivisorCost = 0;
2986 auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);
2987 // The cost of the select guard to ensure all lanes are well defined
2988 // after we speculate above any internal control flow.
2989 SafeDivisorCost +=
2990 TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
2991 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
2992 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
2993
2994 SmallVector<const Value *, 4> Operands(I->operand_values());
2995 SafeDivisorCost += TTI.getArithmeticInstrCost(
2996 Opcode: I->getOpcode(), Ty: VecTy, CostKind,
2997 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
2998 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
2999 Args: Operands, CxtI: I);
3000 return {ScalarizationCost, SafeDivisorCost};
3001}
3002
3003bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3004 Instruction *I, ElementCount VF) const {
3005 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3006 assert(getWideningDecision(I, VF) == CM_Unknown &&
3007 "Decision should not be set yet.");
3008 auto *Group = getInterleavedAccessGroup(Instr: I);
3009 assert(Group && "Must have a group.");
3010 unsigned InterleaveFactor = Group->getFactor();
3011
3012 // If the instruction's allocated size doesn't equal its type size, it
3013 // requires padding and will be scalarized.
3014 auto &DL = I->getDataLayout();
3015 auto *ScalarTy = getLoadStoreType(I);
3016 if (hasIrregularType(Ty: ScalarTy, DL))
3017 return false;
3018
3019 // For scalable vectors, the interleave factors must be <= 8 since we require
3020 // the (de)interleaveN intrinsics instead of shufflevectors.
3021 if (VF.isScalable() && InterleaveFactor > 8)
3022 return false;
3023
3024 // If the group involves a non-integral pointer, we may not be able to
3025 // losslessly cast all values to a common type.
3026 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
3027 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3028 Instruction *Member = Group->getMember(Index: Idx);
3029 if (!Member)
3030 continue;
3031 auto *MemberTy = getLoadStoreType(I: Member);
3032 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
3033 // Don't coerce non-integral pointers to integers or vice versa.
3034 if (MemberNI != ScalarNI)
3035 // TODO: Consider adding special nullptr value case here
3036 return false;
3037 if (MemberNI && ScalarNI &&
3038 ScalarTy->getPointerAddressSpace() !=
3039 MemberTy->getPointerAddressSpace())
3040 return false;
3041 }
3042
3043 // Check if masking is required.
3044 // A Group may need masking for one of two reasons: it resides in a block that
3045 // needs predication, or it was decided to use masking to deal with gaps
3046 // (either a gap at the end of a load-access that may result in a speculative
3047 // load, or any gaps in a store-access).
3048 bool PredicatedAccessRequiresMasking =
3049 blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
3050 Legal->isMaskRequired(I);
3051 bool LoadAccessWithGapsRequiresEpilogMasking =
3052 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
3053 !isScalarEpilogueAllowed();
3054 bool StoreAccessWithGapsRequiresMasking =
3055 isa<StoreInst>(Val: I) && !Group->isFull();
3056 if (!PredicatedAccessRequiresMasking &&
3057 !LoadAccessWithGapsRequiresEpilogMasking &&
3058 !StoreAccessWithGapsRequiresMasking)
3059 return true;
3060
3061 // If masked interleaving is required, we expect that the user/target had
3062 // enabled it, because otherwise it either wouldn't have been created or
3063 // it should have been invalidated by the CostModel.
3064 assert(useMaskedInterleavedAccesses(TTI) &&
3065 "Masked interleave-groups for predicated accesses are not enabled.");
3066
3067 if (Group->isReverse())
3068 return false;
3069
3070 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
3071 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
3072 StoreAccessWithGapsRequiresMasking;
3073 if (VF.isScalable() && NeedsMaskForGaps)
3074 return false;
3075
3076 auto *Ty = getLoadStoreType(I);
3077 const Align Alignment = getLoadStoreAlignment(I);
3078 unsigned AS = getLoadStoreAddressSpace(I);
3079 return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
3080 : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
3081}
3082
3083bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3084 Instruction *I, ElementCount VF) {
3085 // Get and ensure we have a valid memory instruction.
3086 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3087
3088 auto *Ptr = getLoadStorePointerOperand(V: I);
3089 auto *ScalarTy = getLoadStoreType(I);
3090
3091 // In order to be widened, the pointer should be consecutive, first of all.
3092 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3093 return false;
3094
3095 // If the instruction is a store located in a predicated block, it will be
3096 // scalarized.
3097 if (isScalarWithPredication(I, VF))
3098 return false;
3099
3100 // If the instruction's allocated size doesn't equal it's type size, it
3101 // requires padding and will be scalarized.
3102 auto &DL = I->getDataLayout();
3103 if (hasIrregularType(Ty: ScalarTy, DL))
3104 return false;
3105
3106 return true;
3107}
3108
// Compute the set of instructions that remain uniform (i.e. only demand lane
// 0 of the widened iteration) after vectorizing with the given VF, and cache
// the result in Uniforms[VF]. Seeds the set with known-uniform roots, then
// grows it by a fixed-point expansion over operands, handling induction
// variables (which form cycles through phis) as a final special case.
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again.  Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  SmallVector<BasicBlock *> Exiting;
  TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
    if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  // PrevVF is the next-smaller candidate VF; per the comment inside the
  // lambda below, non-uniformity at the smaller VF implies non-uniformity
  // at this VF, so cached results for PrevVF can prune the analysis.
  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  // Returns true if the memory access I will be executed as a single wide
  // (or interleaved) operation, based on the already-computed widening
  // decision for this VF.
  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // Side-effect-style intrinsics with invariant operands stay uniform:
      // one call covers all lanes.
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // If the pointer can be proven to be uniform, always add it to the
      // worklist.
      if (isa<Instruction>(Val: Ptr) && Legal->isUniform(V: Ptr, VF))
        AddToWorklistIfAllowed(cast<Instruction>(Val: Ptr));

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
      auto *UI = cast<Instruction>(Val: U);
      return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  // Commit the final uniform set for this VF.
  Uniforms[VF].insert_range(R&: Worklist);
}
3348
3349bool LoopVectorizationCostModel::runtimeChecksRequired() {
3350 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3351
3352 if (Legal->getRuntimePointerChecking()->Need) {
3353 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3354 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3355 "loop with '#pragma clang loop vectorize(enable)' when "
3356 "compiling with -Os/-Oz",
3357 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3358 return true;
3359 }
3360
3361 if (!PSE.getPredicate().isAlwaysTrue()) {
3362 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3363 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3364 "loop with '#pragma clang loop vectorize(enable)' when "
3365 "compiling with -Os/-Oz",
3366 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3367 return true;
3368 }
3369
3370 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3371 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3372 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3373 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3374 "this loop without such check by compiling with -Os/-Oz",
3375 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3376 return true;
3377 }
3378
3379 return false;
3380}
3381
3382bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3383 if (IsScalableVectorizationAllowed)
3384 return *IsScalableVectorizationAllowed;
3385
3386 IsScalableVectorizationAllowed = false;
3387 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3388 return false;
3389
3390 if (Hints->isScalableVectorizationDisabled()) {
3391 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
3392 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
3393 return false;
3394 }
3395
3396 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3397
3398 auto MaxScalableVF = ElementCount::getScalable(
3399 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3400
3401 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3402 // FIXME: While for scalable vectors this is currently sufficient, this should
3403 // be replaced by a more detailed mechanism that filters out specific VFs,
3404 // instead of invalidating vectorization for a whole set of VFs based on the
3405 // MaxVF.
3406
3407 // Disable scalable vectorization if the loop contains unsupported reductions.
3408 if (!canVectorizeReductions(VF: MaxScalableVF)) {
3409 reportVectorizationInfo(
3410 Msg: "Scalable vectorization not supported for the reduction "
3411 "operations found in this loop.",
3412 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3413 return false;
3414 }
3415
3416 // Disable scalable vectorization if the loop contains any instructions
3417 // with element types not supported for scalable vectors.
3418 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
3419 return !Ty->isVoidTy() &&
3420 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3421 })) {
3422 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
3423 "for all element types found in this loop.",
3424 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3425 return false;
3426 }
3427
3428 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
3429 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
3430 "for safe distance analysis.",
3431 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3432 return false;
3433 }
3434
3435 IsScalableVectorizationAllowed = true;
3436 return true;
3437}
3438
3439ElementCount
3440LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3441 if (!isScalableVectorizationAllowed())
3442 return ElementCount::getScalable(MinVal: 0);
3443
3444 auto MaxScalableVF = ElementCount::getScalable(
3445 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3446 if (Legal->isSafeForAnyVectorWidth())
3447 return MaxScalableVF;
3448
3449 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3450 // Limit MaxScalableVF by the maximum safe dependence distance.
3451 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
3452
3453 if (!MaxScalableVF)
3454 reportVectorizationInfo(
3455 Msg: "Max legal vector width too small, scalable vectorization "
3456 "unfeasible.",
3457 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3458
3459 return MaxScalableVF;
3460}
3461
// Compute the maximum feasible fixed and scalable VFs for this loop, given
// an optional user-requested VF/IC and whether the tail will be folded by
// masking. Honors a safe UserVF outright; an unsafe fixed UserVF is clamped
// to the maximum safe VF, while an unsafe/unsupported scalable UserVF is
// ignored in favor of the compiler's own choice.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
    bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElementsPowerOf2 =
      bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
  // Store-to-load forwarding distances may restrict the safe width further.
  if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
    MaxSafeElementsPowerOf2 =
        std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
  }
  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);

  // Record the element limit only when the safe width is actually restricted.
  if (!Legal->isSafeForAnyVectorWidth())
    this->MaxSafeElements = MaxSafeElementsPowerOf2;

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // An unsafe scalable UserVF is only remarked upon (not returned); control
    // falls through to the target-maximization logic below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Maximize both candidate VFs independently for the target; the scalable
  // result is kept only if it is genuinely scalable.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, UserIC, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, UserIC, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
3574
3575FixedScalableVFPair
3576LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3577 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3578 // TODO: It may be useful to do since it's still likely to be dynamically
3579 // uniform if the target can skip.
3580 reportVectorizationFailure(
3581 DebugMsg: "Not inserting runtime ptr check for divergent target",
3582 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3583 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3584 return FixedScalableVFPair::getNone();
3585 }
3586
3587 ScalarEvolution *SE = PSE.getSE();
3588 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3589 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3590 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3591 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3592 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3593 if (TC.isScalar()) {
3594 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3595 OREMsg: "loop trip count is one, irrelevant for vectorization",
3596 ORETag: "SingleIterationLoop", ORE, TheLoop);
3597 return FixedScalableVFPair::getNone();
3598 }
3599
3600 // If BTC matches the widest induction type and is -1 then the trip count
3601 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3602 // to vectorize.
3603 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3604 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3605 BTC->getType()->getScalarSizeInBits() >=
3606 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3607 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3608 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3609 reportVectorizationFailure(
3610 DebugMsg: "Trip count computation wrapped",
3611 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3612 ORETag: "TripCountWrapped", ORE, TheLoop);
3613 return FixedScalableVFPair::getNone();
3614 }
3615
3616 switch (ScalarEpilogueStatus) {
3617 case CM_ScalarEpilogueAllowed:
3618 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: false);
3619 case CM_ScalarEpilogueNotAllowedUsePredicate:
3620 [[fallthrough]];
3621 case CM_ScalarEpilogueNotNeededUsePredicate:
3622 LLVM_DEBUG(
3623 dbgs() << "LV: vector predicate hint/switch found.\n"
3624 << "LV: Not allowing scalar epilogue, creating predicated "
3625 << "vector loop.\n");
3626 break;
3627 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3628 // fallthrough as a special case of OptForSize
3629 case CM_ScalarEpilogueNotAllowedOptSize:
3630 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3631 LLVM_DEBUG(
3632 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3633 else
3634 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3635 << "count.\n");
3636
3637 // Bail if runtime checks are required, which are not good when optimising
3638 // for size.
3639 if (runtimeChecksRequired())
3640 return FixedScalableVFPair::getNone();
3641
3642 break;
3643 }
3644
3645 // Now try the tail folding
3646
3647 // Invalidate interleave groups that require an epilogue if we can't mask
3648 // the interleave-group.
3649 if (!useMaskedInterleavedAccesses(TTI)) {
3650 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3651 "No decisions should have been taken at this point");
3652 // Note: There is no need to invalidate any cost modeling decisions here, as
3653 // none were taken so far.
3654 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3655 }
3656
3657 FixedScalableVFPair MaxFactors =
3658 computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: true);
3659
3660 // Avoid tail folding if the trip count is known to be a multiple of any VF
3661 // we choose.
3662 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3663 MaxFactors.FixedVF.getFixedValue();
3664 if (MaxFactors.ScalableVF) {
3665 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3666 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3667 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3668 a: *MaxPowerOf2RuntimeVF,
3669 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3670 } else
3671 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3672 }
3673
3674 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3675 // Return false if the loop is neither a single-latch-exit loop nor an
3676 // early-exit loop as tail-folding is not supported in that case.
3677 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3678 !Legal->hasUncountableEarlyExit())
3679 return false;
3680 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3681 ScalarEvolution *SE = PSE.getSE();
3682 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3683 // with uncountable exits. For countable loops, the symbolic maximum must
3684 // remain identical to the known back-edge taken count.
3685 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3686 assert((Legal->hasUncountableEarlyExit() ||
3687 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3688 "Invalid loop count");
3689 const SCEV *ExitCount = SE->getAddExpr(
3690 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3691 const SCEV *Rem = SE->getURemExpr(
3692 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3693 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3694 return Rem->isZero();
3695 };
3696
3697 if (MaxPowerOf2RuntimeVF > 0u) {
3698 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3699 "MaxFixedVF must be a power of 2");
3700 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3701 // Accept MaxFixedVF if we do not have a tail.
3702 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3703 return MaxFactors;
3704 }
3705 }
3706
3707 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3708 if (ExpectedTC && ExpectedTC->isFixed() &&
3709 ExpectedTC->getFixedValue() <=
3710 TTI.getMinTripCountTailFoldingThreshold()) {
3711 if (MaxPowerOf2RuntimeVF > 0u) {
3712 // If we have a low-trip-count, and the fixed-width VF is known to divide
3713 // the trip count but the scalable factor does not, use the fixed-width
3714 // factor in preference to allow the generation of a non-predicated loop.
3715 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3716 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3717 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3718 "remain for any chosen VF.\n");
3719 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3720 return MaxFactors;
3721 }
3722 }
3723
3724 reportVectorizationFailure(
3725 DebugMsg: "The trip count is below the minial threshold value.",
3726 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3727 ORE, TheLoop);
3728 return FixedScalableVFPair::getNone();
3729 }
3730
3731 // If we don't know the precise trip count, or if the trip count that we
3732 // found modulo the vectorization factor is not zero, try to fold the tail
3733 // by masking.
3734 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3735 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3736 setTailFoldingStyles(IsScalableVF: ContainsScalableVF, UserIC);
3737 if (foldTailByMasking()) {
3738 if (foldTailWithEVL()) {
3739 LLVM_DEBUG(
3740 dbgs()
3741 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3742 "try to generate VP Intrinsics with scalable vector "
3743 "factors only.\n");
3744 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3745 // for now.
3746 // TODO: extend it for fixed vectors, if required.
3747 assert(ContainsScalableVF && "Expected scalable vector factor.");
3748
3749 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3750 }
3751 return MaxFactors;
3752 }
3753
3754 // If there was a tail-folding hint/switch, but we can't fold the tail by
3755 // masking, fallback to a vectorization with a scalar epilogue.
3756 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3757 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3758 "scalar epilogue instead.\n");
3759 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3760 return MaxFactors;
3761 }
3762
3763 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3764 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3765 return FixedScalableVFPair::getNone();
3766 }
3767
3768 if (TC.isZero()) {
3769 reportVectorizationFailure(
3770 DebugMsg: "unable to calculate the loop count due to complex control flow",
3771 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3772 return FixedScalableVFPair::getNone();
3773 }
3774
3775 reportVectorizationFailure(
3776 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3777 OREMsg: "cannot optimize for size and vectorize at the same time. "
3778 "Enable vectorization of this loop with '#pragma clang loop "
3779 "vectorize(enable)' when compiling with -Os/-Oz",
3780 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3781 return FixedScalableVFPair::getNone();
3782}
3783
3784bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3785 ElementCount VF) {
3786 if (ConsiderRegPressure.getNumOccurrences())
3787 return ConsiderRegPressure;
3788
3789 // TODO: We should eventually consider register pressure for all targets. The
3790 // TTI hook is temporary whilst target-specific issues are being fixed.
3791 if (TTI.shouldConsiderVectorizationRegPressure())
3792 return true;
3793
3794 if (!useMaxBandwidth(RegKind: VF.isScalable()
3795 ? TargetTransformInfo::RGK_ScalableVector
3796 : TargetTransformInfo::RGK_FixedWidthVector))
3797 return false;
3798 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3799 return ElementCount::isKnownGT(
3800 LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3801 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3802}
3803
3804bool LoopVectorizationCostModel::useMaxBandwidth(
3805 TargetTransformInfo::RegisterKind RegKind) {
3806 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3807 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3808 (UseWiderVFIfCallVariantsPresent &&
3809 Legal->hasVectorCallVariants())));
3810}
3811
// Clamp \p VF so that VF * IC does not exceed the known maximum trip count;
// returns VF unchanged when no clamping is needed or possible.
ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
    ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
    bool FoldTailByMasking) const {
  // Estimate the runtime lane count for VF. For scalable VFs, scale the known
  // minimum by the function's minimum vscale when a vscale_range is present.
  unsigned EstimatedVF = VF.getKnownMinValue();
  if (VF.isScalable() && TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
    auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
    auto Min = Attr.getVScaleRangeMin();
    EstimatedVF *= Min;
  }

  // When a scalar epilogue is required, at least one iteration of the scalar
  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
  // max VF that results in a dead vector loop.
  if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
    MaxTripCount -= 1;

  // When the user specifies an interleave count, we need to ensure that
  // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
  unsigned IC = UserIC > 0 ? UserIC : 1;
  unsigned EstimatedVFTimesIC = EstimatedVF * IC;

  // Only clamp when the (adjusted) trip count is known and no larger than the
  // estimated VF*IC; when folding the tail, additionally require a power-of-2
  // trip count so the masked loop still covers it exactly.
  if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
      (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
    // If upper bound loop trip count (TC) is known at compile time there is no
    // point in choosing VF greater than TC / IC (as done in the loop below).
    // Select maximum power of two which doesn't exceed TC / IC. If VF is
    // scalable, we only fall back on a fixed VF when the TC is less than or
    // equal to the known number of lanes.
    auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC);
    if (ClampedUpperTripCount == 0)
      ClampedUpperTripCount = 1;
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count"
                      << (UserIC > 0 ? " divided by UserIC" : "") << ": "
                      << ClampedUpperTripCount << "\n");
    return ElementCount::get(MinVal: ClampedUpperTripCount,
                             Scalable: FoldTailByMasking ? VF.isScalable() : false);
  }
  return VF;
}
3852
// Compute the largest VF for the target, bounded by register width, the safe
// dependence distance (\p MaxSafeVF), and the known maximum trip count;
// optionally widened further when maximizing vector bandwidth.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
  // Whether a scalable or fixed-width maximum is computed follows the
  // scalability of the safe VF bound.
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
      K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                            : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
      Scalable: ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(MinVal: 1);
  }

  ElementCount MaxVF = clampVFByMaxTripCount(
      VF: MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking);
  // If the MaxVF was already clamped, there's no point in trying to pick a
  // larger one.
  if (MaxVF != MaxVectorElementCount)
    return MaxVF;

  TargetTransformInfo::RegisterKind RegKind =
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector;

  // Record the largest VF permissible without maximizing bandwidth; used by
  // shouldConsiderRegPressureForVF to identify MaxBandwidth-only VFs.
  if (MaxVF.isScalable())
    MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
  else
    MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;

  if (useMaxBandwidth(RegKind)) {
    // Size the VF by the smallest element type to maximize the number of
    // lanes that fit in the widest register, still bounded by MaxSafeVF.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
        Scalable: ComputeScalableMaxVF);
    MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Respect the target's minimum VF for the smallest element type, if any.
    if (ElementCount MinVF =
            TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }

    MaxVF =
        clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC, FoldTailByMasking);

    if (MaxVectorElementCount != MaxVF) {
      // Invalidate any widening decisions we might have made, in case the loop
      // requires prediction (decided later), but we have already made some
      // load/store widening decisions.
      invalidateCostModelingDecisions();
    }
  }
  return MaxVF;
}
3927
// Returns true if vectorization factor \p A is considered more profitable
// than \p B, given the (possibly zero/unknown) maximum trip count.
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                const VectorizationFactor &B,
                                                const unsigned MaxTripCount,
                                                bool HasTail,
                                                bool IsEpilogue) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
    if (A.Width.isScalable())
      EstimatedWidthA *= *VScale;
    if (B.Width.isScalable())
      EstimatedWidthB *= *VScale;
  }

  // When optimizing for size choose whichever is smallest, which will be the
  // one with the smallest cost for the whole loop. On a tie pick the larger
  // vector width, on the assumption that throughput will be greater.
  if (CM.CostKind == TTI::TCK_CodeSize)
    return CostA < CostB ||
           (CostA == CostB && EstimatedWidthA > EstimatedWidthB);

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
                        A.Width.isScalable() && !B.Width.isScalable();

  // When scalable A is preferred, break cost ties in its favor (<=).
  auto CmpFn = [PreferScalable](const InstructionCost &LHS,
                                const InstructionCost &RHS) {
    return PreferScalable ? LHS <= RHS : LHS < RHS;
  };

  // To avoid the need for FP division:
  //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
  // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
  if (!MaxTripCount)
    return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);

  // With a known trip-count upper bound, compare total loop costs rather than
  // per-lane costs, accounting for the scalar remainder when a tail exists.
  auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
                                              InstructionCost VectorCost,
                                              InstructionCost ScalarCost) {
    // If the trip count is a known (possibly small) constant, the trip count
    // will be rounded up to an integer number of iterations under
    // FoldTailByMasking. The total cost in that case will be
    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
    // some extra overheads, but for the purpose of comparing the costs of
    // different VFs we can use this to compare the total loop-body cost
    // expected after vectorization.
    if (HasTail)
      return VectorCost * (MaxTripCount / VF) +
             ScalarCost * (MaxTripCount % VF);
    return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
  };

  auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
  auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
  return CmpFn(RTCostA, RTCostB);
}
3991
3992bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3993 const VectorizationFactor &B,
3994 bool HasTail,
3995 bool IsEpilogue) const {
3996 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3997 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3998 IsEpilogue);
3999}
4000
// Emit optimization remarks describing, per recipe, the VFs for which the
// VPlan-based cost model computed an invalid cost, preventing vectorization.
void LoopVectorizationPlanner::emitInvalidCostRemarks(
    OptimizationRemarkEmitter *ORE) {
  using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
  // Collect all (recipe, VF) pairs with an invalid cost, across all plans.
  SmallVector<RecipeVFPair> InvalidCosts;
  for (const auto &Plan : VPlans) {
    for (ElementCount VF : Plan->vectorFactors()) {
      // The VPlan-based cost model is designed for computing vector cost.
      // Querying VPlan-based cost model with a scalar VF will cause some
      // errors because we expect the VF is vector for most of the widen
      // recipes.
      if (VF.isScalar())
        continue;

      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      precomputeCosts(Plan&: *Plan, VF, CostCtx);
      auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
        for (auto &R : *VPBB) {
          if (!R.cost(VF, Ctx&: CostCtx).isValid())
            InvalidCosts.emplace_back(Args: &R, Args&: VF);
        }
      }
    }
  }
  if (InvalidCosts.empty())
    return;

  // Emit a report of VFs with invalid costs in the loop.

  // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
  DenseMap<VPRecipeBase *, unsigned> Numbering;
  unsigned I = 0;
  for (auto &Pair : InvalidCosts)
    if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
      ++I;

  // Sort the list, first on recipe(number) then on VF.
  sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
    unsigned NA = Numbering[A.first];
    unsigned NB = Numbering[B.first];
    if (NA != NB)
      return NA < NB;
    return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
  });

  // For a list of ordered recipe-VF pairs:
  //   [(load, VF1), (load, VF2), (store, VF1)]
  // group the recipes together to emit separate remarks for:
  //   load  (VF1, VF2)
  //   store (VF1)
  auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
  auto Subset = ArrayRef<RecipeVFPair>();
  do {
    if (Subset.empty())
      Subset = Tail.take_front(N: 1);

    VPRecipeBase *R = Subset.front().first;

    // Map the recipe to the IR opcode used in the remark text.
    unsigned Opcode =
        TypeSwitch<const VPRecipeBase *, unsigned>(R)
            .Case(caseFn: [](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
            .Case(
                caseFn: [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
            .Case(caseFn: [](const VPWidenLoadRecipe *R) { return Instruction::Load; })
            .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
                caseFn: [](const auto *R) { return Instruction::Call; })
            .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                  VPWidenCastRecipe>(
                caseFn: [](const auto *R) { return R->getOpcode(); })
            .Case(caseFn: [](const VPInterleaveRecipe *R) {
              return R->getStoredValues().empty() ? Instruction::Load
                                                  : Instruction::Store;
            })
            .Case(caseFn: [](const VPReductionRecipe *R) {
              return RecurrenceDescriptor::getOpcode(Kind: R->getRecurrenceKind());
            });

    // If the next recipe is different, or if there are no other pairs,
    // emit a remark for the collated subset. e.g.
    //   [(load, VF1), (load, VF2))]
    // to emit:
    //  remark: invalid costs for 'load' at VF=(VF1, VF2)
    if (Subset == Tail || Tail[Subset.size()].first != R) {
      std::string OutString;
      raw_string_ostream OS(OutString);
      assert(!Subset.empty() && "Unexpected empty range");
      OS << "Recipe with invalid costs prevented vectorization at VF=(";
      for (const auto &Pair : Subset)
        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
      OS << "):";
      if (Opcode == Instruction::Call) {
        // For calls, name the callee: the intrinsic, the scalar callee, or
        // the function live-in carried as the recipe's last operand.
        StringRef Name = "";
        if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
          Name = Int->getIntrinsicName();
        } else {
          auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
          Function *CalledFn =
              WidenCall ? WidenCall->getCalledScalarFunction()
                        : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
                                             ->getLiveInIRValue());
          Name = CalledFn->getName();
        }
        OS << " call to " << Name;
      } else
        OS << " " << Instruction::getOpcodeName(Opcode);
      reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
                              DL: R->getDebugLoc());
      Tail = Tail.drop_front(N: Subset.size());
      Subset = {};
    } else
      // Grow the subset by one element
      Subset = Tail.take_front(N: Subset.size() + 1);
  } while (!Tail.empty());
}
4116
/// Check if any recipe of \p Plan will generate a vector value, which will be
/// assigned a vector register. Returns false when, at \p VF, every recipe
/// either produces only scalar values or legalizes to scalar-sized parts on
/// the target described by \p TTI.
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
                                const TargetTransformInfo &TTI) {
  assert(VF.isVector() && "Checking a scalar VF?");
  VPTypeAnalysis TypeInfo(Plan);
  // Ephemeral recipes (e.g. assume-related) do not produce real values and
  // are excluded from the check.
  DenseSet<VPRecipeBase *> EphemeralRecipes;
  collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
  // Set of already visited types.
  DenseSet<Type *> Visited;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      if (EphemeralRecipes.contains(V: &R))
        continue;
      // Continue early if the recipe is considered to not produce a vector
      // result. Note that this includes VPInstruction where some opcodes may
      // produce a vector, to preserve existing behavior as VPInstructions model
      // aspects not directly mapped to existing IR instructions.
      switch (R.getVPRecipeID()) {
      case VPRecipeBase::VPDerivedIVSC:
      case VPRecipeBase::VPScalarIVStepsSC:
      case VPRecipeBase::VPReplicateSC:
      case VPRecipeBase::VPInstructionSC:
      case VPRecipeBase::VPCanonicalIVPHISC:
      case VPRecipeBase::VPCurrentIterationPHISC:
      case VPRecipeBase::VPVectorPointerSC:
      case VPRecipeBase::VPVectorEndPointerSC:
      case VPRecipeBase::VPExpandSCEVSC:
      case VPRecipeBase::VPPredInstPHISC:
      case VPRecipeBase::VPBranchOnMaskSC:
        continue;
      case VPRecipeBase::VPReductionSC:
      case VPRecipeBase::VPActiveLaneMaskPHISC:
      case VPRecipeBase::VPWidenCallSC:
      case VPRecipeBase::VPWidenCanonicalIVSC:
      case VPRecipeBase::VPWidenCastSC:
      case VPRecipeBase::VPWidenGEPSC:
      case VPRecipeBase::VPWidenIntrinsicSC:
      case VPRecipeBase::VPWidenSC:
      case VPRecipeBase::VPBlendSC:
      case VPRecipeBase::VPFirstOrderRecurrencePHISC:
      case VPRecipeBase::VPHistogramSC:
      case VPRecipeBase::VPWidenPHISC:
      case VPRecipeBase::VPWidenIntOrFpInductionSC:
      case VPRecipeBase::VPWidenPointerInductionSC:
      case VPRecipeBase::VPReductionPHISC:
      case VPRecipeBase::VPInterleaveEVLSC:
      case VPRecipeBase::VPInterleaveSC:
      case VPRecipeBase::VPWidenLoadEVLSC:
      case VPRecipeBase::VPWidenLoadSC:
      case VPRecipeBase::VPWidenStoreEVLSC:
      case VPRecipeBase::VPWidenStoreSC:
        break;
      default:
        llvm_unreachable("unhandled recipe");
      }

      // Decide, per legalized vector type, whether the target would actually
      // use vector registers for it.
      auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
        unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
        if (!NumLegalParts)
          return false;
        if (VF.isScalable()) {
          // <vscale x 1 x iN> is assumed to be profitable over iN because
          // scalable registers are a distinct register class from scalar
          // ones. If we ever find a target which wants to lower scalable
          // vectors back to scalars, we'll need to update this code to
          // explicitly ask TTI about the register class uses for each part.
          return NumLegalParts <= VF.getKnownMinValue();
        }
        // Two or more elements that share a register - are vectorized.
        return NumLegalParts < VF.getFixedValue();
      };

      // If no def nor is a store, e.g., branches, continue - no value to check.
      if (R.getNumDefinedValues() == 0 &&
          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(Val: &R))
        continue;
      // For multi-def recipes, currently only interleaved loads, suffice to
      // check first def only.
      // For stores check their stored value; for interleaved stores suffice
      // the check first stored value only. In all cases this is the second
      // operand.
      VPValue *ToCheck =
          R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
      Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
      // Each distinct scalar type only needs to be checked once.
      if (!Visited.insert(V: {ScalarTy}).second)
        continue;
      Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
      if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
        return true;
    }
  }

  return false;
}
4213
4214static bool hasReplicatorRegion(VPlan &Plan) {
4215 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4216 G: Plan.getVectorLoopRegion()->getEntry())),
4217 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4218}
4219
4220#ifndef NDEBUG
// Select the most profitable vectorization factor among all built VPlans
// (debug-only path; only compiled when assertions are enabled).
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
  // Baseline: the cost of the scalar (VF=1) loop, against which all vector
  // candidates are compared.
  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(
      any_of(VPlans,
             [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
      "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization &&
      (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is computed only when some VF in this plan requires a
    // register-pressure check.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (any_of(VFs, [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The cost for scalar VF=1 is already calculated, so ignore it.
      if (VF.isScalar())
        continue;

      /// If the register pressure needs to be considered for VF,
      /// don't consider the VF as valid if it exceeds the number
      /// of registers for the target.
      if (CM.shouldConsiderRegPressureForVF(VF) &&
          RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
        continue;

      InstructionCost C = CM.expectedCost(VF);

      // Add on other costs that are modelled in VPlan, but not in the legacy
      // cost model.
      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
      assert(VectorRegion && "Expected to have a vector region!");
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
               vp_depth_first_shallow(VectorRegion->getEntry()))) {
        for (VPRecipeBase &R : *VPBB) {
          auto *VPI = dyn_cast<VPInstruction>(&R);
          if (!VPI)
            continue;
          switch (VPI->getOpcode()) {
          // Selects are only modelled in the legacy cost model for safe
          // divisors.
          case Instruction::Select: {
            if (auto *WR =
                    dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
              switch (WR->getOpcode()) {
              case Instruction::UDiv:
              case Instruction::SDiv:
              case Instruction::URem:
              case Instruction::SRem:
                continue;
              default:
                break;
              }
            }
            C += VPI->cost(VF, CostCtx);
            break;
          }
          case VPInstruction::ActiveLaneMask: {
            // The mask covers VF * Multiplier lanes; cost it at that width.
            unsigned Multiplier =
                cast<VPConstantInt>(VPI->getOperand(2))->getZExtValue();
            C += VPI->cost(VF * Multiplier, CostCtx);
            break;
          }
          case VPInstruction::ExplicitVectorLength:
            C += VPI->cost(VF, CostCtx);
            break;
          default:
            break;
          }
        }
      }

      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
      // Estimated runtime width, used for the per-lane cost debug output.
      unsigned Width =
          estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                        << " costs: " << (Candidate.Cost / Width));
      if (VF.isScalable())
        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                          << CM.getVScaleForTuning().value_or(1) << ")");
      LLVM_DEBUG(dbgs() << ".\n");

      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }

      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
        ChosenFactor = Candidate;
    }
  }

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost,
                                   !CM.foldTailByMasking())) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  return ChosenFactor;
}
4361#endif
4362
4363/// Returns true if the VPlan contains a VPReductionPHIRecipe with
4364/// FindLast recurrence kind.
4365static bool hasFindLastReductionPhi(VPlan &Plan) {
4366 return any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4367 P: [](VPRecipeBase &R) {
4368 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4369 return RedPhi &&
4370 RecurrenceDescriptor::isFindLastRecurrenceKind(
4371 Kind: RedPhi->getRecurrenceKind());
4372 });
4373}
4374
4375/// Returns true if the VPlan contains header phi recipes that are not currently
4376/// supported for epilogue vectorization.
4377static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan) {
4378 return any_of(
4379 Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4380 P: [](VPRecipeBase &R) {
4381 if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R))
4382 return !WidenInd->getPHINode();
4383 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4384 return RedPhi && (RecurrenceDescriptor::isFindLastRecurrenceKind(
4385 Kind: RedPhi->getRecurrenceKind()) ||
4386 !RedPhi->getUnderlyingValue());
4387 });
4388}
4389
// Returns true if the loop, vectorized at main-loop factor \p VF, is a
// supported candidate for epilogue vectorization.
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {
  // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
  // reductions need special handling and are currently unsupported.
  if (any_of(Range: OrigLoop->getHeader()->phis(), P: [&](PHINode &Phi) {
        if (!Legal->isReductionVariable(PN: &Phi))
          return Legal->isFixedOrderRecurrence(Phi: &Phi);
        RecurKind Kind =
            Legal->getRecurrenceDescriptor(PN: &Phi).getRecurrenceKind();
        return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
      }))
    return false;

  // FindLast reductions and inductions without underlying PHI require special
  // handling and are currently not supported for epilogue vectorization.
  if (hasUnsupportedHeaderPhiRecipe(Plan&: getPlanFor(VF)))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (const auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs auditing and
  // testing.
  // TODO: Add support for loops with an early exit.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}
4432
4433bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4434 const ElementCount VF, const unsigned IC) const {
4435 // FIXME: We need a much better cost-model to take different parameters such
4436 // as register pressure, code size increase and cost of extra branches into
4437 // account. For now we apply a very crude heuristic and only consider loops
4438 // with vectorization factors larger than a certain value.
4439
4440 // Allow the target to opt out.
4441 if (!TTI.preferEpilogueVectorization(Iters: VF * IC))
4442 return false;
4443
4444 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4445 ? EpilogueVectorizationMinVF
4446 : TTI.getEpilogueVectorizationMinVF();
4447 return estimateElementCount(VF: VF * IC, VScale: VScaleForTuning) >= MinVFThreshold;
4448}
4449
// Select a vectorization factor for the epilogue loop that runs after a main
// vector loop with factor \p MainLoopVF and interleave count \p IC. Returns
// VectorizationFactor::Disabled() when epilogue vectorization should not, or
// cannot, be performed.
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
    // Honor the forced factor only if a VPlan actually exists for it.
    if (hasPlanWithVF(VF: ForcedEC))
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                         "viable.\n");
    return Result;
  }

  if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
      MinVal: estimateElementCount(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));

  Type *TCType = Legal->getWidestInductionType();
  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
  const SCEV *TC = vputils::getSCEVExprForVPValue(
      V: getPlanFor(VF: MainLoopVF).getTripCount(), PSE);
  assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
  const SCEV *KnownMinTC;
  // Detect a trip count of the form KnownMinTC * vscale.
  bool ScalableTC = match(S: TC, P: m_scev_c_Mul(Op0: m_SCEV(V&: KnownMinTC), Op1: m_SCEVVScale()));
  bool ScalableRemIter = false;
  ScalarEvolution &SE = *PSE.getSE();
  // Use versions of TC and VF in which both are either scalable or fixed.
  if (ScalableTC == MainLoopVF.isScalable()) {
    ScalableRemIter = ScalableTC;
    RemainingIterations =
        SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
  } else if (ScalableTC) {
    // Scalable trip count but fixed VF: estimate the runtime trip count by
    // multiplying the known-minimum count with the tuning vscale.
    const SCEV *EstimatedTC = SE.getMulExpr(
        LHS: KnownMinTC,
        RHS: SE.getConstant(Ty: TCType, V: CM.getVScaleForTuning().value_or(u: 1)));
    RemainingIterations = SE.getURemExpr(
        LHS: EstimatedTC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
  } else
    // Fixed trip count but scalable VF: compare against the estimated
    // runtime VF instead.
    RemainingIterations =
        SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: EstimatedRuntimeVF * IC));

  // No iterations left to process in the epilogue.
  if (RemainingIterations->isZero())
    return Result;

  if (MainLoopVF.isFixed()) {
    // Upper bound on the epilogue trip count; used when comparing the
    // profitability of candidate VFs below. Tighten it via SCEV if possible.
    MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
    if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
                            RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
      MaxTripCount = SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
    }
    LLVM_DEBUG(dbgs() << "LV: Maximum Trip Count for Epilogue: "
                      << MaxTripCount << "\n");
  }

  // True when \p VF is known to exceed the remaining iteration count.
  auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
    return SE.isKnownPredicate(Pred: CmpInst::ICMP_UGT, LHS: VF, RHS: RemIter);
  };
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(VF: NextVF.Width))
      continue;

    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
        (NextVF.Width.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) ||
        (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
         ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF)))
      continue;

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    // TODO: We should also consider comparing against a scalable
    // RemainingIterations when SCEV be able to evaluate non-canonical
    // vscale-based expressions.
    if (!ScalableRemIter) {
      // Handle the case where NextVF and RemainingIterations are in different
      // numerical spaces.
      ElementCount EC = NextVF.Width;
      if (NextVF.Width.isScalable())
        EC = ElementCount::getFixed(
            MinVal: estimateElementCount(VF: NextVF.Width, VScale: CM.getVScaleForTuning()));
      if (SkipVF(SE.getElementCount(Ty: TCType, EC), RemainingIterations))
        continue;
    }

    // Keep the most profitable viable candidate seen so far.
    if (Result.Width.isScalar() ||
        isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking(),
                         /*IsEpilogue*/ true))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
4585
4586std::pair<unsigned, unsigned>
4587LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4588 unsigned MinWidth = -1U;
4589 unsigned MaxWidth = 8;
4590 const DataLayout &DL = TheFunction->getDataLayout();
4591 // For in-loop reductions, no element types are added to ElementTypesInLoop
4592 // if there are no loads/stores in the loop. In this case, check through the
4593 // reduction variables to determine the maximum width.
4594 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4595 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4596 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4597 // When finding the min width used by the recurrence we need to account
4598 // for casts on the input operands of the recurrence.
4599 MinWidth = std::min(
4600 a: MinWidth,
4601 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4602 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4603 MaxWidth = std::max(a: MaxWidth,
4604 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4605 }
4606 } else {
4607 for (Type *T : ElementTypesInLoop) {
4608 MinWidth = std::min<unsigned>(
4609 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4610 MaxWidth = std::max<unsigned>(
4611 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4612 }
4613 }
4614 return {MinWidth, MaxWidth};
4615}
4616
4617void LoopVectorizationCostModel::collectElementTypesForWidening() {
4618 ElementTypesInLoop.clear();
4619 // For each block.
4620 for (BasicBlock *BB : TheLoop->blocks()) {
4621 // For each instruction in the loop.
4622 for (Instruction &I : BB->instructionsWithoutDebug()) {
4623 Type *T = I.getType();
4624
4625 // Skip ignored values.
4626 if (ValuesToIgnore.count(Ptr: &I))
4627 continue;
4628
4629 // Only examine Loads, Stores and PHINodes.
4630 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4631 continue;
4632
4633 // Examine PHI nodes that are reduction variables. Update the type to
4634 // account for the recurrence type.
4635 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4636 if (!Legal->isReductionVariable(PN))
4637 continue;
4638 const RecurrenceDescriptor &RdxDesc =
4639 Legal->getRecurrenceDescriptor(PN);
4640 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4641 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4642 Ty: RdxDesc.getRecurrenceType()))
4643 continue;
4644 T = RdxDesc.getRecurrenceType();
4645 }
4646
4647 // Examine the stored values.
4648 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4649 T = ST->getValueOperand()->getType();
4650
4651 assert(T->isSized() &&
4652 "Expected the load/store/recurrence type to be sized");
4653
4654 ElementTypesInLoop.insert(Ptr: T);
4655 }
4656 }
4657}
4658
// Select the interleave count (unroll factor) for the vectorized loop in
// \p Plan at vectorization factor \p VF. \p LoopCost is the cost of the loop
// at \p VF, or 0 if it has not been computed yet (e.g. for a user-chosen VF).
unsigned
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                                InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave tail-folded loops if wide lane masks are requested, as the
  // overhead of multiple instructions to calculate the predicate is likely
  // not beneficial. If a scalar epilogue is not allowed for any other reason,
  // do not interleave.
  if (!CM.isScalarEpilogueAllowed() &&
      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
    return 1;

  // Loops containing a current-iteration PHI require a variable-length step
  // and cannot be unrolled.
  if (any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPCurrentIterationPHIRecipe>)) {
    LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
                         "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Plan.hasEarlyExit())
    return 1;

  const bool HasReductions =
      any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPReductionPHIRecipe>);

  // FIXME: implement interleaving for FindLast transform correctly.
  if (hasFindLastReductionPhi(Plan))
    return 1;

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    if (VF.isScalar())
      LoopCost = CM.expectedCost(VF);
    else
      LoopCost = cost(Plan, VF);
    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore: CM.ValuesToIgnore)[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(a: Pair.second, b: 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Key: Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    // Number of interleaved instances that fit in the remaining registers,
    // rounded down to a power of two.
    unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
                                     MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(a: 1U, b: (MaxLocalUsers - 1)));
    }

    // The most constrained register class limits the interleave count.
    IC = std::min(a: IC, b: TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  auto BestKnownTC = getSmallBestKnownTC(PSE, L: OrigLoop);

  // For fixed length VFs treat a scalable trip count as unknown.
  if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
    // Re-evaluate trip counts and VFs to be in the same numerical space.
    unsigned AvailableTC =
        estimateElementCount(VF: *BestKnownTC, VScale: CM.getVScaleForTuning());
    unsigned EstimatedVF = estimateElementCount(VF, VScale: CM.getVScaleForTuning());

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    if (CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()))
      --AvailableTC;

    // Conservative IC: caps the IC by trip-count / (VF * 2), so the vector
    // loop runs at least twice.
    unsigned InterleaveCountLB = bit_floor(Value: std::max(
        a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));

    if (getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(Value: std::max(
          a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce same scalar tail, maximize the IC to do the same work
        // in fewer vector loop iterations
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(a: 1u, b: IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(Range: OrigLoop->blocks(), P: [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleave =
      TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
                                    Value: SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated. Count the loads and stores the plan will emit.
    unsigned NumStores = 0;
    unsigned NumLoads = 0;
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
      for (VPRecipeBase &R : *VPBB) {
        if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: &R)) {
          NumLoads++;
          continue;
        }
        if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(Val: &R)) {
          NumStores++;
          continue;
        }

        // An interleave group counts one store per stored operand, or one
        // load per defined value when it has no store operands.
        if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R)) {
          if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
            NumStores += StoreOps;
          else
            NumLoads += InterleaveR->getNumDefinedValues();
          continue;
        }
        if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
          NumLoads += isa<LoadInst>(Val: RepR->getUnderlyingInstr());
          NumStores += isa<StoreInst>(Val: RepR->getUnderlyingInstr());
          continue;
        }
        // A histogram performs both a load and a store.
        if (isa<VPHistogramRecipe>(Val: &R)) {
          NumLoads++;
          NumStores++;
          continue;
        }
      }
    }
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
               P: [](VPRecipeBase &R) {
                 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
                 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()) ||
                                 RecurrenceDescriptor::isFindIVRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()));
               });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && OrigLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
                 P: [](VPRecipeBase &R) {
                   auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);

                   return RedR && RedR->isOrdered();
                 });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(a: SmallIC, b: F);
      StoresIC = std::min(a: StoresIC, b: F);
      LoadsIC = std::min(a: LoadsIC, b: F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(a: StoresIC, b: LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleave) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(a: IC / 2, b: SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleave) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4998
4999bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5000 ElementCount VF) {
5001 // TODO: Cost model for emulated masked load/store is completely
5002 // broken. This hack guides the cost model to use an artificially
5003 // high enough value to practically disable vectorization with such
5004 // operations, except where previously deployed legality hack allowed
5005 // using very low cost values. This is to avoid regressions coming simply
5006 // from moving "masked load/store" check from legality to cost model.
5007 // Masked Load/Gather emulation was previously never allowed.
5008 // Limited number of Masked Store/Scatter emulation was allowed.
5009 assert((isPredicatedInst(I)) &&
5010 "Expecting a scalar emulated instruction");
5011 return isa<LoadInst>(Val: I) ||
5012 (isa<StoreInst>(Val: I) &&
5013 NumPredStores > NumberOfStoresToPredicate);
5014}
5015
// For vector factor \p VF, decide which predicated instructions (and their
// feeding chains) should be scalarized instead of if-converted, recording the
// results in InstsToScalarize[VF] and PredicatedBBsAfterVectorization[VF].
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  assert(VF.isVector() && "Expected VF >= 2");

  // If we've already collected the instructions to scalarize or the predicated
  // BBs after vectorization, there's nothing to do. Collection may already have
  // occurred if we have a user-selected VF and are now computing the expected
  // cost for interleaving.
  if (InstsToScalarize.contains(Key: VF) ||
      PredicatedBBsAfterVectorization.contains(Val: VF))
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(I: &I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        // of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
        // A non-negative discount means scalarizing the chain is at least as
        // cheap as vectorizing it; record the scalar costs in that case.
        if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
            !useEmulatedMaskMemRefHack(I: &I, VF) &&
            computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
          for (const auto &[I, IC] : ScalarCosts)
            ScalarCostsVF.insert(KV: {I, IC});
          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, Cost] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(Val: I);
            if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
              continue;
            CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
            CallWideningDecisions[{CI, VF}].Cost = Cost;
          }
        }
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
        // Also record predecessors whose only successor is BB; they remain
        // together with BB after vectorization.
        for (auto *Pred : predecessors(BB)) {
          if (Pred->getSingleSuccessor() == BB)
            PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
        }
      }
  }
}
5070
// Compute the cost discount of scalarizing \p PredInst together with the
// single-use chain of instructions feeding it, compared to vectorizing them
// at factor \p VF. A non-negative result means the scalar version is at least
// as cheap; the per-instruction scalar costs are recorded in \p ScalarCosts.
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto CanBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Key: I))
      continue;

    // Cannot scalarize fixed-order recurrence phis at the moment.
    if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF);

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
      for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
        ScalarCost += TTI.getScalarizationOverhead(
            Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
            /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      // Account for one PHI per scalarized lane.
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
          for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
            ScalarCost += TTI.getScalarizationOverhead(
                Ty: cast<VectorType>(Val: VectorTy),
                DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
                /*Extract*/ true, CostKind);
          }
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5192
/// Estimate the total cost of the original loop at vectorization factor \p VF
/// by summing the per-instruction costs over all loop blocks. Instructions in
/// the various ignore sets contribute nothing, and for the scalar VF the cost
/// of predicated blocks is scaled down by the probability of executing them.
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
  InstructionCost Cost;

  // If the vector loop gets executed exactly once with the given VF, ignore the
  // costs of comparison and induction instructions, as they'll get simplified
  // away.
  SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
  auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
  if (TC == VF && !foldTailByMasking())
    addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
                                         InstsToIgnore&: ValuesToIgnoreForVF);

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    InstructionCost BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values (generally ignored, ignored for this VF only, or
      // ignored only when vectorizing).
      if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
          (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
        continue;

      InstructionCost C = getInstructionCost(I: &I, VF);

      // Check if we should override the cost (the ForceTargetInstructionCost
      // option was explicitly set).
      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
        // For interleave groups, use ForceTargetInstructionCost once for the
        // whole group: the member at the insert position carries it, all other
        // members are free.
        if (VF.isVector() && getWideningDecision(I: &I, VF) == CM_Interleave) {
          if (getInterleavedAccessGroup(Instr: &I)->getInsertPos() == &I)
            C = InstructionCost(ForceTargetInstructionCost);
          else
            C = InstructionCost(0);
        } else {
          C = InstructionCost(ForceTargetInstructionCost);
        }
      }

      BlockCost += C;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
                        << VF << " For instruction: " << I << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it.
    // getPredBlockCostDivisor will return 1 for blocks that are only predicated
    // by the header mask when folding the tail.
    if (VF.isScalar())
      BlockCost /= getPredBlockCostDivisor(CostKind, BB);

    Cost += BlockCost;
  }

  return Cost;
}
5253
5254/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
5255/// according to isAddressSCEVForCost.
5256///
5257/// This SCEV can be sent to the Target in order to estimate the address
5258/// calculation cost.
5259static const SCEV *getAddressAccessSCEV(
5260 Value *Ptr,
5261 PredicatedScalarEvolution &PSE,
5262 const Loop *TheLoop) {
5263 const SCEV *Addr = PSE.getSCEV(V: Ptr);
5264 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L: TheLoop) ? Addr
5265 : nullptr;
5266}
5267
/// Compute the cost of executing memory instruction \p I as VF scalar
/// operations: per-lane address computation, per-lane scalar load/store, and
/// the insert/extract overhead needed to move between scalar and vector
/// values. For predicated accesses, the cost is scaled by the execution
/// probability of the block and charged for the extra mask extracts and
/// branches. Scalable VFs cannot be scalarized and yield an invalid cost.
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  // There is no mechanism to emit a scalarization loop for scalable vectors.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(V: I);
  Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  // that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
                                                  PtrTy, SE, Ptr: PtrSCEV, CostKind);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
  Cost += VF.getFixedValue() *
          TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy->getScalarType(), Alignment,
                              AddressSpace: AS, CostKind, OpdInfo: OpInfo);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Add the cost of an i1 extract and a branch
    auto *VecI1Ty =
        VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
    Cost += TTI.getScalarizationOverhead(
        Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
5327
5328InstructionCost
5329LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5330 ElementCount VF) {
5331 Type *ValTy = getLoadStoreType(I);
5332 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5333 Value *Ptr = getLoadStorePointerOperand(V: I);
5334 unsigned AS = getLoadStoreAddressSpace(I);
5335 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5336
5337 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5338 "Stride should be 1 or -1 for consecutive memory access");
5339 const Align Alignment = getLoadStoreAlignment(I);
5340 InstructionCost Cost = 0;
5341 if (Legal->isMaskRequired(I)) {
5342 unsigned IID = I->getOpcode() == Instruction::Load
5343 ? Intrinsic::masked_load
5344 : Intrinsic::masked_store;
5345 Cost += TTI.getMemIntrinsicInstrCost(
5346 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS), CostKind);
5347 } else {
5348 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5349 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5350 CostKind, OpdInfo: OpInfo, I);
5351 }
5352
5353 bool Reverse = ConsecutiveStride < 0;
5354 if (Reverse)
5355 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5356 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5357 return Cost;
5358}
5359
5360InstructionCost
5361LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5362 ElementCount VF) {
5363 assert(Legal->isUniformMemOp(*I, VF));
5364
5365 Type *ValTy = getLoadStoreType(I);
5366 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5367 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5368 const Align Alignment = getLoadStoreAlignment(I);
5369 unsigned AS = getLoadStoreAddressSpace(I);
5370 if (isa<LoadInst>(Val: I)) {
5371 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5372 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5373 CostKind) +
5374 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5375 SrcTy: VectorTy, Mask: {}, CostKind);
5376 }
5377 StoreInst *SI = cast<StoreInst>(Val: I);
5378
5379 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5380 // TODO: We have existing tests that request the cost of extracting element
5381 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5382 // the actual generated code, which involves extracting the last element of
5383 // a scalable vector where the lane to extract is unknown at compile time.
5384 InstructionCost Cost =
5385 TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5386 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, CostKind);
5387 if (!IsLoopInvariantStoreValue)
5388 Cost += TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
5389 Val: VectorTy, CostKind, Index: 0);
5390 return Cost;
5391}
5392
5393InstructionCost
5394LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5395 ElementCount VF) {
5396 Type *ValTy = getLoadStoreType(I);
5397 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5398 const Align Alignment = getLoadStoreAlignment(I);
5399 Value *Ptr = getLoadStorePointerOperand(V: I);
5400 Type *PtrTy = Ptr->getType();
5401
5402 if (!Legal->isUniform(V: Ptr, VF))
5403 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
5404
5405 unsigned IID = I->getOpcode() == Instruction::Load
5406 ? Intrinsic::masked_gather
5407 : Intrinsic::masked_scatter;
5408 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5409 TTI.getMemIntrinsicInstrCost(
5410 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Ptr,
5411 Legal->isMaskRequired(I), Alignment, I),
5412 CostKind);
5413}
5414
/// Compute the cost of accessing the whole interleave group that \p I belongs
/// to at vectorization factor \p VF: one wide memory operation covering all
/// members, plus a reverse shuffle per member for reversed groups.
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  const auto *Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Fail to get an interleaved access group.");

  // The group is costed as if emitted at its insert position.
  Instruction *InsertPos = Group->getInsertPos();
  Type *ValTy = getLoadStoreType(I: InsertPos);
  auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
  unsigned AS = getLoadStoreAddressSpace(I: InsertPos);

  unsigned InterleaveFactor = Group->getFactor();
  // The wide access spans all members of the group: VF * Factor elements.
  auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(Index: IF))
      Indices.push_back(Elt: IF);

  // Calculate the cost of the whole interleaved group. Gaps must be masked
  // when the scalar epilogue the group requires is not allowed, or when
  // storing a group with missing members.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(Val: I) && !Group->isFull());
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
      Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I),
      UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    // Each present member needs its own reversing shuffle.
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
                               SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
  }
  return Cost;
}
5454
/// Try to match \p I against an in-loop reduction pattern —
/// reduce(mul(ext(A), ext(B))), reduce(mul(A, B)), reduce(ext(A)) or
/// reduce(A) — and return the combined pattern cost when it beats costing the
/// components individually. For non-root members of a matched pattern the
/// returned cost is 0; std::nullopt means \p I is not part of an in-loop
/// reduction and the normal costing should apply.
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                    ElementCount VF,
                                                    Type *Ty) const {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Val: Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  Instruction *RetI = I;
  if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
    if (!RetI->hasOneUser())
      return std::nullopt;
    RetI = RetI->user_back();
  }

  if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
  if (!LastChain)
    return std::nullopt;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(Val: ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);

  const RecurrenceDescriptor &RdxDesc =
      Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));

  InstructionCost BaseCost;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
    Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
    BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
                                          FMF: RdxDesc.getFastMathFlags(), CostKind);
  } else {
    BaseCost = TTI.getArithmeticReductionCost(
        Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
  }

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RK == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
                           ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
                           : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));

  VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
      match(V: RedOp,
            P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
      match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
      !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Val: Op0);
    auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
    auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);

    InstructionCost RedCost = TTI.getMulAccReductionCost(
        IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        CostKind);

    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
             !TheLoop->isLoopInvariant(V: RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
    auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
    InstructionCost RedCost = TTI.getExtendedReductionCost(
        Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        FMF: RdxDesc.getFastMathFlags(), CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
             match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
    if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Val: Op0);
      Type *Op0Ty = Op0->getOperand(i: 0)->getType();
      Type *Op1Ty = Op1->getOperand(i: 0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);

      // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
          CostKind);
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
            Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
            CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
      // Matched reduce.add(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned: true, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
5639
5640InstructionCost
5641LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5642 ElementCount VF) {
5643 // Calculate scalar cost only. Vectorization cost should be ready at this
5644 // moment.
5645 if (VF.isScalar()) {
5646 Type *ValTy = getLoadStoreType(I);
5647 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5648 const Align Alignment = getLoadStoreAlignment(I);
5649 unsigned AS = getLoadStoreAddressSpace(I);
5650
5651 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5652 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5653 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5654 OpdInfo: OpInfo, I);
5655 }
5656 return getWideningCost(I, VF);
5657}
5658
/// Estimate the insert/extract overhead of scalarizing \p I at factor \p VF:
/// the insertelements that rebuild \p I's vector result from the scalar
/// results, plus the extractelements that feed each scalar copy from \p I's
/// vectorized operands. Returns an invalid cost for scalable VFs and 0 for
/// the scalar VF.
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) const {

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  // A scalar instruction needs no inserts or extracts.
  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {

    // Record whether the inserted values come from a load or feed a store, so
    // the target hook below can take that context into account.
    TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None;
    if (isa<LoadInst>(Val: I))
      VIC = TTI::VectorInstrContext::Load;
    else if (isa<StoreInst>(Val: I))
      VIC = TTI::VectorInstrContext::Store;

    for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
      Cost += TTI.getScalarizationOverhead(
          Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
          /*Insert=*/true, /*Extract=*/false, CostKind,
          /*ForPoisonSrc=*/true, VL: {}, VIC);
    }
  }

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider (for calls, only the arguments).
  CallInst *CI = dyn_cast<CallInst>(Val: I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));

  TTI::VectorInstrContext OperandVIC = isa<StoreInst>(Val: I)
                                           ? TTI::VectorInstrContext::Store
                                           : TTI::VectorInstrContext::None;
  return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC: OperandVIC);
}
5713
/// For every memory instruction in the loop, pick the cheapest legal widening
/// strategy at factor \p VF (widen, widen-reverse, interleave, gather/scatter
/// or scalarize) and cache the decision together with its cost. Afterwards,
/// unless the target prefers vectorized addressing, force instructions that
/// feed address computations to remain scalar.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only memory instructions (those with a pointer operand) get a
      // widening decision here.
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't necessarily uniform-by-parts
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF) ?
          getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
                                 : InstructionCost::getInvalid();

        // Choose better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        const auto *Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (const auto *Group = getInterleavedAccessGroup(Instr: &I)) {
        if (Decision == CM_Scalarize) {
          // When scalarizing, each member is costed individually.
          for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
            if (auto *I = Group->getMember(Index: Idx)) {
              setWideningDecision(I, VF, W: Decision,
                                  Cost: getMemInstScalarizationCost(I, VF));
            }
          }
        } else {
          setWideningDecision(Grp: Group, VF, W: Decision, Cost);
        }
      } else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses (transitive closure
  // of in-loop, non-phi operands).
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if (TheLoop->contains(Inst: InstOp) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
    // If there are direct memory op users of the newly scalarized load,
    // their cost may have changed because there's no scalarization
    // overhead for the operand. Update it.
    for (User *U : LI->users()) {
      if (!isa<LoadInst, StoreInst>(Val: U))
        continue;
      if (getWideningDecision(I: cast<Instruction>(Val: U), VF) != CM_Scalarize)
        continue;
      setWideningDecision(
          I: cast<Instruction>(Val: U), VF, W: CM_Scalarize,
          Cost: getMemInstScalarizationCost(I: cast<Instruction>(Val: U), VF));
    }
  };
  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (!isPredicatedInst(I) &&
          (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
           (!Legal->isUniformMemOp(I&: *I, VF) && Decision == CM_Scalarize))) {
        // Scalarize a widened load of address or update the cost of a scalar
        // load of an address.
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
        UpdateMemOpUserCost(cast<LoadInst>(Val: I));
      } else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize all members of this interleaved group when any member
        // is used as an address. The address-used load skips scalarization
        // overhead, other members include it.
        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
          if (Instruction *Member = Group->getMember(Index: Idx)) {
            InstructionCost Cost =
                AddrDefs.contains(Ptr: Member)
                    ? (VF.getKnownMinValue() *
                       getMemoryInstructionCost(I: Member,
                                                VF: ElementCount::getFixed(MinVal: 1)))
                    : getMemInstScalarizationCost(I: Member, VF);
            setWideningDecision(I: Member, VF, W: CM_Scalarize, Cost);
            UpdateMemOpUserCost(cast<LoadInst>(Val: Member));
          }
        }
      }
    } else {
      // Cannot scalarize fixed-order recurrence phis at the moment.
      if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
        continue;

      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
    }
  }
}
5939
// For every call instruction in the loop, decide for vectorization factor
// \p VF whether to scalarize the call, call a vectorized library variant, or
// emit a vector intrinsic. The cheapest option (by the legacy cost model) is
// recorded via setCallWideningDecision for later queries.
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF");

  // Instructions already forced to be scalar for this VF, if any.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(Val: &I);

      // Only call instructions get a call-widening decision.
      if (!CI)
        continue;

      // Invalid costs act as "option not available" below.
      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(Elt: ArgOp->getType());

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      if (VF.isFixed()) {
        InstructionCost ScalarCallCost =
            TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);

        // Compute costs of unpacking argument values for the scalar calls and
        // packing the return values to a vector.
        InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);
        ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
      } else {
        // There is no point attempting to calculate the scalar cost for a
        // scalable VF as we know it will be Invalid.
        assert(!getScalarizationOverhead(CI, VF).isValid() &&
               "Unexpected valid cost for scalarizing scalable vectors");
        ScalarCost = InstructionCost::getInvalid();
      }

      // Honor ForcedScalars and UniformAfterVectorization decisions.
      // TODO: For calls, it might still be more profitable to widen. Use
      // VPlan-based cost model to compare different options.
      if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                             ForcedScalar->second.contains(Ptr: CI)) ||
                            isUniformAfterVectorization(I: CI, VF))) {
        setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
                                IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
                                Cost: ScalarCost);
        continue;
      }

      bool MaskRequired = Legal->isMaskRequired(I: CI);
      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
        if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
          setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
                                  IID: getVectorIntrinsicIDForCall(CI, TLI),
                                  MaskPos: std::nullopt, Cost: *RedCost);
          continue;
        }

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
                                              L: TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            if (!match(S: SE->getSCEV(V: ScalarParam),
                       P: m_scev_AffineAddRec(
                           Op0: m_SCEV(), Op1: m_scev_SpecificSInt(V: Param.LinearStepOrPos),
                           L: m_SpecificLoop(L: TheLoop))))
              ParamsOk = false;
            break;
          }
          case VFParamKind::GlobalPredicate:
            break;
          default:
            // Any other parameter kind is unsupported; reject this variant.
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
        FuncInfo = Info;
        break;
      }

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Pick the cheapest valid option. The "<=" comparisons mean that on a
      // tie a vector call beats scalarization and an intrinsic beats both.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost.isValid() && VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
                              MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
6096
6097bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6098 if (!Legal->isInvariant(V: Op))
6099 return false;
6100 // Consider Op invariant, if it or its operands aren't predicated
6101 // instruction in the loop. In that case, it is not trivially hoistable.
6102 auto *OpI = dyn_cast<Instruction>(Val: Op);
6103 return !OpI || !TheLoop->contains(Inst: OpI) ||
6104 (!isPredicatedInst(I: OpI) &&
6105 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6106 all_of(Range: OpI->operands(),
6107 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6108}
6109
// Legacy cost-model entry point: estimate the cost of instruction \p I when
// the loop is vectorized with factor \p VF. Uniform instructions are costed
// at VF = 1; forced-scalar instructions are costed as VF scalar copies; the
// remaining opcodes are dispatched through the switch below.
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  // Instructions already analyzed as profitable to scalarize carry a
  // precomputed cost.
  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      // Cost one scalar copy and replicate it VF times.
      return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
             VF.getKnownMinValue();
  }

  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto *SE = PSE.getSE();

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    // Assertion helper: true if neither I nor any of its users are
    // scalarized, i.e. only one copy of I will be emitted.
    [[maybe_unused]] auto HasSingleCopyAfterVectorization =
        [this](Instruction *I, ElementCount VF) -> bool {
      if (VF.isScalar())
        return true;

      auto Scalarized = InstsToScalarize.find(Key: VF);
      assert(Scalarized != InstsToScalarize.end() &&
             "VF not yet analyzed for scalarization profitability");
      return !Scalarized->second.count(Key: I) &&
             llvm::all_of(Range: I->users(), P: [&](User *U) {
               auto *UI = cast<Instruction>(Val: U);
               return !Scalarized->second.count(Key: UI);
             });
    };

    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);

  // If TTI cannot split the vector type into any parts, the type is not
  // costable for this target; give up.
  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(Tp: VectorTy))
    return InstructionCost::getInvalid();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced by
    // a single branch controlling the loop, so there is no extra overhead from
    // scalarization.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(Val: I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (
          TTI.getScalarizationOverhead(
              Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    if (VF.isScalar())
      return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
    // A vectorized switch is costed as one vector compare per case.
    auto *Switch = cast<SwitchInst>(Val: I);
    return Switch->getNumCases() *
           TTI.getCmpSelInstrCost(
               Opcode: Instruction::ICmp,
               ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
               CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
               VecPred: CmpInst::ICMP_EQ, CostKind);
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Cost a splice shuffle extracting the last lane of the previous
      // iteration's vector.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                DstTy: cast<VectorType>(Val: VectorTy),
                                SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(Val: U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(Key: HeaderUser);
        if (Iter != ReductionVars.end() &&
            RecurrenceDescriptor::isAnyOfRecurrenceKind(
                Kind: Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(C&: Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
                 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }

    // When tail folding with EVL, if the phi is part of an out of loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
        Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
      IntrinsicCostAttributes ICA(
          Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
          {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    if (VF.isVector() && isPredicatedInst(I)) {
      // Choose between scalarizing the div/rem with predication and using a
      // safe-divisor select, whichever the speculation analysis prefers.
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      InstructionCost MulCost = TTI::TCC_Free;
      ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(C&: I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
          PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
         (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
          PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
        PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
        isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
      // SCEV proved the operand is a constant; cost it as one.
      Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        shouldConsiderInvariant(Op: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    // A loop-invariant condition stays scalar after vectorization.
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And,
          Ty: VectorTy, CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: {Op0, Op1}, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
                                  Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();

    if (canTruncateToMinimalBitwidth(I, VF)) {
      [[maybe_unused]] Instruction *Op0AsInstruction =
          dyn_cast<Instruction>(Val: I->getOperand(i: 0));
      assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
              MinBWs[I] == MinBWs[Op0AsInstruction]) &&
             "if both the operand and the compare are marked for "
             "truncation, they must have the same bitwidth");
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
    }

    VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(
        Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
        VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return isPredicatedInst(I) ? TTI::CastContextHint::Masked
                                   : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;

    if (canTruncateToMinimalBitwidth(I, VF)) {
      // If the result type is <= the source type, there will be no extend
      // after truncating the users to the minimal required bitwidth.
      if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
          (I->getOpcode() == Instruction::ZExt ||
           I->getOpcode() == Instruction::SExt))
        return 0;
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(U: I, CostKind);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: RetTy, CostKind);
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
6546
// Populate ValuesToIgnore / VecValuesToIgnore with instructions the cost
// model should treat as free: ephemeral values, dead interleave-group
// pointers, trivially-dead ops (found via a worklist fixpoint), and
// reduction/induction cast chains.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);

  // Worklists for the two fixpoint passes below.
  SmallVector<Value *, 4> DeadInterleavePointerOps;
  SmallVector<Value *, 4> DeadOps;

  // If a scalar epilogue is required, users outside the loop won't use
  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
  // that is the case.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
  };

  // Walk blocks in reverse RPO and instructions bottom-up so users are seen
  // before their operands.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
    for (Instruction &I : reverse(C&: *BB)) {
      if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
        continue;

      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps to seed worklist.
      if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
          all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
            return VecValuesToIgnore.contains(Ptr: U) ||
                   ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
          }))
        DeadOps.push_back(Elt: &I);

      // For interleave groups, we only create a pointer for the start of the
      // interleave group. Queue up addresses of group members except the insert
      // position for further processing.
      if (isAccessInterleaved(Instr: &I)) {
        auto *Group = getInterleavedAccessGroup(Instr: &I);
        if (Group->getInsertPos() == &I)
          continue;
        Value *PointerOp = getLoadStorePointerOperand(V: &I);
        DeadInterleavePointerOps.push_back(Elt: PointerOp);
      }

      // Queue branches for analysis. They are dead, if their successors only
      // contain dead instructions.
      if (auto *Br = dyn_cast<BranchInst>(Val: &I)) {
        if (Br->isConditional())
          DeadOps.push_back(Elt: &I);
      }
    }

  // Mark ops feeding interleave group members as free, if they are only used
  // by other dead computations. (Indexed loop: the worklist grows while we
  // iterate.)
  for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
    if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
          Instruction *UI = cast<Instruction>(Val: U);
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 (!isAccessInterleaved(Instr: UI) ||
                  getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
        }))
      continue;
    VecValuesToIgnore.insert(Ptr: Op);
    append_range(C&: DeadInterleavePointerOps, R: Op->operands());
  }

  // Mark ops that would be trivially dead and are only used by ignored
  // instructions as free.
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if the block contains only dead instructions. Such blocks will
  // be removed by VPlan-to-VPlan transforms and won't be considered by the
  // VPlan-based cost model, so skip them in the legacy cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(Range&: *BB, P: [this](Instruction &I) {
      return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
             (isa<BranchInst>(Val: &I) && !cast<BranchInst>(Val: &I)->isConditional());
    });
  };
  // Indexed loop: DeadOps grows while we iterate.
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<BranchInst>(Val: Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(i: 0);
      BasicBlock *ElseBB = Br->getSuccessor(i: 1);
      // Don't consider branches leaving the loop for simplification.
      if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      // Dead if both successors are empty, or one is empty and falls
      // through to the other (no phis to feed).
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Ptr: Br);
        DeadOps.push_back(Elt: Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Inst: Op) ||
        (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
        any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
        }))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Range: Op->users(),
               P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
      ValuesToIgnore.insert(Ptr: Op);

    VecValuesToIgnore.insert(Ptr: Op);
    append_range(C&: DeadOps, R: Op->operands());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    VecValuesToIgnore.insert_range(R: IndDes.getCastInsts());
  }
}
6684
/// For each reduction phi, decide whether the reduction should be performed
/// "in-loop" (based on target preference or ordered-reduction requirements)
/// and, if so, record the phi in InLoopReductions and the chain of reduction
/// operations in InLoopReductionImmediateChains for cost modelling.
void LoopVectorizationCostModel::collectInLoopReductions() {
  // Avoid duplicating work finding in-loop reductions.
  if (!InLoopReductions.empty())
    return;

  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
    // separately and should not be considered for in-loop reductions.
    if (RdxDesc.hasUsesOutsideReductionChain())
      continue;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // In-loop AnyOf and FindIV reductions are not yet supported.
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
        RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) ||
        RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such.
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, L: TheLoop);
    // An empty chain means we could not prove the reduction can be performed
    // in-loop; fall back to an out-of-loop reduction.
    bool InLoop = !ReductionOperations.empty();

    if (InLoop) {
      InLoopReductions.insert(Ptr: Phi);
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}
6736
6737// This function will select a scalable VF if the target supports scalable
6738// vectors and a fixed one otherwise.
6739// TODO: we could return a pair of values that specify the max VF and
6740// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6741// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6742// doesn't have a cost model that can choose which plan to execute if
6743// more than one is generated.
6744static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6745 LoopVectorizationCostModel &CM) {
6746 unsigned WidestType;
6747 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6748
6749 TargetTransformInfo::RegisterKind RegKind =
6750 TTI.enableScalableVectorization()
6751 ? TargetTransformInfo::RGK_ScalableVector
6752 : TargetTransformInfo::RGK_FixedWidthVector;
6753
6754 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6755 unsigned N = RegSize.getKnownMinValue() / WidestType;
6756 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6757}
6758
/// Plan vectorization on the VPlan-native path. Only outer (non-innermost)
/// loops are handled; for them, VPlans are built upfront for a single VF
/// (user-provided or computed). Returns Disabled when no plan can be built,
/// when stress-testing, or for innermost loops.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // Reject a user-requested scalable VF on targets without scalable
      // vector support (unless forced for testing).
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target",
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(MinVF: VF, MaxVF: VF);

    // Bail out if no VPlan could be built for this (loop, VF) combination.
    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
6813
/// Plan vectorization of the innermost loop: compute the maximum feasible
/// VFs, collect per-VF cost-model state, and build VPlans either for the
/// user-specified VF (when given and valid) or for all candidate VFs up to
/// the fixed and scalable maxima.
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  // A user-provided VF is only honored up to the maximum safe VF of the
  // matching (fixed or scalable) kind.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
      reportVectorizationInfo(
          Msg: "UserVF ignored because it may be larger than the maximal safe VF",
          ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      // The user VF produced invalid costs; fall through to the automatic
      // candidate selection below.
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
                              ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates: all powers of two up to the
  // maximum fixed and scalable VFs.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}
6884
6885InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6886 ElementCount VF) const {
6887 InstructionCost Cost = CM.getInstructionCost(I: UI, VF);
6888 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6889 return InstructionCost(ForceTargetInstructionCost);
6890 return Cost;
6891}
6892
/// Forward to the legacy cost model's notion of "uniform after
/// vectorization" for instruction \p I at vectorization factor \p VF.
bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
                                                      ElementCount VF) const {
  return CM.isUniformAfterVectorization(I, VF);
}
6897
6898bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6899 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6900 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6901 SkipCostComputation.contains(Ptr: UI);
6902}
6903
/// Return the legacy cost model's divisor used to scale the cost of
/// instructions in predicated block \p BB for the current cost kind.
unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
  return CM.getPredBlockCostDivisor(CostKind, BB);
}
6907
/// Pre-compute, via the legacy cost model, the cost of instructions whose
/// VPlan recipes would be costed differently (induction phis/increments,
/// optimized truncates, exit conditions, branches, and forced/profitable
/// scalars), marking each in \p CostCtx.SkipCostComputation so the
/// VPlan-based cost model does not count it again. Returns the accumulated
/// pre-computed cost.
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
    SmallVector<Instruction *> IVInsts = {IVInc};
    // Transitively collect single-use, in-loop operands feeding the increment;
    // IVInsts grows as we walk, so iterate by index.
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Val: Op);
        if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(Elt: OpI);
      }
    }
    IVInsts.push_back(Elt: IV);
    // Also pre-compute truncates of the IV that can be folded into the widened
    // induction recipe.
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(Val: U);
      if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
        continue;
      IVInsts.push_back(Elt: CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
                                           InstsToIgnore&: CostCtx.SkipCostComputation);

    // Accumulate the legacy cost of each collected induction instruction and
    // mark it so the VPlan-based model skips it.
    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(Ptr: IVInst);
    }
  }

  // Compute the cost of all exiting conditions of the loop using the legacy
  // cost model. This is to match the legacy behavior, which adds the cost of
  // all exit conditions. Note that this over-estimates the cost, as there will
  // be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
      ExitInstrs.insert(X: CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  // ExitInstrs grows as operands are added, so iterate by index.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(Inst: CondI) ||
        !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Val: Op);
      // Only queue operands all of whose users feed exit conditions.
      if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
          any_of(Range: OpI->users(), P: [&ExitInstrs](User *U) {
            return !ExitInstrs.contains(key: cast<Instruction>(Val: U));
          }))
        continue;
      ExitInstrs.insert(X: OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
    // The latch terminator (backedge) is skipped after being marked above.
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Don't apply special costs when instruction cost is forced to make sure the
  // forced cost is used for each recipe.
  if (ForceTargetInstructionCost.getNumOccurrences())
    return Cost;

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  // Instructions profitable to scalarize carry a pre-computed scalar cost.
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}
7053
/// Compute the total cost of \p Plan at \p VF: the legacy-model pre-computed
/// costs plus the VPlan-based recipe costs. In debug builds, also prints an
/// estimated cost per lane.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, Ctx&: CostCtx);
#ifndef NDEBUG
  unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}
7074
#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplification that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  // Check if a select for a safe divisor was hoisted to the pre-header. If so,
  // the select doesn't need to be considered for the vector loop cost; go with
  // the more accurate VPlan-based cost model.
  for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
    auto *VPI = dyn_cast<VPInstruction>(&R);
    if (!VPI || VPI->getOpcode() != Instruction::Select)
      continue;

    if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
      switch (WR->getOpcode()) {
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
    }
  }

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      // All members of an interleave group are covered by its single recipe.
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost it whilst the legacy will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        using namespace VPlanPatternMatch;
        if (none_of(FOR->users(),
                    match_fn(m_VPInstruction<
                             VPInstruction::FirstOrderRecurrenceSplice>())))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reductions and
      // comparing against the legacy cost isn't desirable.
      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
        if (VPR->isPartialReduction())
          return true;

      // The VPlan-based cost model can analyze if recipes are scalar
      // recursively, but the legacy cost model cannot.
      if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
        auto *AddrI = dyn_cast<Instruction>(
            getLoadStorePointerOperand(&WidenMemR->getIngredient()));
        // Diverging single-scalar / uniform-after-vectorization verdicts for
        // the address mean the two models cost the access differently.
        if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
                         CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
          return true;

        if (WidenMemR->isReverse()) {
          // If the stored value of a reverse store is invariant, LICM will
          // hoist the reverse operation to the preheader. In this case, the
          // result of the VPlan-based cost model will diverge from that of
          // the legacy model.
          if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;

          if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;
        }
      }

      // The legacy cost model costs non-header phis with a scalar VF as a phi,
      // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
      if (isa<VPBlendRecipe>(&R) &&
          vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
        return true;

      // If a VPlan transform folded a recipe to one producing a single-scalar,
      // but the original instruction wasn't uniform-after-vectorization in the
      // legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        using namespace VPlanPatternMatch;
        CmpPredicate Pred;
        if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
            cast<VPRecipeWithIRFlags>(R).getPredicate() !=
                cast<CmpInst>(UI)->getPredicate())
          return true;

        // Recipes with underlying instructions being moved out of the loop
        // region by LICM may cause discrepancies between the legacy cost model
        // and the VPlan-based cost model.
        if (!VPBB->getEnclosingLoopRegion())
          return true;

        SeenInstrs.insert(UI);
      }
    }
  }

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may not
      // be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
#endif
7218
/// Select the most profitable vectorization factor across all built VPlans
/// using the VPlan-based cost model, comparing each candidate against the
/// scalar loop cost. In debug builds, the result is cross-checked against the
/// legacy cost model's selection.
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                    << (CM.CostKind == TTI::TCK_RecipThroughput
                            ? "Reciprocal Throughput\n"
                            : CM.CostKind == TTI::TCK_Latency
                                  ? "Instruction Latency\n"
                                  : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                                  : CM.CostKind == TTI::TCK_SizeAndLatency
                                        ? "Code Size and Latency\n"
                                        : "Unknown\n"));

  ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register-usage estimates are only computed when at least one VF of this
    // plan needs register-pressure consideration.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (any_of(Range&: VFs, P: [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The scalar VF is only used as the comparison baseline above.
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      InstructionCost Cost = cost(Plan&: *P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

      // Reject candidates whose register usage exceeds the target's limits.
      if (CM.shouldConsiderRegPressureForVF(VF) &&
          RUs[I].exceedsMaxNumRegs(TTI, OverrideMaxNumRegs: ForceTargetNumVectorRegs)) {
        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                          << VF << " because it uses too many registers\n");
        continue;
      }

      if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
        BestFactor = CurrentFactor;

      // If profitable add it to ProfitableVF list.
      if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
        ProfitableVFs.push_back(Elt: CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost-model.
  // This is now only used to verify the decisions by the new VPlan-based
  // cost-model and will be retired once the VPlan-based cost-model is
  // stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause a
  // different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, CM.PSE,
                        OrigLoop);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // Verify that the VPlan-based and legacy cost models agree, except for
  // * VPlans with early exits,
  // * VPlans with additional VPlan simplifications,
  // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
  //   vp_scatter/vp_gather).
  // The legacy cost model doesn't properly model costs for such loops.
  bool UsesEVLGatherScatter =
      any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
                 BestPlan.getVectorLoopRegion()->getEntry())),
             [](VPBasicBlock *VPBB) {
               return any_of(*VPBB, [](VPRecipeBase &R) {
                 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
                        !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
               });
             });
  assert(
      (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
       !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
       planContainsAdditionalSimplifications(
           getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
       planContainsAdditionalSimplifications(
           getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
      " VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}
7349
// If \p EpiResumePhiR is resume VPPhi for a reduction when vectorizing the
// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
// from the main vector loop.
static void fixReductionScalarResumeWhenVectorizingEpilog(
    VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
  using namespace VPlanPatternMatch;
  // Get the VPInstruction computing the reduction result in the middle block.
  // The first operand may not be from the middle block if it is not connected
  // to the scalar preheader. In that case, there's nothing to fix.
  VPValue *Incoming = EpiResumePhiR->getOperand(N: 0);
  // Look through a possible sign/zero extension of the incoming value.
  match(V: Incoming, P: VPlanPatternMatch::m_ZExtOrSExt(
                       Op0: VPlanPatternMatch::m_VPValue(V&: Incoming)));
  auto *EpiRedResult = dyn_cast<VPInstruction>(Val: Incoming);
  if (!EpiRedResult)
    return;

  // Identify the value feeding the reduction result, depending on the kind of
  // reduction-result computation. Bail out for kinds not handled here.
  VPValue *BackedgeVal;
  bool IsFindIV = false;
  if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult ||
      EpiRedResult->getOpcode() == VPInstruction::ComputeReductionResult)
    BackedgeVal = EpiRedResult->getOperand(N: EpiRedResult->getNumOperands() - 1);
  else if (matchFindIVResult(VPI: EpiRedResult, ReducedIV: m_VPValue(V&: BackedgeVal), Start: m_VPValue()))
    IsFindIV = true;
  else
    return;

  auto *EpiRedHeaderPhi = cast_if_present<VPReductionPHIRecipe>(
      Val: vputils::findRecipe(Start: BackedgeVal, Pred: IsaPred<VPReductionPHIRecipe>));
  if (!EpiRedHeaderPhi) {
    // The backedge value may be wrapped in a select; look through it to reach
    // the reduction header phi recipe.
    match(V: BackedgeVal,
          P: VPlanPatternMatch::m_Select(Op0: VPlanPatternMatch::m_VPValue(),
                                       Op1: VPlanPatternMatch::m_VPValue(V&: BackedgeVal),
                                       Op2: VPlanPatternMatch::m_VPValue()));
    EpiRedHeaderPhi = cast<VPReductionPHIRecipe>(
        Val: vputils::findRecipe(Start: BackedgeVal, Pred: IsaPred<VPReductionPHIRecipe>));
  }

  // Recover the IR value the epilogue reduction resumes from: the start value
  // of the epilogue header phi, looking through a broadcast/start recipe.
  Value *MainResumeValue;
  if (auto *VPI = dyn_cast<VPInstruction>(Val: EpiRedHeaderPhi->getStartValue())) {
    assert((VPI->getOpcode() == VPInstruction::Broadcast ||
            VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
           "unexpected start recipe");
    MainResumeValue = VPI->getOperand(N: 0)->getUnderlyingValue();
  } else
    MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
  if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
    // For AnyOf, the resume value is wrapped in an ICMP_NE against the
    // original start value; unwrap it to reach the main resume phi.
    [[maybe_unused]] Value *StartV =
        EpiRedResult->getOperand(N: 0)->getLiveInIRValue();
    auto *Cmp = cast<ICmpInst>(Val: MainResumeValue);
    assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
           "AnyOf expected to start with ICMP_NE");
    assert(Cmp->getOperand(1) == StartV &&
           "AnyOf expected to start by comparing main resume value to original "
           "start value");
    MainResumeValue = Cmp->getOperand(i_nocapture: 0);
  } else if (IsFindIV) {
    // For FindIV, the resume value is the false arm of a select.
    MainResumeValue = cast<SelectInst>(Val: MainResumeValue)->getFalseValue();
  }
  PHINode *MainResumePhi = cast<PHINode>(Val: MainResumeValue);

  // When fixing reductions in the epilogue loop we should already have
  // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
  // over the incoming values correctly.
  EpiResumePhi.setIncomingValueForBlock(
      BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
}
7416
/// Generate LLVM-IR for \p BestVPlan at the chosen \p BestVF / \p BestUF.
///
/// Runs the remaining VPlan-to-VPlan lowering transforms (unrolling,
/// materialization, region dissolution, SCEV expansion), creates the loop
/// skeleton via \p ILV, executes the plan to emit the vector loop, and
/// finally updates loop metadata, profile info and analyses.
///
/// \returns the map from SCEV expressions to the IR values they were expanded
/// to in the plan entry, so epilogue vectorization can reuse the expansions.
/// The map is empty if vectorization is abandoned because the vector loop was
/// proven to never execute.
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
    InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
  assert(BestVPlan.hasVF(BestVF) &&
         "Trying to execute plan with unsupported VF");
  assert(BestVPlan.hasUF(BestUF) &&
         "Trying to execute plan with unsupported UF");
  if (BestVPlan.hasEarlyExit())
    ++LoopsEarlyExitVectorized;
  // TODO: Move to VPlan transform stage once the transition to the VPlan-based
  // cost model is complete for better cost estimates.
  RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
  RUN_VPLAN_PASS(VPlanTransforms::materializePacksAndUnpacks, BestVPlan);
  RUN_VPLAN_PASS(VPlanTransforms::materializeBroadcasts, BestVPlan);
  RUN_VPLAN_PASS(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
  // Only add branch weights to the middle terminator if the original latch
  // carried branch-weight metadata to scale from.
  bool HasBranchWeights =
      hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
  if (HasBranchWeights) {
    std::optional<unsigned> VScale = CM.getVScaleForTuning();
    RUN_VPLAN_PASS(VPlanTransforms::addBranchWeightToMiddleTerminator,
                   BestVPlan, BestVF, VScale);
  }

  // Checks are the same for all VPlans, added to BestVPlan only for
  // compactness.
  attachRuntimeChecks(Plan&: BestVPlan, RTChecks&: ILV.RTChecks, HasBranchWeights);

  // Retrieving VectorPH now when it's easier while VPlan still has Regions.
  VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());

  VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
  VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
  VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan);
  // If the plan entry now branches straight to the scalar preheader, the
  // vector loop can never be entered: emit a remark and give up.
  if (BestVPlan.getEntry()->getSingleSuccessor() ==
      BestVPlan.getScalarPreheader()) {
    // TODO: The vector loop would be dead, should not even try to vectorize.
    ORE->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Created vector loop never executes due to insufficient trip "
                "count.";
    });
    return DenseMap<const SCEV *, Value *>();
  }

  VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);

  VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan);
  // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
  VPlanTransforms::convertEVLExitCond(Plan&: BestVPlan);
  // Regions are dissolved after optimizing for VF and UF, which completely
  // removes unneeded loop regions first.
  VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
  // Expand BranchOnTwoConds after dissolution, when latch has direct access to
  // its successors.
  VPlanTransforms::expandBranchOnTwoConds(Plan&: BestVPlan);
  // Convert loops with variable-length stepping after regions are dissolved.
  VPlanTransforms::convertToVariableLengthStep(Plan&: BestVPlan);
  VPlanTransforms::materializeBackedgeTakenCount(Plan&: BestVPlan, VectorPH);
  VPlanTransforms::materializeVectorTripCount(
      Plan&: BestVPlan, VectorPHVPBB: VectorPH, TailByMasking: CM.foldTailByMasking(),
      RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: BestVF.isVector()));
  VPlanTransforms::materializeFactors(Plan&: BestVPlan, VectorPH, VF: BestVF);
  VPlanTransforms::cse(Plan&: BestVPlan);
  VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);

  // 0. Generate SCEV-dependent code in the entry, including TripCount, before
  // making any changes to the CFG.
  DenseMap<const SCEV *, Value *> ExpandedSCEVs =
      VPlanTransforms::expandSCEVs(Plan&: BestVPlan, SE&: *PSE.getSE());
  if (!ILV.getTripCount()) {
    ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
  } else {
    assert(VectorizingEpilogue && "should only re-use the existing trip "
                                  "count during epilogue vectorization");
  }

  // Perform the actual loop transformation.
  VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
                         OrigLoop->getParentLoop(),
                         Legal->getWidestInductionType());

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

  // 1. Set up the skeleton for vectorization, including vector pre-header and
  // middle block. The vector loop is created during VPlan execution.
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  replaceVPBBWithIRVPBB(VPBB: BestVPlan.getScalarPreheader(),
                        IRBB: State.CFG.PrevBB->getSingleSuccessor(), Plan: &BestVPlan);
  VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);

  assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");

  // After vectorization, the exit blocks of the original loop will have
  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
  // looked through single-entry phis.
  ScalarEvolution &SE = *PSE.getSE();
  for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
    if (!Exit->hasPredecessors())
      continue;
    for (VPRecipeBase &PhiR : Exit->phis())
      SE.forgetLcssaPhiWithNewPredecessor(L: OrigLoop,
                                          V: &cast<VPIRPhi>(Val&: PhiR).getIRPhi());
  }
  // Forget the original loop and block dispositions.
  SE.forgetLoop(L: OrigLoop);
  SE.forgetBlockAndLoopDispositions();

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // Retrieve loop information before executing the plan, which may remove the
  // original loop, if it becomes unreachable.
  MDNode *LID = OrigLoop->getLoopID();
  unsigned OrigLoopInvocationWeight = 0;
  std::optional<unsigned> OrigAverageTripCount =
      getLoopEstimatedTripCount(L: OrigLoop, EstimatedLoopInvocationWeight: &OrigLoopInvocationWeight);

  // Generate the vectorized code by executing the plan.
  BestVPlan.execute(State: &State);

  // 2.6. Maintain Loop Hints
  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
  // Add metadata to disable runtime unrolling a scalar loop when there
  // are no runtime checks about strides and memory. A scalar loop that is
  // rarely used is not worth unrolling.
  bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
  updateLoopMetadataAndProfileInfo(
      VectorLoop: HeaderVPBB ? LI->getLoopFor(BB: State.CFG.VPBB2IRBB.lookup(Val: HeaderVPBB))
                 : nullptr,
      HeaderVPBB, Plan: BestVPlan, VectorizingEpilogue, OrigLoopID: LID, OrigAverageTripCount,
      OrigLoopInvocationWeight,
      EstimatedVFxUF: estimateElementCount(VF: BestVF * BestUF, VScale: CM.getVScaleForTuning()),
      DisableRuntimeUnroll);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();

  return ExpandedSCEVs;
}
7571
7572//===--------------------------------------------------------------------===//
7573// EpilogueVectorizerMainLoop
7574//===--------------------------------------------------------------------===//
7575
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
///
/// It emits two iteration-count checks: first the epilogue's ("iter.check"),
/// then the main loop's, so the fast path into the vector epilogue stays
/// short. Returns the preheader of the main vector loop.
BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
  BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
  // The block currently preceding the scalar preheader becomes the vector
  // preheader; the checks below are emitted in front of it.
  BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Successor 1 of the check's conditional branch is the continue
  // (non-bypass) edge, i.e. the new vector preheader.
  VectorPH = cast<BranchInst>(Val: EPI.EpilogueIterationCountCheck->getTerminator())
                 ->getSuccessor(i: 1);
  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: false);

  return cast<BranchInst>(Val: EPI.MainLoopIterationCountCheck->getTerminator())
      ->getSuccessor(i: 1);
}
7602
7603void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7604 LLVM_DEBUG({
7605 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7606 << "Main Loop VF:" << EPI.MainLoopVF
7607 << ", Main Loop UF:" << EPI.MainLoopUF
7608 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7609 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7610 });
7611}
7612
7613void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7614 DEBUG_WITH_TYPE(VerboseDebug, {
7615 dbgs() << "intermediate fn:\n"
7616 << *OrigLoop->getHeader()->getParent() << "\n";
7617 });
7618}
7619
/// Emit a minimum-iteration-count check for either the epilogue loop
/// (\p ForEpilogue == true) or the main loop, splitting a new "vector.ph"
/// block off \p VectorPH and branching to \p Bypass when the trip count is
/// too small. Returns the block that now contains the check.
BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
    BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block.");
  Value *Count = getTripCount();
  // NOTE(review): zeroed so only the plain minimum-iteration threshold is
  // checked here, presumably because profitability was decided earlier —
  // confirm against createIterationCountCheck.
  MinProfitableTripCount = ElementCount::getFixed(MinVal: 0);
  Value *CheckMinIters = createIterationCountCheck(
      VectorPH, VF: ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
      UF: ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);

  BasicBlock *const TCCheckBlock = VectorPH;
  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  VectorPH = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                        DT: static_cast<DominatorTree *>(nullptr), LI, MSSAU: nullptr,
                        BBName: "vector.ph");
  if (ForEpilogue) {
    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  } else {
    VectorPHVPBB = replaceVPBBWithIRVPBB(VPBB: VectorPHVPBB, IRBB: VectorPH);
  }

  // Replace the unconditional terminator left by SplitBlock with the
  // conditional minimum-iteration branch: taken -> bypass, not taken ->
  // vector.ph.
  BranchInst &BI = *BranchInst::Create(IfTrue: Bypass, IfFalse: VectorPH, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  // When vectorizing the main loop, its trip-count check is placed in a new
  // block, whereas the overall trip-count check is placed in the VPlan entry
  // block. When vectorizing the epilogue loop, its trip-count check is placed
  // in the VPlan entry block.
  if (!ForEpilogue)
    introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock);
  return TCCheckBlock;
}
7659
7660//===--------------------------------------------------------------------===//
7661// EpilogueVectorizerEpilogueLoop
7662//===--------------------------------------------------------------------===//
7663
/// This function creates a new scalar preheader, using the previous one as
/// entry block to the epilogue VPlan. The minimum iteration check is being
/// represented in VPlan.
///
/// Returns the original scalar preheader, now renamed
/// "vec.epilog.iter.check", which serves as the epilogue plan's entry.
BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
  BasicBlock *NewScalarPH = createScalarPreheader(Prefix: "vec.epilog.");
  BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
  OriginalScalarPH->setName("vec.epilog.iter.check");
  // Wrap the (now repurposed) original scalar preheader in a VPIRBasicBlock
  // and migrate the movable recipes from the old plan entry into it.
  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: OriginalScalarPH);
  VPBasicBlock *OldEntry = Plan.getEntry();
  for (auto &R : make_early_inc_range(Range&: *OldEntry)) {
    // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
    // defining.
    if (isa<VPIRInstruction>(Val: &R))
      continue;
    R.moveBefore(BB&: *NewEntry, I: NewEntry->end());
  }

  // Transfer OldEntry's CFG connections to NewEntry and make it the entry.
  VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
  Plan.setEntry(NewEntry);
  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.

  return OriginalScalarPH;
}
7687
7688void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7689 LLVM_DEBUG({
7690 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7691 << "Epilogue Loop VF:" << EPI.EpilogueVF
7692 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7693 });
7694}
7695
7696void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7697 DEBUG_WITH_TYPE(VerboseDebug, {
7698 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7699 });
7700}
7701
/// Try to build a widened recipe for the load/store \p VPI, clamping \p Range
/// to the VFs for which the cost model chose to widen (or interleave) the
/// access. Returns nullptr if the access will be scalarized instead; for
/// reverse accesses the result is a Reverse VPInstruction wrapping the
/// widened memory recipe.
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
                                                VFRange &Range) {
  assert((VPI->getOpcode() == Instruction::Load ||
          VPI->getOpcode() == Instruction::Store) &&
         "Must be called with either a load or store");
  Instruction *I = VPI->getUnderlyingInstr();

  // Returns true iff the cost model's widening decision for VF is anything
  // other than scalarization.
  auto WillWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
    return nullptr;

  // If a mask is not required, drop it - use unmasked version for safe loads.
  // TODO: Determine if mask is needed in VPlan.
  VPValue *Mask = Legal->isMaskRequired(I) ? VPI->getMask() : nullptr;

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive. The decision for Range.Start is representative for
  // the whole (clamped) range.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // For stores the pointer is operand 1 (operand 0 is the stored value).
  VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(N: 0)
                                                       : VPI->getOperand(N: 1);
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop: drop the GEP no-wrap flags in this case.
      // Otherwise preserve existing flags without no-unsigned-wrap, as we will
      // emit negative indices.
      GEPNoWrapFlags Flags =
          CM.foldTailByMasking() || !GEP
              ? GEPNoWrapFlags::none()
              : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
      VectorPtr = new VPVectorEndPointerRecipe(
          Ptr, &Plan.getVF(), getLoadStoreType(I),
          /*Stride*/ -1, Flags, VPI->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            VPI->getDebugLoc());
    }
    Builder.insert(R: VectorPtr);
    Ptr = VectorPtr;
  }

  if (VPI->getOpcode() == Instruction::Load) {
    auto *Load = cast<LoadInst>(Val: I);
    auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                        *VPI, Load->getDebugLoc());
    if (Reverse) {
      // Reverse accesses additionally reverse the loaded vector's lanes.
      Builder.insert(R: LoadR);
      return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
                               LoadR->getDebugLoc());
    }
    return LoadR;
  }

  StoreInst *Store = cast<StoreInst>(Val: I);
  VPValue *StoredVal = VPI->getOperand(N: 0);
  if (Reverse)
    // Reverse the lanes of the stored value before the reverse store.
    StoredVal = Builder.createNaryOp(Opcode: VPInstruction::Reverse, Operands: StoredVal,
                                     DL: Store->getDebugLoc());
  return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
                                Reverse, *VPI, Store->getDebugLoc());
}
7785
7786VPWidenIntOrFpInductionRecipe *
7787VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
7788 VFRange &Range) {
7789 auto *I = cast<TruncInst>(Val: VPI->getUnderlyingInstr());
7790 // Optimize the special case where the source is a constant integer
7791 // induction variable. Notice that we can only optimize the 'trunc' case
7792 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7793 // (c) other casts depend on pointer size.
7794
7795 // Determine whether \p K is a truncation based on an induction variable that
7796 // can be optimized.
7797 auto IsOptimizableIVTruncate =
7798 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7799 return [=](ElementCount VF) -> bool {
7800 return CM.isOptimizableIVTruncate(I: K, VF);
7801 };
7802 };
7803
7804 if (!LoopVectorizationPlanner::getDecisionAndClampRange(
7805 Predicate: IsOptimizableIVTruncate(I), Range))
7806 return nullptr;
7807
7808 auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
7809 Val: VPI->getOperand(N: 0)->getDefiningRecipe());
7810 PHINode *Phi = WidenIV->getPHINode();
7811 VPIRValue *Start = WidenIV->getStartValue();
7812 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
7813
7814 // It is always safe to copy over the NoWrap and FastMath flags. In
7815 // particular, when folding tail by masking, the masked-off lanes are never
7816 // used, so it is safe.
7817 VPIRFlags Flags = vputils::getFlagsFromIndDesc(ID: IndDesc);
7818 VPValue *Step =
7819 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());
7820 return new VPWidenIntOrFpInductionRecipe(
7821 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
7822}
7823
/// Try to widen the call \p VPI, either as a vector intrinsic or as a call to
/// a vector library variant, clamping \p Range accordingly. Returns nullptr
/// if the call must be predicated/scalarized or no vector form exists.
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
                                                   VFRange &Range) {
  CallInst *CI = cast<CallInst>(Val: VPI->getUnderlyingInstr());
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  // These intrinsics are not widened; they are handled elsewhere or dropped.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Only the actual call arguments; trailing VPI operands (e.g. mask, callee)
  // are handled separately below.
  SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
                                VPI->op_begin() + CI->arg_size());

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
                                      VPI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The call needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the call instruction, but the only
      //      available vector variant at this VF requires a mask, so we
      //      synthesize an all-true mask.
      VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();

      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    // Append the callee operand (last operand before the mask, if any).
    Ops.push_back(Elt: VPI->getOperand(N: VPI->getNumOperandsWithoutMask() - 1));
    return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  return nullptr;
}
7910
7911bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7912 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7913 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7914 // Instruction should be widened, unless it is scalar after vectorization,
7915 // scalarization is profitable or it is predicated.
7916 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7917 return CM.isScalarAfterVectorization(I, VF) ||
7918 CM.isProfitableToScalarize(I, VF) ||
7919 CM.isScalarWithPredication(I, VF);
7920 };
7921 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7922 Range);
7923}
7924
/// Widen \p VPI into a VPWidenRecipe if its opcode is supported, returning
/// nullptr otherwise. Predicated integer div/rem get their divisor replaced
/// by a select that substitutes 1 in masked-off lanes, so the widened
/// operation cannot trap on those lanes.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
  auto *I = VPI->getUnderlyingInstr();
  switch (VPI->getOpcode()) {
  default:
    return nullptr;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before widening the
    // div/rem operation itself. Otherwise fall through to general handling below.
    if (CM.isPredicatedInst(I)) {
      SmallVector<VPValue *> Ops(VPI->operandsWithoutMask());
      VPValue *Mask = VPI->getMask();
      // Masked-off lanes divide by 1 instead of the original divisor.
      VPValue *One = Plan.getConstantInt(Ty: I->getType(), Val: 1u);
      auto *SafeRHS =
          Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: VPI->getDebugLoc());
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze:
    return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
                             VPI->getDebugLoc());
  case Instruction::ExtractValue: {
    // The single extractvalue index is materialized as an extra constant
    // operand of the widened recipe.
    SmallVector<VPValue *> NewOps(VPI->operandsWithoutMask());
    auto *EVI = cast<ExtractValueInst>(Val: I);
    assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
    unsigned Idx = EVI->getIndices()[0];
    NewOps.push_back(Elt: Plan.getConstantInt(BitWidth: 32, Val: Idx));
    return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
  }
  };
}
7978
7979VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7980 VPInstruction *VPI) {
7981 // FIXME: Support other operations.
7982 unsigned Opcode = HI->Update->getOpcode();
7983 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7984 "Histogram update operation must be an Add or Sub");
7985
7986 SmallVector<VPValue *, 3> HGramOps;
7987 // Bucket address.
7988 HGramOps.push_back(Elt: VPI->getOperand(N: 1));
7989 // Increment value.
7990 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
7991
7992 // In case of predicated execution (due to tail-folding, or conditional
7993 // execution, or both), pass the relevant mask.
7994 if (Legal->isMaskRequired(I: HI->Store))
7995 HGramOps.push_back(Elt: VPI->getMask());
7996
7997 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7998}
7999
/// Build a VPReplicateRecipe for \p VPI, which will be executed per lane.
/// The recipe is marked uniform when the cost model (or the special-case
/// intrinsic handling below) determines one scalar copy per vector iteration
/// suffices, and carries a block-in mask when the instruction is predicated.
VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
                                                      VFRange &Range) {
  auto *I = VPI->getUnderlyingInstr();
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still be better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = VPI->getMask();
  }

  // Note that there is some custom logic to mark some intrinsics as uniform
  // manually above for scalable vectors, which this assert needs to account for
  // as well.
  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
         "Should not predicate a uniform recipe");
  auto *Recipe =
      new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
                            BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
  return Recipe;
}
8063
/// Try to convert the non-phi recipe \p R (a VPInstruction) into a widened
/// recipe for VFs in \p Range, dispatching to the specialized builders for
/// truncates of inductions, calls, histograms, memory operations, GEPs and
/// casts. Returns nullptr if the instruction should be replicated instead or
/// the range was clamped to scalar VFs.
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
                                              VFRange &Range) {
  assert(!R->isPhi() && "phis must be handled earlier");
  // First, check for specific widening recipes that deal with optimizing
  // truncates, calls and memory operations.

  VPRecipeBase *Recipe;
  auto *VPI = cast<VPInstruction>(Val: R);
  if (VPI->getOpcode() == Instruction::Trunc &&
      (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::Call)
    return tryToWidenCall(VPI, Range);

  Instruction *Instr = R->getUnderlyingInstr();
  // Stores that form a recognized histogram pattern get a dedicated recipe.
  if (VPI->getOpcode() == Instruction::Store)
    if (auto HistInfo = Legal->getHistogramInfo(I: cast<StoreInst>(Val: Instr)))
      return tryToWidenHistogram(HI: *HistInfo, VPI);

  if (VPI->getOpcode() == Instruction::Load ||
      VPI->getOpcode() == Instruction::Store)
    return tryToWidenMemory(VPI, Range);

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::GetElementPtr)
    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Val: Instr),
                                VPI->operandsWithoutMask(), *VPI,
                                VPI->getDebugLoc());

  if (Instruction::isCast(Opcode: VPI->getOpcode())) {
    auto *CI = cast<CastInst>(Val: Instr);
    auto *CastR = cast<VPInstructionWithType>(Val: VPI);
    return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(N: 0),
                                 CastR->getResultType(), CI, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  // Fall back to the generic opcode-based widening.
  return tryToWiden(VPI);
}
8112
/// Build and collect VPlan candidates for every VF sub-range in
/// [MinVF, MaxVF]: a common initial VPlan0 is constructed once, duplicated
/// per sub-range, specialized via tryToBuildVPlanWithVPRecipes and then
/// optimized before being added to VPlans.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  // Empty VF range; nothing to build.
  if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Set up LoopVersioning so noalias metadata derived from the runtime
  // memory checks can be attached while building VPlan0 below.
  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

  // Create initial base VPlan0, to serve as common starting point for all
  // candidates built later for specific VF ranges.
  auto VPlan0 = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE, LVer: &LVer);

  // Create recipes for header phis.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *VPlan0, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: Legal->getReductionVars(), FixedOrderRecurrences: Legal->getFixedOrderRecurrences(),
      InLoopReductions: CM.getInLoopReductions(), AllowReordering: Hints.allowReordering());

  VPlanTransforms::simplifyRecipes(Plan&: *VPlan0);

  // Iterate over disjoint sub-ranges of [MinVF, MaxVF * 2). Building a plan
  // clamps SubRange.End, which then becomes the start of the next sub-range.
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
      // Now optimize the initial VPlan.
      VPlanTransforms::hoistPredicatedLoads(Plan&: *Plan, PSE, L: OrigLoop);
      VPlanTransforms::sinkPredicatedStores(Plan&: *Plan, PSE, L: OrigLoop);
      RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
                     CM.getMinimalBitwidths());
      RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
      // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
      if (CM.foldTailWithEVL()) {
        RUN_VPLAN_PASS(VPlanTransforms::addExplicitVectorLength, *Plan,
                       CM.getMaxSafeElements());
        RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
      }

      // narrowInterleaveGroups may produce an additional, narrowed plan; if
      // so it is collected as a candidate alongside the original one.
      if (auto P = VPlanTransforms::narrowInterleaveGroups(Plan&: *Plan, TTI))
        VPlans.push_back(Elt: std::move(P));

      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(Elt: std::move(Plan));
    }
    VF = SubRange.End;
  }
}
8171
/// Try to build a VPlan for \p Range from the initial \p Plan (a duplicate of
/// VPlan0): predicate and linearize the loop body, replace the original
/// scalar VPInstructions with widened/replicated recipes, and apply the
/// transforms recording earlier legality and cost-model decisions. \p Range
/// may be clamped to the sub-range over which the decisions taken here are
/// uniform. Returns nullptr if a mandatory transformation cannot be applied
/// for this range.
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
    VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {

  using namespace llvm::VPlanPatternMatch;
  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  // Decide, uniformly across the (clamped) Range, whether the middle block
  // needs a check branching to the scalar epilogue.
  bool RequiresScalarEpilogueCheck =
      LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [this](ElementCount VF) {
            return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
          },
          Range);
  VPlanTransforms::handleEarlyExits(Plan&: *Plan, HasUncountableExit: Legal->hasUncountableEarlyExit());
  VPlanTransforms::addMiddleCheck(Plan&: *Plan, RequiresScalarEpilogueCheck,
                                  TailFolded: CM.foldTailByMasking());

  VPlanTransforms::createLoopRegions(Plan&: *Plan);

  // Don't use getDecisionAndClampRange here, because we don't know the UF
  // so this function is better to be conservative, rather than to split
  // it up into different VPlans.
  // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
  bool IVUpdateMayOverflow = false;
  for (ElementCount VF : Range)
    IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);

  TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
  // Use NUW for the induction increment if we proved that it won't overflow in
  // the vector loop or when not folding the tail. In the later case, we know
  // that the canonical induction increment will not overflow as the vector trip
  // count is >= increment and a multiple of the increment.
  VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
  bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
  if (!HasNUW) {
    // The increment feeds the exiting branch condition (operand 0 of the
    // terminator); drop its poison-generating flags.
    auto *IVInc =
        LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(N: 0);
    assert(match(IVInc,
                 m_VPInstruction<Instruction::Add>(
                     m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
           "Did not find the canonical IV increment");
    cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
  }

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto ApplyIG = [IG, this](ElementCount VF) -> bool {
      bool Result = (VF.isVector() && // Query is illegal for VF == 1
                     CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
                         LoopVectorizationCostModel::CM_Interleave);
      // For scalable vectors, the interleave factors must be <= 8 since we
      // require the (de)interleaveN intrinsics instead of shufflevectors.
      assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
             "Unsupported interleave factor for scalable vectors");
      return Result;
    };
    if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
      continue;
    InterleaveGroups.insert(Ptr: IG);
  }

  // ---------------------------------------------------------------------------
  // Predicate and linearize the top-level loop region.
  // ---------------------------------------------------------------------------
  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize, *Plan,
                           CM.foldTailByMasking());

  // ---------------------------------------------------------------------------
  // Construct wide recipes and apply predication for original scalar
  // VPInstructions in the loop.
  // ---------------------------------------------------------------------------
  VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
  ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
      HeaderVPBB);

  // Insertion point in the middle block for recipes created for final
  // invariant reduction stores below.
  auto *MiddleVPBB = Plan->getMiddleBlock();
  VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();

  // Collect blocks that need predication for in-loop reduction recipes.
  DenseSet<BasicBlock *> BlocksNeedingPredication;
  for (BasicBlock *BB : OrigLoop->blocks())
    if (CM.blockNeedsPredicationForAnyReason(BB))
      BlocksNeedingPredication.insert(V: BB);

  VPlanTransforms::createInLoopReductionRecipes(Plan&: *Plan, BlocksNeedingPredication,
                                                MinVF: Range.Start);

  // Now process all other blocks and instructions.
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
    // Convert input VPInstructions to widened recipes.
    for (VPRecipeBase &R : make_early_inc_range(
             Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end()))) {
      // Skip recipes that do not need transforming.
      if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(Val: &R))
        continue;
      auto *VPI = cast<VPInstruction>(Val: &R);
      // Recipes without an underlying IR instruction were introduced by
      // earlier transforms and need no widening here.
      if (!VPI->getUnderlyingValue())
        continue;

      // TODO: Gradually replace uses of underlying instruction by analyses on
      // VPlan. Migrate code relying on the underlying instruction from VPlan0
      // to construct recipes below to not use the underlying instruction.
      Instruction *Instr = cast<Instruction>(Val: VPI->getUnderlyingValue());
      Builder.setInsertPoint(VPI);

      // The stores with invariant address inside the loop will be deleted, and
      // in the exit block, a uniform store recipe will be created for the final
      // invariant store of the reduction.
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
          Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
        // Only create recipe for the final invariant store of the reduction.
        if (Legal->isInvariantStoreOfReduction(SI)) {
          auto *Recipe = new VPReplicateRecipe(
              SI, VPI->operandsWithoutMask(), true /* IsUniform */,
              nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
          Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
        }
        R.eraseFromParent();
        continue;
      }

      // Prefer a widened recipe; fall back to replication (scalarization)
      // when no widening recipe can be created for this Range.
      VPRecipeBase *Recipe =
          RecipeBuilder.tryToCreateWidenNonPhiRecipe(R: VPI, Range);
      if (!Recipe)
        Recipe =
            RecipeBuilder.handleReplication(VPI: cast<VPInstruction>(Val: VPI), Range);

      RecipeBuilder.setRecipe(I: Instr, R: Recipe);
      if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
        // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
        // moved to the phi section in the header.
        Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
      } else {
        Builder.insert(R: Recipe);
      }
      if (Recipe->getNumDefinedValues() == 1) {
        VPI->replaceAllUsesWith(New: Recipe->getVPSingleValue());
      } else {
        assert(Recipe->getNumDefinedValues() == 0 &&
               "Unexpected multidef recipe");
      }
      // The original VPInstruction has been fully replaced by Recipe.
      R.eraseFromParent();
    }
  }

  assert(isa<VPRegionBlock>(LoopRegion) &&
         !LoopRegion->getEntryBasicBlock()->empty() &&
         "entry block must be set to a VPRegionBlock having a non-empty entry "
         "VPBasicBlock");

  // TODO: We can't call runPass on these transforms yet, due to verifier
  // failures.
  VPlanTransforms::addExitUsersForFirstOrderRecurrences(Plan&: *Plan, Range);
  // Maps induction phis to their end values, for later use when optimizing
  // induction exit users below.
  DenseMap<VPValue *, VPValue *> IVEndValues;
  VPlanTransforms::updateScalarResumePhis(Plan&: *Plan, IVEndValues);

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  addReductionResultComputation(Plan, RecipeBuilder, MinVF: Range.Start);

  // Optimize FindIV reductions to use sentinel-based approach when possible.
  RUN_VPLAN_PASS(VPlanTransforms::optimizeFindIVReductions, *Plan, PSE,
                 *OrigLoop);

  // Apply mandatory transformation to handle reductions with multiple in-loop
  // uses if possible, bail out otherwise.
  if (!RUN_VPLAN_PASS(VPlanTransforms::handleMultiUseReductions, *Plan, ORE,
                      OrigLoop))
    return nullptr;
  // Apply mandatory transformation to handle FP maxnum/minnum reduction with
  // NaNs if possible, bail out otherwise.
  if (!RUN_VPLAN_PASS(VPlanTransforms::handleMaxMinNumReductions, *Plan))
    return nullptr;

  // Create whole-vector selects for find-last recurrences.
  if (!RUN_VPLAN_PASS(VPlanTransforms::handleFindLastReductions, *Plan))
    return nullptr;

  // Create partial reduction recipes for scaled reductions and transform
  // recipes to abstract recipes if it is legal and beneficial and clamp the
  // range for better cost estimation.
  // TODO: Enable following transform when the EVL-version of extended-reduction
  // and mulacc-reduction are implemented.
  if (!CM.foldTailWithEVL()) {
    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
                          OrigLoop);
    RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
                   Range);
    RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
                   Range);
  }

  // Record all VFs this plan is a candidate for.
  for (ElementCount VF : Range)
    Plan->addVF(VF);
  Plan->setName("Initial VPlan");

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
                 InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());

  // Replace VPValues for known constant strides.
  RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
                 Legal->getLAI()->getSymbolicStrides());

  auto BlockNeedsPredication = [this](BasicBlock *BB) {
    return Legal->blockNeedsPredication(BB);
  };
  RUN_VPLAN_PASS(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
                 BlockNeedsPredication);

  // Sink users of fixed-order recurrence past the recipe defining the previous
  // value and introduce FirstOrderRecurrenceSplice VPInstructions.
  if (!RUN_VPLAN_PASS(VPlanTransforms::adjustFixedOrderRecurrences, *Plan,
                      Builder))
    return nullptr;

  if (useActiveLaneMask(Style)) {
    // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
    // TailFoldingStyle is visible there.
    bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
    bool WithoutRuntimeCheck =
        Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
    VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow,
                                       DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck);
  }
  VPlanTransforms::optimizeInductionExitUsers(Plan&: *Plan, EndValues&: IVEndValues, PSE);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
8424
/// Build a VPlan for \p Range on the VPlan-native (outer-loop) path.
/// Returns nullptr if the loop's VPInstructions cannot all be converted to
/// VPlan recipes.
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  auto Plan = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE);

  // Only induction phis are modeled on this path: reductions, fixed-order
  // recurrences and in-loop reductions are passed as empty sets.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *Plan, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: MapVector<PHINode *, RecurrenceDescriptor>(),
      FixedOrderRecurrences: SmallPtrSet<const PHINode *, 1>(), InLoopReductions: SmallPtrSet<PHINode *, 1>(),
      /*AllowReordering=*/false);
  // The native path assumes no uncountable early exits, always requires a
  // scalar epilogue and never folds the tail.
  VPlanTransforms::handleEarlyExits(Plan&: *Plan,
                                    /*HasUncountableExit*/ false);
  VPlanTransforms::addMiddleCheck(Plan&: *Plan, /*RequiresScalarEpilogue*/ RequiresScalarEpilogueCheck: true,
                                  /*TailFolded*/ false);

  VPlanTransforms::createLoopRegions(Plan&: *Plan);

  // Record all VFs this plan is a candidate for.
  for (ElementCount VF : Range)
    Plan->addVF(VF);

  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(Plan&: *Plan, TLI: *TLI))
    return nullptr;

  // TODO: IVEndValues are not used yet in the native path, to optimize exit
  // values.
  // TODO: We can't call runPass on the transform yet, due to verifier
  // failures.
  DenseMap<VPValue *, VPValue *> IVEndValues;
  VPlanTransforms::updateScalarResumePhis(Plan&: *Plan, IVEndValues);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
8465
/// For each reduction phi in the vector loop region, create the recipe
/// computing the final reduction result in the middle block
/// (ComputeReductionResult, or ComputeAnyOfResult for AnyOf reductions),
/// introduce tail-folding selects in the latch when needed, connect users
/// outside the vector region to the result, and adjust AnyOf / truncated
/// reductions. \p MinVF is the smallest VF of the plan's range, used to
/// decide whether truncation of the reduction type applies.
void LoopVectorizationPlanner::addReductionResultComputation(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
  using namespace VPlanPatternMatch;
  VPTypeAnalysis TypeInfo(*Plan);
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
  SmallVector<VPRecipeBase *> ToDelete;
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  // Insert new latch recipes (e.g. tail-folding selects) right before the
  // last two recipes of the exiting block.
  Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    // TODO: Remove check for constant incoming value once removeDeadRecipes is
    // used on VPlan0.
    if (!PhiR || isa<VPIRValue>(Val: PhiR->getOperand(N: 1)))
      continue;

    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
    Type *PhiTy = TypeInfo.inferScalarType(V: PhiR);
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
    auto *RR = dyn_cast<VPReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe());
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        (!RR || !RR->isPartialReduction())) {
      VPValue *Cond = vputils::findHeaderMask(Plan&: *Plan);
      // Select the exiting value where the header mask is active, the phi's
      // value otherwise (i.e. in masked-off tail lanes).
      NewExitingVPV =
          Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", Flags: *PhiR);
      // Only the reduction-result computations should see the selected
      // value at this point.
      OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
        using namespace VPlanPatternMatch;
        return match(
            U: &U, P: m_CombineOr(
                     L: m_VPInstruction<VPInstruction::ComputeAnyOfResult>(),
                     R: m_VPInstruction<VPInstruction::ComputeReductionResult>()));
      });
      // Feed the select back into the phi when predicated reduction selects
      // are preferred.
      if (CM.usePredicatedReductionSelect())
        PhiR->setOperand(I: 1, New: NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    // For AnyOf reductions, find the select among PhiR's users. This is used
    // both to find NewVal for ComputeAnyOfResult and to adjust the reduction.
    VPRecipeBase *AnyOfSelect = nullptr;
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      AnyOfSelect = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
        return match(U, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()));
      }));
    }
    if (AnyOfSelect) {
      VPValue *Start = PhiR->getStartValue();
      // NewVal is the non-phi operand of the select.
      VPValue *NewVal = AnyOfSelect->getOperand(N: 1) == PhiR
                            ? AnyOfSelect->getOperand(N: 2)
                            : AnyOfSelect->getOperand(N: 1);
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                               Operands: {Start, NewVal, NewExitingVPV}, DL: ExitDL);
    } else {
      // Encode the recurrence kind and phi properties in the flags of the
      // ComputeReductionResult instruction.
      VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
                      PhiR->getFastMathFlags());
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                               Operands: {NewExitingVPV}, Flags, DL: ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      VPWidenCastRecipe *Trunc;
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      VPWidenCastRecipe *Extnd;
      {
        // Emit trunc/extend right after the recipe defining the exiting
        // value, restoring the previous insert point afterwards.
        VPBuilder::InsertPointGuard Guard(Builder);
        Builder.setInsertPoint(
            TheBB: NewExitingVPV->getDefiningRecipe()->getParent(),
            IP: std::next(x: NewExitingVPV->getDefiningRecipe()->getIterator()));
        Trunc =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op: NewExitingVPV, ResultTy: RdxTy);
        Extnd = Builder.createWidenCast(Opcode: ExtendOpc, Op: Trunc, ResultTy: PhiTy);
      }
      if (PhiR->getOperand(N: 1) == NewExitingVPV)
        PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result. Operand 0 provides the values to be reduced.
      FinalReductionResult->setOperand(I: 0, New: Trunc);
      FinalReductionResult =
          Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
    }

    // Update all users outside the vector region. Also replace redundant
    // extracts.
    for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
      // Skip the result computation itself and any user still inside a
      // region (i.e. inside the vector loop).
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      // Skip FindIV reduction chain recipes (ComputeReductionResult, icmp).
      if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind) &&
          match(U, P: m_CombineOr(
                       L: m_VPInstruction<VPInstruction::ComputeReductionResult>(),
                       R: m_VPInstruction<Instruction::ICmp>())))
        continue;
      U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);

      // Look through ExtractLastPart.
      if (match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
        U = cast<VPInstruction>(Val: U)->getSingleUser();

      // Lane extracts of the exiting value are redundant: the computed
      // reduction result already is the final scalar value.
      if (match(U, P: m_CombineOr(L: m_ExtractLane(Op0: m_VPValue(), Op1: m_VPValue()),
                               R: m_ExtractLastLane(Op0: m_VPValue()))))
        cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
    if (AnyOfSelect) {
      VPValue *Cmp = AnyOfSelect->getOperand(N: 0);
      // If the compare is checking the reduction PHI node, adjust it to check
      // the start value.
      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
        CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
      Builder.setInsertPoint(AnyOfSelect);

      // If the true value of the select is the reduction phi, the new value is
      // selected if the negated condition is true in any iteration.
      if (AnyOfSelect->getOperand(N: 1) == PhiR)
        Cmp = Builder.createNot(Operand: Cmp);
      VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
      AnyOfSelect->getVPSingleValue()->replaceAllUsesWith(New: Or);
      // Delete AnyOfSelect now that it has invalid types.
      ToDelete.push_back(Elt: AnyOfSelect);

      // Convert the reduction phi to operate on bools.
      PhiR->setOperand(I: 0, New: Plan->getFalse());
      continue;
    }

    // For plain reductions, start the vector phi from the identity value;
    // the original start value is folded in via ReductionStartVector.
    RecurKind RK = PhiR->getRecurrenceKind();
    if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RK))) {
      VPBuilder PHBuilder(Plan->getVectorPreheader());
      VPValue *Iden = Plan->getOrAddLiveIn(
          V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: PhiR->getFastMathFlags()));
      auto *ScaleFactorVPV = Plan->getConstantInt(BitWidth: 32, Val: 1);
      VPValue *StartV = PHBuilder.createNaryOp(
          Opcode: VPInstruction::ReductionStartVector,
          Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV}, Flags: *PhiR);
      PhiR->setOperand(I: 0, New: StartV);
    }
  }
  // Erase recipes deferred above (deleting them in-loop would invalidate
  // the phi iteration).
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();

  RUN_VPLAN_PASS(VPlanTransforms::clearReductionWrapFlags, *Plan);
}
8654
8655void LoopVectorizationPlanner::attachRuntimeChecks(
8656 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8657 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8658 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(N: 0)) {
8659 assert((!CM.OptForSize ||
8660 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8661 "Cannot SCEV check stride or overflow when optimizing for size");
8662 VPlanTransforms::attachCheckBlock(Plan, Cond: SCEVCheckCond, CheckBlock: SCEVCheckBlock,
8663 AddBranchWeights: HasBranchWeights);
8664 }
8665 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8666 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(N: 0)) {
8667 // VPlan-native path does not do any analysis for runtime checks
8668 // currently.
8669 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8670 "Runtime checks are not supported for outer loops yet");
8671
8672 if (CM.OptForSize) {
8673 assert(
8674 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8675 "Cannot emit memory checks when optimizing for size, unless forced "
8676 "to vectorize.");
8677 ORE->emit(RemarkBuilder: [&]() {
8678 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8679 OrigLoop->getStartLoc(),
8680 OrigLoop->getHeader())
8681 << "Code-size may be reduced by not forcing "
8682 "vectorization, or by source-code modifications "
8683 "eliminating the need for runtime checks "
8684 "(e.g., adding 'restrict').";
8685 });
8686 }
8687 VPlanTransforms::attachCheckBlock(Plan, Cond: MemCheckCond, CheckBlock: MemCheckBlock,
8688 AddBranchWeights: HasBranchWeights);
8689 }
8690}
8691
8692void LoopVectorizationPlanner::addMinimumIterationCheck(
8693 VPlan &Plan, ElementCount VF, unsigned UF,
8694 ElementCount MinProfitableTripCount) const {
8695 // vscale is not necessarily a power-of-2, which means we cannot guarantee
8696 // an overflow to zero when updating induction variables and so an
8697 // additional overflow check is required before entering the vector loop.
8698 bool IsIndvarOverflowCheckNeededForVF =
8699 VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
8700 !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF, UF) &&
8701 CM.getTailFoldingStyle() !=
8702 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8703 const uint32_t *BranchWeigths =
8704 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())
8705 ? &MinItersBypassWeights[0]
8706 : nullptr;
8707 VPlanTransforms::addMinimumIterationCheck(
8708 Plan, VF, UF, MinProfitableTripCount,
8709 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()), TailFolded: CM.foldTailByMasking(),
8710 CheckNeededWithTailFolding: IsIndvarOverflowCheckNeededForVF, OrigLoop, MinItersBypassWeights: BranchWeigths,
8711 DL: OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
8712}
8713
8714// Determine how to lower the scalar epilogue, which depends on 1) optimising
8715// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8716// predication, and 4) a TTI hook that analyses whether the loop is suitable
8717// for predication.
8718static ScalarEpilogueLowering getScalarEpilogueLowering(
8719 Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
8720 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8721 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
8722 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8723 // don't look at hints or options, and don't request a scalar epilogue.
8724 if (F->hasOptSize() ||
8725 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8726 return CM_ScalarEpilogueNotAllowedOptSize;
8727
8728 // 2) If set, obey the directives
8729 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8730 switch (PreferPredicateOverEpilogue) {
8731 case PreferPredicateTy::ScalarEpilogue:
8732 return CM_ScalarEpilogueAllowed;
8733 case PreferPredicateTy::PredicateElseScalarEpilogue:
8734 return CM_ScalarEpilogueNotNeededUsePredicate;
8735 case PreferPredicateTy::PredicateOrDontVectorize:
8736 return CM_ScalarEpilogueNotAllowedUsePredicate;
8737 };
8738 }
8739
8740 // 3) If set, obey the hints
8741 switch (Hints.getPredicate()) {
8742 case LoopVectorizeHints::FK_Enabled:
8743 return CM_ScalarEpilogueNotNeededUsePredicate;
8744 case LoopVectorizeHints::FK_Disabled:
8745 return CM_ScalarEpilogueAllowed;
8746 };
8747
8748 // 4) if the TTI hook indicates this is profitable, request predication.
8749 TailFoldingInfo TFI(TLI, &LVL, IAI);
8750 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
8751 return CM_ScalarEpilogueNotNeededUsePredicate;
8752
8753 return CM_ScalarEpilogueAllowed;
8754}
8755
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR. Returns true if the outer loop \p L was vectorized.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE,
    std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
    LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {

  // A computable backedge-taken count is required to vectorize the outer loop.
  if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL&: *LVL, IAI: &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                GetBFI, F, &Hints, IAI, OptForSize);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);

  // Generate the vector code for the chosen VF with interleave count 1.
  {
    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                           Checks, BestPlan);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, /*UF=*/1,
                                 MinProfitableTripCount: VF.MinProfitableTripCount);

    LVP.executePlan(BestVF: VF.Width, /*UF=*/BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
  }

  // Report the successful vectorization to the user.
  reportVectorization(ORE, TheLoop: L, VF, IC: 1);

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
8820
8821// Emit a remark if there are stores to floats that required a floating point
8822// extension. If the vectorized loop was generated with floating point there
8823// will be a performance penalty from the conversion overhead and the change in
8824// the vector width.
8825static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
8826 SmallVector<Instruction *, 4> Worklist;
8827 for (BasicBlock *BB : L->getBlocks()) {
8828 for (Instruction &Inst : *BB) {
8829 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
8830 if (S->getValueOperand()->getType()->isFloatTy())
8831 Worklist.push_back(Elt: S);
8832 }
8833 }
8834 }
8835
8836 // Traverse the floating point stores upwards searching, for floating point
8837 // conversions.
8838 SmallPtrSet<const Instruction *, 4> Visited;
8839 SmallPtrSet<const Instruction *, 4> EmittedRemark;
8840 while (!Worklist.empty()) {
8841 auto *I = Worklist.pop_back_val();
8842 if (!L->contains(Inst: I))
8843 continue;
8844 if (!Visited.insert(Ptr: I).second)
8845 continue;
8846
8847 // Emit a remark if the floating point store required a floating
8848 // point conversion.
8849 // TODO: More work could be done to identify the root cause such as a
8850 // constant or a function return type and point the user to it.
8851 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
8852 ORE->emit(RemarkBuilder: [&]() {
8853 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
8854 I->getDebugLoc(), L->getHeader())
8855 << "floating point conversion changes vector width. "
8856 << "Mixed floating point precision requires an up/down "
8857 << "cast that will negatively impact performance.";
8858 });
8859
8860 for (Use &Op : I->operands())
8861 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
8862 Worklist.push_back(Elt: OpI);
8863 }
8864}
8865
8866/// For loops with uncountable early exits, find the cost of doing work when
8867/// exiting the loop early, such as calculating the final exit values of
8868/// variables used outside the loop.
8869/// TODO: This is currently overly pessimistic because the loop may not take
8870/// the early exit, but better to keep this conservative for now. In future,
8871/// it might be possible to relax this by using branch probabilities.
8872static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
8873 VPlan &Plan, ElementCount VF) {
8874 InstructionCost Cost = 0;
8875 for (auto *ExitVPBB : Plan.getExitBlocks()) {
8876 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
8877 // If the predecessor is not the middle.block, then it must be the
8878 // vector.early.exit block, which may contain work to calculate the exit
8879 // values of variables used outside the loop.
8880 if (PredVPBB != Plan.getMiddleBlock()) {
8881 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
8882 << PredVPBB->getName() << ":\n");
8883 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
8884 }
8885 }
8886 }
8887 return Cost;
8888}
8889
/// This function determines whether or not it's still profitable to vectorize
/// the loop given the extra work we have to do outside of the loop:
///  1. Perform the runtime checks before entering the loop to ensure it's safe
///     to vectorize.
///  2. In the case of loops with uncountable early exits, we may have to do
///     extra work when exiting the loop early, such as calculating the final
///     exit values of variables used outside the loop.
///  3. The middle block.
/// As a side effect, sets \p VF.MinProfitableTripCount to the smallest trip
/// count for which the outside-loop work pays off.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
                                        VectorizationFactor &VF, Loop *L,
                                        PredicatedScalarEvolution &PSE,
                                        VPCostContext &CostCtx, VPlan &Plan,
                                        ScalarEpilogueLowering SEL,
                                        std::optional<unsigned> VScale) {
  InstructionCost RtC = Checks.getCost();
  // An invalid runtime-check cost means we cannot reason about profitability.
  if (!RtC.isValid())
    return false;

  // When interleaving only scalar and vector cost will be equal, which in turn
  // would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    // TODO: Should we rename VectorizeMemoryCheckThreshold?
    if (RtC > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  InstructionCost TotalCost = RtC;
  // Add on the cost of any work required in the vector early exit block, if
  // one exists.
  TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
  TotalCost += Plan.getMiddleBlock()->cost(VF: VF.Width, Ctx&: CostCtx);

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  //  The total cost of the scalar loop is
  //   ScalarC * TC
  //  where
  //  * TC is the actual trip count of the loop.
  //  * ScalarC is the cost of a single scalar iteration.
  //
  //  The total cost of the vector loop is
  //    TotalCost + VecC * (TC / VF) + EpiC
  //  where
  //  * TotalCost is the sum of the costs of
  //     - the generated runtime checks, i.e. RtC
  //     - performing any additional work in the vector.early.exit block for
  //       loops with uncountable early exits.
  //     - the middle block, if ExpectedTC <= VF.Width.
  //  * VecC is the cost of a single vector iteration.
  //  * TC is the actual trip count of the loop
  //  *  VF is the vectorization factor
  //  * EpiCost is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   TotalCost + VecC * (TC / VF) + EpiC <  ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = estimateElementCount(VF: VF.Width, VScale);
  uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
  uint64_t MinTC1 =
      Div == 0 ? 0 : divideCeil(Numerator: TotalCost.getValue() * IntVF, Denominator: Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  uint64_t MinTC2 = divideCeil(Numerator: RtC.getValue() * 10, Denominator: ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(Value: MinTC, Align: IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
    if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}
9005
// If interleaving (resp. vectorization) is globally disabled via the
// EnableLoopInterleaving (resp. EnableLoopVectorization) flag, only perform it
// when explicitly forced, regardless of the pass options.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
9011
/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
/// don't have a corresponding wide induction in \p EpiPlan. Also freezes
/// possibly-poison FindFirstIV/FindLastIV start values in both plans and
/// ensures \p MainPlan has a resume phi for the canonical induction that the
/// epilogue loop can pick up.
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
  // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
  // will need their resume-values computed in the main vector loop. Others
  // can be removed from the main VPlan.
  SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
  for (VPRecipeBase &R :
       EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The canonical IV is handled separately below.
    if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
      continue;
    EpiWidenedPhis.insert(
        Ptr: cast<PHINode>(Val: R.getVPSingleValue()->getUnderlyingValue()));
  }
  for (VPRecipeBase &R :
       make_early_inc_range(Range: MainPlan.getScalarHeader()->phis())) {
    auto *VPIRInst = cast<VPIRPhi>(Val: &R);
    if (EpiWidenedPhis.contains(Ptr: &VPIRInst->getIRPhi()))
      continue;
    // There is no corresponding wide induction in the epilogue plan that would
    // need a resume value. Remove the VPIRInst wrapping the scalar header phi
    // together with the corresponding ResumePhi. The resume values for the
    // scalar loop will be created during execution of EpiPlan.
    VPRecipeBase *ResumePhi = VPIRInst->getOperand(N: 0)->getDefiningRecipe();
    VPIRInst->eraseFromParent();
    ResumePhi->eraseFromParent();
  }
  RUN_VPLAN_PASS(VPlanTransforms::removeDeadRecipes, MainPlan);

  using namespace VPlanPatternMatch;
  // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
  // introduce multiple uses of undef/poison. If the reduction start value may
  // be undef or poison it needs to be frozen and the frozen start has to be
  // used when computing the reduction result. We also need to use the frozen
  // value in the resume phi generated by the main vector loop, as this is also
  // used to compute the reduction result after the epilogue vector loop.
  auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
                                             bool UpdateResumePhis) {
    VPBuilder Builder(Plan.getEntry());
    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
      auto *VPI = dyn_cast<VPInstruction>(Val: &R);
      if (!VPI)
        continue;
      VPValue *OrigStart;
      if (!matchFindIVResult(VPI, ReducedIV: m_VPValue(), Start: m_VPValue(V&: OrigStart)))
        continue;
      // No freeze needed if the start value is known not undef/poison.
      if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
        continue;
      VPInstruction *Freeze =
          Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, DL: {}, Name: "fr");
      VPI->setOperand(I: 2, New: Freeze);
      if (UpdateResumePhis)
        OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
          return Freeze != &U && isa<VPPhi>(Val: &U);
        });
    }
  };
  AddFreezeForFindLastIVReductions(MainPlan, true);
  AddFreezeForFindLastIVReductions(EpiPlan, false);

  // Find the vector trip count from the branch-on-count terminator of the main
  // vector loop region.
  VPValue *VectorTC = nullptr;
  auto *Term =
      MainPlan.getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  [[maybe_unused]] bool MatchedTC =
      match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue(V&: VectorTC)));
  assert(MatchedTC && "must match vector trip count");

  // If there is a suitable resume value for the canonical induction in the
  // scalar (which will become vector) epilogue loop, use it and move it to the
  // beginning of the scalar preheader. Otherwise create it below.
  VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
  auto ResumePhiIter =
      find_if(Range: MainScalarPH->phis(), P: [VectorTC](VPRecipeBase &R) {
        return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Ops: m_Specific(VPV: VectorTC),
                                                           Ops: m_ZeroInt()));
      });
  VPPhi *ResumePhi = nullptr;
  if (ResumePhiIter == MainScalarPH->phis().end()) {
    // No existing resume phi: create one at the start of the scalar preheader.
    VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
    ResumePhi = ScalarPHBuilder.createScalarPhi(
        IncomingValues: {VectorTC,
         MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue()},
        DL: {}, Name: "vec.epilog.resume.val");
  } else {
    // Reuse the existing resume phi, ensuring it is the first recipe in the
    // scalar preheader.
    ResumePhi = cast<VPPhi>(Val: &*ResumePhiIter);
    if (MainScalarPH->begin() == MainScalarPH->end())
      ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->end());
    else if (&*MainScalarPH->begin() != ResumePhi)
      ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->begin());
  }
  // Add a user to make sure the resume phi won't get removed.
  VPBuilder(MainScalarPH)
      .createNaryOp(Opcode: VPInstruction::ResumeForEpilogue, Operands: ResumePhi);
}
9107
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
/// reductions require creating new instructions to compute the resume values.
/// They are collected in a vector and returned. They must be moved to the
/// preheader of the vector epilogue loop, after created by the execution of \p
/// Plan.
static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
    VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
    EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
    ScalarEvolution &SE) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
  Header->setName("vec.epilog.vector.body");

  VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
  // When vectorizing the epilogue loop, the canonical induction needs to be
  // adjusted by the value after the main vector loop. Find the resume value
  // created during execution of the main VPlan. It must be the first phi in the
  // loop preheader. Use the value to increment the canonical IV, and update all
  // users in the loop region to use the adjusted value.
  // FIXME: Improve modeling for canonical IV start values in the epilogue
  // loop.
  using namespace llvm::PatternMatch;
  PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
  // The non-zero incoming value of the resume phi is the main loop's vector
  // trip count; record it in EPI.
  for (Value *Inc : EPResumeVal->incoming_values()) {
    if (match(V: Inc, P: m_SpecificInt(V: 0)))
      continue;
    assert(!EPI.VectorTripCount &&
           "Must only have a single non-zero incoming value");
    EPI.VectorTripCount = Inc;
  }
  // If we didn't find a non-zero vector trip count, all incoming values
  // must be zero, which also means the vector trip count is zero. Pick the
  // first zero as vector trip count.
  // TODO: We should not choose VF * UF so the main vector loop is known to
  // be dead.
  if (!EPI.VectorTripCount) {
    assert(EPResumeVal->getNumIncomingValues() > 0 &&
           all_of(EPResumeVal->incoming_values(),
                  [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
           "all incoming values must be 0");
    EPI.VectorTripCount = EPResumeVal->getOperand(i_nocapture: 0);
  }
  VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
  assert(all_of(IV->users(),
                [](const VPUser *U) {
                  return isa<VPScalarIVStepsRecipe>(U) ||
                         isa<VPDerivedIVRecipe>(U) ||
                         cast<VPRecipeBase>(U)->isScalarCast() ||
                         cast<VPInstruction>(U)->getOpcode() ==
                             Instruction::Add;
                }) &&
         "the canonical IV should only be used by its increment or "
         "ScalarIVSteps when resetting the start value");
  // Offset the canonical IV by the main loop's vector trip count: create
  // IV + VPV, redirect all IV users to the sum, then restore IV as the
  // add's first operand (replaceAllUsesWith also rewrote it).
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  VPInstruction *Add = Builder.createAdd(LHS: IV, RHS: VPV);
  IV->replaceAllUsesWith(New: Add);
  Add->setOperand(I: 0, New: IV);

  // Maps original start values to their frozen counterparts created in the
  // main loop, so the epilogue plan can re-use the same frozen value.
  DenseMap<Value *, Value *> ToFrozen;
  // Newly created IR instructions that the caller must move to the epilogue
  // vector loop preheader.
  SmallVector<Instruction *> InstsToMove;
  // Ensure that the start values for all header phi recipes are updated before
  // vectorizing the epilogue loop. Skip the canonical IV, which has been
  // handled above.
  for (VPRecipeBase &R : drop_begin(RangeOrContainer: Header->phis())) {
    Value *ResumeV = nullptr;
    // TODO: Move setting of resume values to prepareToExecute.
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
      // Find the reduction result by searching users of the phi or its backedge
      // value.
      auto IsReductionResult = [](VPRecipeBase *R) {
        auto *VPI = dyn_cast<VPInstruction>(Val: R);
        if (!VPI)
          return false;
        return VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
               VPI->getOpcode() == VPInstruction::ComputeReductionResult;
      };
      auto *RdxResult = cast<VPInstruction>(
          Val: vputils::findRecipe(Start: ReductionPhi->getBackedgeValue(), Pred: IsReductionResult));
      assert(RdxResult && "expected to find reduction result");

      // The resume value produced by the main loop, flowing into the scalar
      // preheader.
      ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
                    ->getIncomingValueForBlock(BB: L->getLoopPreheader());

      // Check for FindIV pattern by looking for icmp user of RdxResult.
      // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
      using namespace VPlanPatternMatch;
      VPValue *SentinelVPV = nullptr;
      bool IsFindIV = any_of(Range: RdxResult->users(), P: [&](VPUser *U) {
        return match(U, P: VPlanPatternMatch::m_SpecificICmp(
                            MatchPred: ICmpInst::ICMP_NE, Op0: m_Specific(VPV: RdxResult),
                            Op1: m_VPValue(V&: SentinelVPV)));
      });

      if (RdxResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
        Value *StartV = RdxResult->getOperand(N: 0)->getLiveInIRValue();
        // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
        // start value; compare the final value from the main vector loop
        // to the start value.
        BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
        ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else if (IsFindIV) {
        assert(SentinelVPV && "expected to find icmp using RdxResult");

        // Get the frozen start value from the main loop.
        Value *FrozenStartV = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
            BB: EPI.MainLoopIterationCountCheck);
        if (auto *FreezeI = dyn_cast<FreezeInst>(Val: FrozenStartV))
          ToFrozen[FreezeI->getOperand(i_nocapture: 0)] = FrozenStartV;

        // Adjust resume: select(icmp eq ResumeV, FrozenStartV), Sentinel,
        // ResumeV
        BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
        Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: FrozenStartV);
        if (auto *I = dyn_cast<Instruction>(Val: Cmp))
          InstsToMove.push_back(Elt: I);
        ResumeV =
            Builder.CreateSelect(C: Cmp, True: SentinelVPV->getLiveInIRValue(), False: ResumeV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else {
        // Plain reduction: feed the main loop's result into the phi's start
        // value (possibly through a ReductionStartVector instruction).
        VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
        auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
        if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
          assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
                 "unexpected start value");
          // Partial sub-reductions always start at 0 and account for the
          // reduction start value in a final subtraction. Update it to use the
          // resume value from the main vector loop.
          if (PhiR->getVFScaleFactor() > 1 &&
              PhiR->getRecurrenceKind() == RecurKind::Sub) {
            auto *Sub = cast<VPInstruction>(Val: RdxResult->getSingleUser());
            assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode");
            assert(isa<VPIRValue>(Sub->getOperand(0)) &&
                   "Expected operand to match the original start value of the "
                   "reduction");
            assert(VPlanPatternMatch::match(VPI->getOperand(0),
                                            VPlanPatternMatch::m_ZeroInt()) &&
                   "Expected start value for partial sub-reduction to start at "
                   "zero");
            Sub->setOperand(I: 0, New: StartVal);
          } else
            VPI->setOperand(I: 0, New: StartVal);
          continue;
        }
      }
    } else {
      // Retrieve the induction resume values for wide inductions from
      // their original phi nodes in the scalar loop.
      PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
      // Hook up to the PHINode generated by a ResumePhi recipe of main
      // loop VPlan, which feeds the scalar loop.
      ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
    }
    assert(ResumeV && "Must have a resume value");
    VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
    cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
  }

  // For some VPValues in the epilogue plan we must re-use the generated IR
  // values from the main plan. Replace them with live-in VPValues.
  // TODO: This is a workaround needed for epilogue vectorization and it
  // should be removed once induction resume value creation is done
  // directly in VPlan.
  for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
    // Re-use frozen values from the main plan for Freeze VPInstructions in the
    // epilogue plan. This ensures all users use the same frozen value.
    auto *VPI = dyn_cast<VPInstruction>(Val: &R);
    if (VPI && VPI->getOpcode() == Instruction::Freeze) {
      VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
          V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
      continue;
    }

    // Re-use the trip count and steps expanded for the main loop, as
    // skeleton creation needs it as a value that dominates both the scalar
    // and vector epilogue loops
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpandR)
      continue;
    VPValue *ExpandedVal =
        Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
    ExpandR->replaceAllUsesWith(New: ExpandedVal);
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(NewTripCount: ExpandedVal);
    ExpandR->eraseFromParent();
  }

  // Estimate how many elements are processed per iteration of the main and
  // epilogue loops to size the minimum-iteration check for the epilogue.
  auto VScale = CM.getVScaleForTuning();
  unsigned MainLoopStep =
      estimateElementCount(VF: EPI.MainLoopVF * EPI.MainLoopUF, VScale);
  unsigned EpilogueLoopStep =
      estimateElementCount(VF: EPI.EpilogueVF * EPI.EpilogueUF, VScale);
  VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
      Plan, TripCount: EPI.TripCount, VectorTripCount: EPI.VectorTripCount,
      RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()), EpilogueVF: EPI.EpilogueVF,
      EpilogueUF: EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);

  return InstsToMove;
}
9312
9313// Generate bypass values from the additional bypass block. Note that when the
9314// vectorized epilogue is skipped due to iteration count check, then the
9315// resume value for the induction variable comes from the trip count of the
9316// main vector loop, passed as the second argument.
9317static Value *createInductionAdditionalBypassValues(
9318 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9319 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9320 Instruction *OldInduction) {
9321 Value *Step = getExpandedStep(ID: II, ExpandedSCEVs);
9322 // For the primary induction the additional bypass end value is known.
9323 // Otherwise it is computed.
9324 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9325 if (OrigPhi != OldInduction) {
9326 auto *BinOp = II.getInductionBinOp();
9327 // Fast-math-flags propagate from the original induction instruction.
9328 if (isa_and_nonnull<FPMathOperator>(Val: BinOp))
9329 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9330
9331 // Compute the end value for the additional bypass.
9332 EndValueFromAdditionalBypass =
9333 emitTransformedIndex(B&: BypassBuilder, Index: MainVectorTripCount,
9334 StartValue: II.getStartValue(), Step, InductionKind: II.getKind(), InductionBinOp: BinOp);
9335 EndValueFromAdditionalBypass->setName("ind.end");
9336 }
9337 return EndValueFromAdditionalBypass;
9338}
9339
/// Fix up the scalar loop's resume phis after epilogue vectorization, using
/// values from the additional bypass block \p BypassBlock (the block that may
/// skip the epilogue vector loop). Updates reduction resume phis in the scalar
/// preheader and recomputes induction resume values from the main loop's
/// vector trip count \p MainVectorTripCount.
static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
                                            VPlan &BestEpiPlan,
                                            LoopVectorizationLegality &LVL,
                                            const SCEV2ValueTy &ExpandedSCEVs,
                                            Value *MainVectorTripCount) {
  // Fix reduction resume values from the additional bypass block.
  BasicBlock *PH = L->getLoopPreheader();
  for (auto *Pred : predecessors(BB: PH)) {
    for (PHINode &Phi : PH->phis()) {
      // Skip phis that already have an incoming value for this predecessor;
      // missing entries get the value flowing in from the bypass block.
      if (Phi.getBasicBlockIndex(BB: Pred) != -1)
        continue;
      Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
    }
  }
  auto *ScalarPH = cast<VPIRBasicBlock>(Val: BestEpiPlan.getScalarPreheader());
  if (ScalarPH->hasPredecessors()) {
    // If ScalarPH has predecessors, we may need to update its reduction
    // resume values.
    for (const auto &[R, IRPhi] :
         zip(t: ScalarPH->phis(), u: ScalarPH->getIRBasicBlock()->phis())) {
      fixReductionScalarResumeWhenVectorizingEpilog(EpiResumePhiR: cast<VPPhi>(Val: &R), EpiResumePhi&: IRPhi,
                                                    BypassBlock);
    }
  }

  // Fix induction resume values from the additional bypass block.
  IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
  for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
    auto *Inc = cast<PHINode>(Val: IVPhi->getIncomingValueForBlock(BB: PH));
    Value *V = createInductionAdditionalBypassValues(
        OrigPhi: IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
        OldInduction: LVL.getPrimaryInduction());
    // TODO: Directly add as extra operand to the VPResumePHI recipe.
    Inc->setIncomingValueForBlock(BB: BypassBlock, V);
  }
}
9376
/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
/// loop, after both plans have executed, updating branches from the iteration
/// and runtime checks of the main loop, as well as updating various phis. \p
/// InstsToMove contains instructions that need to be moved to the preheader of
/// the epilogue vector loop.
static void connectEpilogueVectorLoop(
    VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
    DominatorTree *DT, LoopVectorizationLegality &LVL,
    DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
    ArrayRef<Instruction *> InstsToMove) {
  BasicBlock *VecEpilogueIterationCountCheck =
      cast<VPIRBasicBlock>(Val: EpiPlan.getEntry())->getIRBasicBlock();

  // The epilogue preheader is the false successor of the epilogue iteration
  // count check.
  BasicBlock *VecEpiloguePreHeader =
      cast<BranchInst>(Val: VecEpilogueIterationCountCheck->getTerminator())
          ->getSuccessor(i: 1);
  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  // Retarget the main loop's iteration count check to branch directly to the
  // epilogue preheader, bypassing the epilogue iteration count check.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: VecEpiloguePreHeader);

  DTU.applyUpdates(Updates: {{DominatorTree::Delete, EPI.MainLoopIterationCountCheck,
                     VecEpilogueIterationCountCheck},
                    {DominatorTree::Insert, EPI.MainLoopIterationCountCheck,
                     VecEpiloguePreHeader}});

  // Retarget the (saved) epilogue iteration count check, plus any SCEV and
  // memory runtime check blocks, to branch straight to the scalar preheader.
  BasicBlock *ScalarPH =
      cast<VPIRBasicBlock>(Val: EpiPlan.getScalarPreheader())->getIRBasicBlock();
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: ScalarPH);
  DTU.applyUpdates(
      Updates: {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck,
        VecEpilogueIterationCountCheck},
       {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}});

  // Adjust the terminators of runtime check blocks and phis using them.
  BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
  BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
  if (SCEVCheckBlock) {
    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(Updates: {{DominatorTree::Delete, SCEVCheckBlock,
                       VecEpilogueIterationCountCheck},
                      {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
  }
  if (MemCheckBlock) {
    MemCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(
        Updates: {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
         {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
  }

  // The vec.epilog.iter.check block may contain Phi nodes from inductions
  // or reductions which merge control-flow from the latch block and the
  // middle block. Update the incoming values here and move the Phi into the
  // preheader.
  // Snapshot the phi list first: the loop below moves phis out of the block.
  SmallVector<PHINode *, 4> PhisInBlock(
      llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(InsertPos: VecEpiloguePreHeader->getFirstNonPHIIt());
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the
    // incoming value and also those from other check blocks. This is needed
    // for reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (SCEVCheckBlock)
      Phi->removeIncomingValue(BB: SCEVCheckBlock);
    if (MemCheckBlock)
      Phi->removeIncomingValue(BB: MemCheckBlock);
  }

  // Move the resume-value instructions created while preparing the epilogue
  // plan into the epilogue preheader, where they dominate their users.
  auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
  for (auto *I : InstsToMove)
    I->moveBefore(InsertPos: IP);

  // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
  // after executing the main loop. We need to update the resume values of
  // inductions and reductions during epilogue vectorization.
  fixScalarResumeValuesFromBypass(BypassBlock: VecEpilogueIterationCountCheck, L, BestEpiPlan&: EpiPlan,
                                  LVL, ExpandedSCEVs, MainVectorTripCount: EPI.VectorTripCount);
}
9471
9472bool LoopVectorizePass::processLoop(Loop *L) {
9473 assert((EnableVPlanNativePath || L->isInnermost()) &&
9474 "VPlan-native path is not enabled. Only process inner loops.");
9475
9476 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9477 << L->getHeader()->getParent()->getName() << "' from "
9478 << L->getLocStr() << "\n");
9479
9480 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9481
9482 LLVM_DEBUG(
9483 dbgs() << "LV: Loop hints:"
9484 << " force="
9485 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9486 ? "disabled"
9487 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9488 ? "enabled"
9489 : "?"))
9490 << " width=" << Hints.getWidth()
9491 << " interleave=" << Hints.getInterleave() << "\n");
9492
9493 // Function containing loop
9494 Function *F = L->getHeader()->getParent();
9495
9496 // Looking at the diagnostic output is the only way to determine if a loop
9497 // was vectorized (other than looking at the IR or machine code), so it
9498 // is important to generate an optimization remark for each loop. Most of
9499 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9500 // generated as OptimizationRemark and OptimizationRemarkMissed are
9501 // less verbose reporting vectorized loops and unvectorized loops that may
9502 // benefit from vectorization, respectively.
9503
9504 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9505 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9506 return false;
9507 }
9508
9509 PredicatedScalarEvolution PSE(*SE, *L);
9510
9511 // Query this against the original loop and save it here because the profile
9512 // of the original loop header may change as the transformation happens.
9513 bool OptForSize = llvm::shouldOptimizeForSize(
9514 BB: L->getHeader(), PSI,
9515 BFI: PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
9516 QueryType: PGSOQueryType::IRPass);
9517
9518 // Check if it is legal to vectorize the loop.
9519 LoopVectorizationRequirements Requirements;
9520 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9521 &Requirements, &Hints, DB, AC,
9522 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
9523 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9524 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9525 Hints.emitRemarkWithHints();
9526 return false;
9527 }
9528
9529 if (LVL.hasUncountableEarlyExit()) {
9530 if (!EnableEarlyExitVectorization) {
9531 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
9532 "early exit is not enabled",
9533 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
9534 return false;
9535 }
9536 }
9537
9538 if (!LVL.getPotentiallyFaultingLoads().empty()) {
9539 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with potentially "
9540 "faulting load is not supported",
9541 ORETag: "PotentiallyFaultingLoadsNotSupported", ORE, TheLoop: L);
9542 return false;
9543 }
9544
9545 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9546 // here. They may require CFG and instruction level transformations before
9547 // even evaluating whether vectorization is profitable. Since we cannot modify
9548 // the incoming IR, we need to build VPlan upfront in the vectorization
9549 // pipeline.
9550 if (!L->isInnermost())
9551 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9552 ORE, GetBFI, OptForSize, Hints,
9553 Requirements);
9554
9555 assert(L->isInnermost() && "Inner loop expected.");
9556
9557 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9558 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9559
9560 // If an override option has been passed in for interleaved accesses, use it.
9561 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9562 UseInterleaved = EnableInterleavedMemAccesses;
9563
9564 // Analyze interleaved memory accesses.
9565 if (UseInterleaved)
9566 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9567
9568 if (LVL.hasUncountableEarlyExit()) {
9569 BasicBlock *LoopLatch = L->getLoopLatch();
9570 if (IAI.requiresScalarEpilogue() ||
9571 any_of(Range: LVL.getCountableExitingBlocks(),
9572 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9573 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
9574 "requiring a scalar epilogue is unsupported",
9575 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
9576 return false;
9577 }
9578 }
9579
9580 // Check the function attributes and profiles to find out if this function
9581 // should be optimized for size.
9582 ScalarEpilogueLowering SEL =
9583 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, IAI: &IAI);
9584
9585 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9586 // count by optimizing for size, to minimize overheads.
9587 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9588 if (ExpectedTC && ExpectedTC->isFixed() &&
9589 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9590 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9591 << "This loop is worth vectorizing only if no scalar "
9592 << "iteration overheads are incurred.");
9593 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9594 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9595 else {
9596 LLVM_DEBUG(dbgs() << "\n");
9597 // Predicate tail-folded loops are efficient even when the loop
9598 // iteration count is low. However, setting the epilogue policy to
9599 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9600 // with runtime checks. It's more effective to let
9601 // `isOutsideLoopWorkProfitable` determine if vectorization is
9602 // beneficial for the loop.
9603 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9604 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9605 }
9606 }
9607
9608 // Check the function attributes to see if implicit floats or vectors are
9609 // allowed.
9610 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
9611 reportVectorizationFailure(
9612 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
9613 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
9614 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
9615 Hints.emitRemarkWithHints();
9616 return false;
9617 }
9618
9619 // Check if the target supports potentially unsafe FP vectorization.
9620 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9621 // for the target we're vectorizing for, to make sure none of the
9622 // additional fp-math flags can help.
9623 if (Hints.isPotentiallyUnsafe() &&
9624 TTI->isFPVectorizationPotentiallyUnsafe()) {
9625 reportVectorizationFailure(
9626 DebugMsg: "Potentially unsafe FP op prevents vectorization",
9627 OREMsg: "loop not vectorized due to unsafe FP support.",
9628 ORETag: "UnsafeFP", ORE, TheLoop: L);
9629 Hints.emitRemarkWithHints();
9630 return false;
9631 }
9632
9633 bool AllowOrderedReductions;
9634 // If the flag is set, use that instead and override the TTI behaviour.
9635 if (ForceOrderedReductions.getNumOccurrences() > 0)
9636 AllowOrderedReductions = ForceOrderedReductions;
9637 else
9638 AllowOrderedReductions = TTI->enableOrderedReductions();
9639 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
9640 ORE->emit(RemarkBuilder: [&]() {
9641 auto *ExactFPMathInst = Requirements.getExactFPInst();
9642 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9643 ExactFPMathInst->getDebugLoc(),
9644 ExactFPMathInst->getParent())
9645 << "loop not vectorized: cannot prove it is safe to reorder "
9646 "floating-point operations";
9647 });
9648 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9649 "reorder floating-point operations\n");
9650 Hints.emitRemarkWithHints();
9651 return false;
9652 }
9653
9654 // Use the cost model.
9655 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9656 GetBFI, F, &Hints, IAI, OptForSize);
9657 // Use the planner for vectorization.
9658 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9659 ORE);
9660
9661 // Get user vectorization factor and interleave count.
9662 ElementCount UserVF = Hints.getWidth();
9663 unsigned UserIC = Hints.getInterleave();
9664 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9665 UserIC = 1;
9666
9667 // Plan how to best vectorize.
9668 LVP.plan(UserVF, UserIC);
9669 VectorizationFactor VF = LVP.computeBestVF();
9670 unsigned IC = 1;
9671
9672 if (ORE->allowExtraAnalysis(LV_NAME))
9673 LVP.emitInvalidCostRemarks(ORE);
9674
9675 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
9676 if (LVP.hasPlanWithVF(VF: VF.Width)) {
9677 // Select the interleave count.
9678 IC = LVP.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
9679
9680 unsigned SelectedIC = std::max(a: IC, b: UserIC);
9681 // Optimistically generate runtime checks if they are needed. Drop them if
9682 // they turn out to not be profitable.
9683 if (VF.Width.isVector() || SelectedIC > 1) {
9684 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC,
9685 ORE&: *ORE);
9686
9687 // Bail out early if either the SCEV or memory runtime checks are known to
9688 // fail. In that case, the vector loop would never execute.
9689 using namespace llvm::PatternMatch;
9690 if (Checks.getSCEVChecks().first &&
9691 match(V: Checks.getSCEVChecks().first, P: m_One()))
9692 return false;
9693 if (Checks.getMemRuntimeChecks().first &&
9694 match(V: Checks.getMemRuntimeChecks().first, P: m_One()))
9695 return false;
9696 }
9697
9698 // Check if it is profitable to vectorize with runtime checks.
9699 bool ForceVectorization =
9700 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9701 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF: VF.Width), CM,
9702 CM.CostKind, CM.PSE, L);
9703 if (!ForceVectorization &&
9704 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9705 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
9706 VScale: CM.getVScaleForTuning())) {
9707 ORE->emit(RemarkBuilder: [&]() {
9708 return OptimizationRemarkAnalysisAliasing(
9709 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9710 L->getHeader())
9711 << "loop not vectorized: cannot prove it is safe to reorder "
9712 "memory operations";
9713 });
9714 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9715 Hints.emitRemarkWithHints();
9716 return false;
9717 }
9718 }
9719
9720 // Identify the diagnostic messages that should be produced.
9721 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9722 bool VectorizeLoop = true, InterleaveLoop = true;
9723 if (VF.Width.isScalar()) {
9724 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9725 VecDiagMsg = {
9726 "VectorizationNotBeneficial",
9727 "the cost-model indicates that vectorization is not beneficial"};
9728 VectorizeLoop = false;
9729 }
9730
9731 if (UserIC == 1 && Hints.getInterleave() > 1) {
9732 assert(!LVL.isSafeForAnyVectorWidth() &&
9733 "UserIC should only be ignored due to unsafe dependencies");
9734 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9735 IntDiagMsg = {"InterleavingUnsafe",
9736 "Ignoring user-specified interleave count due to possibly "
9737 "unsafe dependencies in the loop."};
9738 InterleaveLoop = false;
9739 } else if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
9740 // Tell the user interleaving was avoided up-front, despite being explicitly
9741 // requested.
9742 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9743 "interleaving should be avoided up front\n");
9744 IntDiagMsg = {"InterleavingAvoided",
9745 "Ignoring UserIC, because interleaving was avoided up front"};
9746 InterleaveLoop = false;
9747 } else if (IC == 1 && UserIC <= 1) {
9748 // Tell the user interleaving is not beneficial.
9749 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9750 IntDiagMsg = {
9751 "InterleavingNotBeneficial",
9752 "the cost-model indicates that interleaving is not beneficial"};
9753 InterleaveLoop = false;
9754 if (UserIC == 1) {
9755 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9756 IntDiagMsg.second +=
9757 " and is explicitly disabled or interleave count is set to 1";
9758 }
9759 } else if (IC > 1 && UserIC == 1) {
9760 // Tell the user interleaving is beneficial, but it explicitly disabled.
9761 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
9762 "disabled.\n");
9763 IntDiagMsg = {"InterleavingBeneficialButDisabled",
9764 "the cost-model indicates that interleaving is beneficial "
9765 "but is explicitly disabled or interleave count is set to 1"};
9766 InterleaveLoop = false;
9767 }
9768
9769 // If there is a histogram in the loop, do not just interleave without
9770 // vectorizing. The order of operations will be incorrect without the
9771 // histogram intrinsics, which are only used for recipes with VF > 1.
9772 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
9773 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9774 << "to histogram operations.\n");
9775 IntDiagMsg = {
9776 "HistogramPreventsScalarInterleaving",
9777 "Unable to interleave without vectorization due to constraints on "
9778 "the order of histogram operations"};
9779 InterleaveLoop = false;
9780 }
9781
9782 // Override IC if user provided an interleave count.
9783 IC = UserIC > 0 ? UserIC : IC;
9784
9785 // FIXME: Enable interleaving for FindLast reductions.
9786 if (InterleaveLoop && hasFindLastReductionPhi(Plan&: LVP.getPlanFor(VF: VF.Width))) {
9787 LLVM_DEBUG(dbgs() << "LV: Not interleaving due to FindLast reduction.\n");
9788 IntDiagMsg = {"FindLastPreventsScalarInterleaving",
9789 "Unable to interleave due to FindLast reduction."};
9790 InterleaveLoop = false;
9791 IC = 1;
9792 }
9793
9794 // Emit diagnostic messages, if any.
9795 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9796 if (!VectorizeLoop && !InterleaveLoop) {
9797 // Do not vectorize or interleaving the loop.
9798 ORE->emit(RemarkBuilder: [&]() {
9799 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9800 L->getStartLoc(), L->getHeader())
9801 << VecDiagMsg.second;
9802 });
9803 ORE->emit(RemarkBuilder: [&]() {
9804 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9805 L->getStartLoc(), L->getHeader())
9806 << IntDiagMsg.second;
9807 });
9808 return false;
9809 }
9810
9811 if (!VectorizeLoop && InterleaveLoop) {
9812 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9813 ORE->emit(RemarkBuilder: [&]() {
9814 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9815 L->getStartLoc(), L->getHeader())
9816 << VecDiagMsg.second;
9817 });
9818 } else if (VectorizeLoop && !InterleaveLoop) {
9819 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9820 << ") in " << L->getLocStr() << '\n');
9821 ORE->emit(RemarkBuilder: [&]() {
9822 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9823 L->getStartLoc(), L->getHeader())
9824 << IntDiagMsg.second;
9825 });
9826 } else if (VectorizeLoop && InterleaveLoop) {
9827 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9828 << ") in " << L->getLocStr() << '\n');
9829 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9830 }
9831
9832 // Report the vectorization decision.
9833 if (VF.Width.isScalar()) {
9834 using namespace ore;
9835 assert(IC > 1);
9836 ORE->emit(RemarkBuilder: [&]() {
9837 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9838 L->getHeader())
9839 << "interleaved loop (interleaved count: "
9840 << NV("InterleaveCount", IC) << ")";
9841 });
9842 } else {
9843 // Report the vectorization decision.
9844 reportVectorization(ORE, TheLoop: L, VF, IC);
9845 }
9846 if (ORE->allowExtraAnalysis(LV_NAME))
9847 checkMixedPrecision(L, ORE);
9848
9849 // If we decided that it is *legal* to interleave or vectorize the loop, then
9850 // do it.
9851
9852 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9853 // Consider vectorizing the epilogue too if it's profitable.
9854 VectorizationFactor EpilogueVF =
9855 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
9856 if (EpilogueVF.Width.isVector()) {
9857 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
9858
9859 // The first pass vectorizes the main loop and creates a scalar epilogue
9860 // to be vectorized by executing the plan (potentially with a different
9861 // factor) again shortly afterwards.
9862 VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width);
9863 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
9864 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
9865 preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan);
9866 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
9867 BestEpiPlan);
9868 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9869 Checks, *BestMainPlan);
9870 auto ExpandedSCEVs = LVP.executePlan(BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF,
9871 BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, VectorizingEpilogue: false);
9872 ++LoopsVectorized;
9873
9874 // Second pass vectorizes the epilogue and adjusts the control flow
9875 // edges from the first pass.
9876 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9877 Checks, BestEpiPlan);
9878 SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
9879 Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI, CM, SE&: *PSE.getSE());
9880 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, DT,
9881 VectorizingEpilogue: true);
9882 connectEpilogueVectorLoop(EpiPlan&: BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
9883 Checks, InstsToMove);
9884 ++LoopsEpilogueVectorized;
9885 } else {
9886 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
9887 BestPlan);
9888 // TODO: Move to general VPlan pipeline once epilogue loops are also
9889 // supported.
9890 RUN_VPLAN_PASS(VPlanTransforms::materializeConstantVectorTripCount,
9891 BestPlan, VF.Width, IC, PSE);
9892 LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, UF: IC,
9893 MinProfitableTripCount: VF.MinProfitableTripCount);
9894
9895 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
9896 ++LoopsVectorized;
9897 }
9898
9899 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
9900 "DT not preserved correctly");
9901 assert(!verifyFunction(*F, &dbgs()));
9902
9903 return true;
9904}
9905
9906LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
9907
9908 // Don't attempt if
9909 // 1. the target claims to have no vector registers, and
9910 // 2. interleaving won't help ILP.
9911 //
9912 // The second condition is necessary because, even if the target has no
9913 // vector registers, loop vectorization may still enable scalar
9914 // interleaving.
9915 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
9916 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
9917 return LoopVectorizeResult(false, false);
9918
9919 bool Changed = false, CFGChanged = false;
9920
9921 // The vectorizer requires loops to be in simplified form.
9922 // Since simplification may add new inner loops, it has to run before the
9923 // legality and profitability checks. This means running the loop vectorizer
9924 // will simplify all loops, regardless of whether anything end up being
9925 // vectorized.
9926 for (const auto &L : *LI)
9927 Changed |= CFGChanged |=
9928 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
9929
9930 // Build up a worklist of inner-loops to vectorize. This is necessary as
9931 // the act of vectorizing or partially unrolling a loop creates new loops
9932 // and can invalidate iterators across the loops.
9933 SmallVector<Loop *, 8> Worklist;
9934
9935 for (Loop *L : *LI)
9936 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
9937
9938 LoopsAnalyzed += Worklist.size();
9939
9940 // Now walk the identified inner loops.
9941 while (!Worklist.empty()) {
9942 Loop *L = Worklist.pop_back_val();
9943
9944 // For the inner loops we actually process, form LCSSA to simplify the
9945 // transform.
9946 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
9947
9948 Changed |= CFGChanged |= processLoop(L);
9949
9950 if (Changed) {
9951 LAIs->clear();
9952
9953#ifndef NDEBUG
9954 if (VerifySCEV)
9955 SE->verify();
9956#endif
9957 }
9958 }
9959
9960 // Process each loop nest in the function.
9961 return LoopVectorizeResult(Changed, CFGChanged);
9962}
9963
9964PreservedAnalyses LoopVectorizePass::run(Function &F,
9965 FunctionAnalysisManager &AM) {
9966 LI = &AM.getResult<LoopAnalysis>(IR&: F);
9967 // There are no loops in the function. Return before computing other
9968 // expensive analyses.
9969 if (LI->empty())
9970 return PreservedAnalyses::all();
9971 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
9972 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
9973 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
9974 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
9975 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
9976 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
9977 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
9978 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
9979 AA = &AM.getResult<AAManager>(IR&: F);
9980
9981 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
9982 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
9983 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
9984 return AM.getResult<BlockFrequencyAnalysis>(IR&: F);
9985 };
9986 LoopVectorizeResult Result = runImpl(F);
9987 if (!Result.MadeAnyChange)
9988 return PreservedAnalyses::all();
9989 PreservedAnalyses PA;
9990
9991 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
9992 for (auto &BB : F)
9993 RemoveRedundantDbgInstrs(BB: &BB);
9994 }
9995
9996 PA.preserve<LoopAnalysis>();
9997 PA.preserve<DominatorTreeAnalysis>();
9998 PA.preserve<ScalarEvolutionAnalysis>();
9999 PA.preserve<LoopAccessAnalysis>();
10000
10001 if (Result.MadeCFGChange) {
10002 // Making CFG changes likely means a loop got vectorized. Indicate that
10003 // extra simplification passes should be run.
10004 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10005 // be run if runtime checks have been added.
10006 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
10007 PA.preserve<ShouldRunExtraVectorPasses>();
10008 } else {
10009 PA.preserveSet<CFGAnalyses>();
10010 }
10011 return PA;
10012}
10013
10014void LoopVectorizePass::printPipeline(
10015 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10016 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10017 OS, MapClassName2PassName);
10018
10019 OS << '<';
10020 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10021 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10022 OS << '>';
10023}
10024