1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
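// For example (an illustrative C-like sketch, not the actual transform), a
// scalar loop such as
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
// is conceptually turned into a wide loop that processes VF elements per
// iteration, followed by a scalar remainder (epilogue) loop:
//   for (i = 0; i + VF <= n; i += VF)
//     a[i..i+VF-1] = b[i..i+VF-1] + c[i..i+VF-1];  // wide loads, add, store
//   for (; i < n; i++)                             // scalar epilogue
//     a[i] = b[i] + c[i];
//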
// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
97#include "llvm/Analysis/TargetLibraryInfo.h"
98#include "llvm/Analysis/TargetTransformInfo.h"
99#include "llvm/Analysis/ValueTracking.h"
100#include "llvm/Analysis/VectorUtils.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/DiagnosticInfo.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
117#include "llvm/IR/IntrinsicInst.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/ProfDataUtils.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
131#include "llvm/Support/CommandLine.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/NativeFormatting.h"
137#include "llvm/Support/raw_ostream.h"
138#include "llvm/Transforms/Utils/BasicBlockUtils.h"
139#include "llvm/Transforms/Utils/InjectTLIMappings.h"
140#include "llvm/Transforms/Utils/Local.h"
141#include "llvm/Transforms/Utils/LoopSimplify.h"
142#include "llvm/Transforms/Utils/LoopUtils.h"
143#include "llvm/Transforms/Utils/LoopVersioning.h"
144#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
145#include "llvm/Transforms/Utils/SizeOpts.h"
146#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
172STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
173
174static cl::opt<bool> EnableEpilogueVectorization(
175 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
176 cl::desc("Enable vectorization of epilogue loops."));
177
178static cl::opt<unsigned> EpilogueVectorizationForceVF(
179 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
180 cl::desc("When epilogue vectorization is enabled, and a value greater than "
181 "1 is specified, forces the given VF for all applicable epilogue "
182 "loops."));
183
184static cl::opt<unsigned> EpilogueVectorizationMinVF(
185 "epilogue-vectorization-minimum-VF", cl::Hidden,
186 cl::desc("Only loops with vectorization factor equal to or larger than "
187 "the specified value are considered for epilogue vectorization."));
188
189/// Loops with a known constant trip count below this number are vectorized only
190/// if no scalar iteration overheads are incurred.
191static cl::opt<unsigned> TinyTripCountVectorThreshold(
192 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
193 cl::desc("Loops with a constant trip count that is smaller than this "
194 "value are vectorized only if no scalar iteration overheads "
195 "are incurred."));
196
197static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
198 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
199 cl::desc("The maximum allowed number of runtime memory checks"));
200
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and lists the available options.
// I.e., the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
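// For example (illustration only), a loop with trip count 10 vectorized at
// VF 4 would normally run 2 vector iterations plus a 2-iteration scalar
// epilogue; with tail folding it instead runs 3 predicated vector iterations,
// with the lanes past iteration 10 masked off in the last one.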
206namespace PreferPredicateTy {
207 enum Option {
208 ScalarEpilogue = 0,
209 PredicateElseScalarEpilogue,
210 PredicateOrDontVectorize
211 };
212} // namespace PreferPredicateTy
213
214static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
215 "prefer-predicate-over-epilogue",
216 cl::init(Val: PreferPredicateTy::ScalarEpilogue),
217 cl::Hidden,
218 cl::desc("Tail-folding and predication preferences over creating a scalar "
219 "epilogue loop."),
220 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
221 "scalar-epilogue",
222 "Don't tail-predicate loops, create scalar epilogue"),
223 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
224 "predicate-else-scalar-epilogue",
225 "prefer tail-folding, create scalar epilogue if tail "
226 "folding fails."),
227 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
228 "predicate-dont-vectorize",
229 "prefers tail-folding, don't attempt vectorization if "
230 "tail-folding fails.")));
231
232static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
233 "force-tail-folding-style", cl::desc("Force the tail folding style"),
234 cl::init(Val: TailFoldingStyle::None),
235 cl::values(
236 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
237 clEnumValN(
238 TailFoldingStyle::Data, "data",
239 "Create lane mask for data only, using active.lane.mask intrinsic"),
240 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
241 "data-without-lane-mask",
242 "Create lane mask with compare/stepvector"),
243 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
244 "Create lane mask using active.lane.mask intrinsic, and use "
245 "it for both data and control flow"),
246 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
247 "data-and-control-without-rt-check",
248 "Similar to data-and-control, but remove the runtime check"),
249 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
250 "Use predicated EVL instructions for tail folding. If EVL "
251 "is unsupported, fallback to data-without-lane-mask.")));
252
253cl::opt<bool> llvm::EnableWideActiveLaneMask(
254 "enable-wide-lane-mask", cl::init(Val: false), cl::Hidden,
255 cl::desc("Enable use of wide lane masks when used for control flow in "
256 "tail-folded loops"));
257
258static cl::opt<bool> MaximizeBandwidth(
259 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
260 cl::desc("Maximize bandwidth when selecting vectorization factor which "
261 "will be determined by the smallest type in loop."));
262
263static cl::opt<bool> EnableInterleavedMemAccesses(
264 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
265 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
266
267/// An interleave-group may need masking if it resides in a block that needs
268/// predication, or in order to mask away gaps.
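/// For example (illustration only), a group accessing A[3*i] and A[3*i+2]
/// leaves a gap at A[3*i+1]; a wide load covering all three elements must
/// mask away the unused middle element.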
269static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
270 "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
271 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
272
273static cl::opt<unsigned> ForceTargetNumScalarRegs(
274 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
275 cl::desc("A flag that overrides the target's number of scalar registers."));
276
277static cl::opt<unsigned> ForceTargetNumVectorRegs(
278 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of vector registers."));
280
281static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
282 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
283 cl::desc("A flag that overrides the target's max interleave factor for "
284 "scalar loops."));
285
286static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
287 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
288 cl::desc("A flag that overrides the target's max interleave factor for "
289 "vectorized loops."));
290
291cl::opt<unsigned> llvm::ForceTargetInstructionCost(
292 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
293 cl::desc("A flag that overrides the target's expected cost for "
294 "an instruction to a single constant value. Mostly "
295 "useful for getting consistent testing."));
296
297static cl::opt<bool> ForceTargetSupportsScalableVectors(
298 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
299 cl::desc(
300 "Pretend that scalable vectors are supported, even if the target does "
301 "not support them. This flag should only be used for testing."));
302
303static cl::opt<unsigned> SmallLoopCost(
304 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
305 cl::desc(
306 "The cost of a loop that is considered 'small' by the interleaver."));
307
308static cl::opt<bool> LoopVectorizeWithBlockFrequency(
309 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
310 cl::desc("Enable the use of the block frequency analysis to access PGO "
311 "heuristics minimizing code growth in cold regions and being more "
312 "aggressive in hot regions."));
313
314// Runtime interleave loops for load/store throughput.
315static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
316 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
317 cl::desc(
318 "Enable runtime interleaving until load/store ports are saturated"));
319
320/// The number of stores in a loop that are allowed to need predication.
321static cl::opt<unsigned> NumberOfStoresToPredicate(
322 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
323 cl::desc("Max number of stores to be predicated behind an if."));
324
325static cl::opt<bool> EnableIndVarRegisterHeur(
326 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
327 cl::desc("Count the induction variable only once when interleaving"));
328
329static cl::opt<bool> EnableCondStoresVectorization(
330 "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
331 cl::desc("Enable if predication of stores during vectorization."));
332
333static cl::opt<unsigned> MaxNestedScalarReductionIC(
334 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
335 cl::desc("The maximum interleave count to use when interleaving a scalar "
336 "reduction in a nested loop."));
337
338static cl::opt<bool>
339 PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
340 cl::Hidden,
341 cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
343
344static cl::opt<bool> ForceOrderedReductions(
345 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
347 "FP reductions"));
348
349static cl::opt<bool> PreferPredicatedReductionSelect(
350 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
351 cl::desc(
352 "Prefer predicating a reduction operation over an after loop select."));
353
354cl::opt<bool> llvm::EnableVPlanNativePath(
355 "enable-vplan-native-path", cl::Hidden,
356 cl::desc("Enable VPlan-native vectorization path with "
357 "support for outer loop vectorization."));
358
359cl::opt<bool>
360 llvm::VerifyEachVPlan("vplan-verify-each",
361#ifdef EXPENSIVE_CHECKS
362 cl::init(true),
363#else
364 cl::init(Val: false),
365#endif
366 cl::Hidden,
                          cl::desc("Verify VPlans after VPlan transforms."));
368
369#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
370cl::opt<bool> llvm::PrintAfterEachVPlanPass(
371 "vplan-print-after-all", cl::init(false), cl::Hidden,
372 cl::desc("Print after each VPlanTransforms::runPass."));
373#endif
374
375// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
377// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
378// verification of the H-CFGs built.
379static cl::opt<bool> VPlanBuildStressTest(
380 "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
381 cl::desc(
382 "Build VPlan for every supported loop nest in the function and bail "
383 "out right after the build (stress test the VPlan H-CFG construction "
384 "in the VPlan-native vectorization path)."));
385
386cl::opt<bool> llvm::EnableLoopInterleaving(
387 "interleave-loops", cl::init(Val: true), cl::Hidden,
388 cl::desc("Enable loop interleaving in Loop vectorization passes"));
389cl::opt<bool> llvm::EnableLoopVectorization(
390 "vectorize-loops", cl::init(Val: true), cl::Hidden,
391 cl::desc("Run the Loop vectorization passes"));
392
393static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
394 "force-widen-divrem-via-safe-divisor", cl::Hidden,
395 cl::desc(
396 "Override cost based safe divisor widening for div/rem instructions"));
397
398static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
399 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
400 cl::Hidden,
401 cl::desc("Try wider VFs if they enable the use of vector variants"));
402
403static cl::opt<bool> EnableEarlyExitVectorization(
404 "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
405 cl::desc(
406 "Enable vectorization of early exit loops with uncountable exits."));
407
408static cl::opt<bool> ConsiderRegPressure(
409 "vectorizer-consider-reg-pressure", cl::init(Val: false), cl::Hidden,
410 cl::desc("Discard VFs if their register pressure is too high."));
411
// Likelihood of bypassing the vectorized loop because there are zero trips
// left after the prologue. See `emitIterationCountCheck`.
414static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
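// With these weights the branch that skips the vector loop is assumed to be
// taken roughly 1 time in 128.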
415
416/// A helper function that returns true if the given type is irregular. The
417/// type is irregular if its allocated size doesn't equal the store size of an
418/// element of the corresponding vector type.
419static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
420 // Determine if an array of N elements of type Ty is "bitcast compatible"
421 // with a <N x Ty> vector.
422 // This is only true if there is no padding between the array elements.
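  // For example, x86_fp80 is stored in 80 bits but allocated 96 or 128 bits
  // (depending on the ABI), so it is an irregular type by this definition.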
423 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
424}
425
426/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
427/// ElementCount to include loops whose trip count is a function of vscale.
428static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
429 const Loop *L) {
430 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
431 return ElementCount::getFixed(MinVal: ExpectedTC);
432
433 const SCEV *BTC = SE->getBackedgeTakenCount(L);
434 if (isa<SCEVCouldNotCompute>(Val: BTC))
435 return ElementCount::getFixed(MinVal: 0);
436
437 const SCEV *ExitCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
438 if (isa<SCEVVScale>(Val: ExitCount))
439 return ElementCount::getScalable(MinVal: 1);
440
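  // Match trip counts of the form (C * vscale) with no unsigned wrap; e.g. an
  // exit count of (4 * vscale) yields a scalable element count with a minimum
  // value of 4.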
441 const APInt *Scale;
442 if (match(S: ExitCount, P: m_scev_Mul(Op0: m_scev_APInt(C&: Scale), Op1: m_SCEVVScale())))
443 if (cast<SCEVMulExpr>(Val: ExitCount)->hasNoUnsignedWrap())
444 if (Scale->getActiveBits() <= 32)
445 return ElementCount::getScalable(MinVal: Scale->getZExtValue());
446
447 return ElementCount::getFixed(MinVal: 0);
448}
449
450/// Returns "best known" trip count, which is either a valid positive trip count
451/// or std::nullopt when an estimate cannot be made (including when the trip
452/// count would overflow), for the specified loop \p L as defined by the
453/// following procedure:
454/// 1) Returns exact trip count if it is known.
455/// 2) Returns expected trip count according to profile data if any.
456/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
457/// 4) Returns std::nullopt if all of the above failed.
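/// For example, a loop with a known constant trip count of 8 returns 8 via
/// step 1, while a loop whose profile data estimates 100 iterations returns
/// 100 via step 2.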
458static std::optional<ElementCount>
459getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
460 bool CanUseConstantMax = true) {
461 // Check if exact trip count is known.
462 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
463 return ExpectedTC;
464
465 // Check if there is an expected trip count available from profile data.
466 if (LoopVectorizeWithBlockFrequency)
467 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
468 return ElementCount::getFixed(MinVal: *EstimatedTC);
469
470 if (!CanUseConstantMax)
471 return std::nullopt;
472
473 // Check if upper bound estimate is known.
474 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
475 return ElementCount::getFixed(MinVal: ExpectedTC);
476
477 return std::nullopt;
478}
479
480namespace {
481// Forward declare GeneratedRTChecks.
482class GeneratedRTChecks;
483
484using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
485} // namespace
486
487namespace llvm {
488
489AnalysisKey ShouldRunExtraVectorPasses::Key;
490
491/// InnerLoopVectorizer vectorizes loops which contain only one basic
492/// block to a specified vectorization factor (VF).
493/// This class performs the widening of scalars into vectors, or multiple
494/// scalars. This class also implements the following features:
495/// * It inserts an epilogue loop for handling loops that don't have iteration
496/// counts that are known to be a multiple of the vectorization factor.
497/// * It handles the code generation for reduction variables.
498/// * Scalarization (implementation using scalars) of un-vectorizable
499/// instructions.
500/// InnerLoopVectorizer does not perform any vectorization-legality
501/// checks, and relies on the caller to check for the different legality
502/// aspects. The InnerLoopVectorizer relies on the
503/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
505class InnerLoopVectorizer {
506public:
507 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
508 LoopInfo *LI, DominatorTree *DT,
509 const TargetTransformInfo *TTI, AssumptionCache *AC,
510 ElementCount VecWidth, unsigned UnrollFactor,
511 LoopVectorizationCostModel *CM,
512 GeneratedRTChecks &RTChecks, VPlan &Plan)
513 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
514 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
515 Cost(CM), RTChecks(RTChecks), Plan(Plan),
516 VectorPHVPBB(cast<VPBasicBlock>(
517 Val: Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
518
519 virtual ~InnerLoopVectorizer() = default;
520
521 /// Creates a basic block for the scalar preheader. Both
  /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop override
523 /// the method to create additional blocks and checks needed for epilogue
524 /// vectorization.
525 virtual BasicBlock *createVectorizedLoopSkeleton();
526
527 /// Fix the vectorized code, taking care of header phi's, and more.
528 void fixVectorizedLoop(VPTransformState &State);
529
530 /// Fix the non-induction PHIs in \p Plan.
531 void fixNonInductionPHIs(VPTransformState &State);
532
533 /// Returns the original loop trip count.
534 Value *getTripCount() const { return TripCount; }
535
536 /// Used to set the trip count after ILV's construction and after the
537 /// preheader block has been executed. Note that this always holds the trip
538 /// count of the original loop for both main loop and epilogue vectorization.
539 void setTripCount(Value *TC) { TripCount = TC; }
540
541protected:
542 friend class LoopVectorizationPlanner;
543
544 /// Create and return a new IR basic block for the scalar preheader whose name
545 /// is prefixed with \p Prefix.
546 BasicBlock *createScalarPreheader(StringRef Prefix);
547
548 /// Allow subclasses to override and print debug traces before/after vplan
549 /// execution, when trace information is requested.
550 virtual void printDebugTracesAtStart() {}
551 virtual void printDebugTracesAtEnd() {}
552
553 /// The original loop.
554 Loop *OrigLoop;
555
556 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
557 /// dynamic knowledge to simplify SCEV expressions and converts them to a
558 /// more usable form.
559 PredicatedScalarEvolution &PSE;
560
561 /// Loop Info.
562 LoopInfo *LI;
563
564 /// Dominator Tree.
565 DominatorTree *DT;
566
567 /// Target Transform Info.
568 const TargetTransformInfo *TTI;
569
570 /// Assumption Cache.
571 AssumptionCache *AC;
572
573 /// The vectorization SIMD factor to use. Each vector will have this many
574 /// vector elements.
575 ElementCount VF;
576
577 /// The vectorization unroll factor to use. Each scalar is vectorized to this
578 /// many different vector instructions.
579 unsigned UF;
580
  /// The IR builder used during vector code generation.
582 IRBuilder<> Builder;
583
584 // --- Vectorization state ---
585
586 /// Trip count of the original loop.
587 Value *TripCount = nullptr;
588
  /// The profitability analysis.
590 LoopVectorizationCostModel *Cost;
591
  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
594 GeneratedRTChecks &RTChecks;
595
596 VPlan &Plan;
597
598 /// The vector preheader block of \p Plan, used as target for check blocks
599 /// introduced during skeleton creation.
600 VPBasicBlock *VectorPHVPBB;
601};
602
603/// Encapsulate information regarding vectorization of a loop and its epilogue.
604/// This information is meant to be updated and used across two stages of
605/// epilogue vectorization.
606struct EpilogueLoopVectorizationInfo {
607 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
608 unsigned MainLoopUF = 0;
609 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
610 unsigned EpilogueUF = 0;
611 BasicBlock *MainLoopIterationCountCheck = nullptr;
612 BasicBlock *EpilogueIterationCountCheck = nullptr;
613 Value *TripCount = nullptr;
614 Value *VectorTripCount = nullptr;
615 VPlan &EpiloguePlan;
616
617 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
618 ElementCount EVF, unsigned EUF,
619 VPlan &EpiloguePlan)
620 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
621 EpiloguePlan(EpiloguePlan) {
622 assert(EUF == 1 &&
623 "A high UF for the epilogue loop is likely not beneficial.");
624 }
625};
626
627/// An extension of the inner loop vectorizer that creates a skeleton for a
628/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the VPlan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
632/// deriving two concrete strategy classes from this base class and invoking
633/// them in succession from the loop vectorizer planner.
634class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
635public:
636 InnerLoopAndEpilogueVectorizer(
637 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
638 DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
639 EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
640 GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
641 ElementCount MinProfitableTripCount, unsigned UnrollFactor)
642 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
643 UnrollFactor, CM, Checks, Plan),
644 EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
645
646 /// Holds and updates state information required to vectorize the main loop
647 /// and its epilogue in two separate passes. This setup helps us avoid
648 /// regenerating and recomputing runtime safety checks. It also helps us to
649 /// shorten the iteration-count-check path length for the cases where the
650 /// iteration count of the loop is so small that the main vector loop is
651 /// completely skipped.
652 EpilogueLoopVectorizationInfo &EPI;
653
654protected:
655 ElementCount MinProfitableTripCount;
656};
657
658/// A specialized derived class of inner loop vectorizer that performs
659/// vectorization of *main* loops in the process of vectorizing loops and their
660/// epilogues.
661class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
662public:
663 EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
664 LoopInfo *LI, DominatorTree *DT,
665 const TargetTransformInfo *TTI,
666 AssumptionCache *AC,
667 EpilogueLoopVectorizationInfo &EPI,
668 LoopVectorizationCostModel *CM,
669 GeneratedRTChecks &Check, VPlan &Plan)
670 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
671 Check, Plan, EPI.MainLoopVF,
672 EPI.MainLoopVF, EPI.MainLoopUF) {}
673 /// Implements the interface for creating a vectorized skeleton using the
674 /// *main loop* strategy (i.e., the first pass of VPlan execution).
675 BasicBlock *createVectorizedLoopSkeleton() final;
676
677protected:
678 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
679 /// vector preheader and its predecessor, also connecting the new block to the
680 /// scalar preheader.
681 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
682
  /// Create a check to see if the main vector loop should be executed.
684 Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF,
685 unsigned UF) const;
686
687 /// Emits an iteration count bypass check once for the main loop (when \p
688 /// ForEpilogue is false) and once for the epilogue loop (when \p
689 /// ForEpilogue is true).
690 BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass,
691 bool ForEpilogue);
692 void printDebugTracesAtStart() override;
693 void printDebugTracesAtEnd() override;
694};
695
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
699class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
700public:
701 EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
702 LoopInfo *LI, DominatorTree *DT,
703 const TargetTransformInfo *TTI,
704 AssumptionCache *AC,
705 EpilogueLoopVectorizationInfo &EPI,
706 LoopVectorizationCostModel *CM,
707 GeneratedRTChecks &Checks, VPlan &Plan)
708 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
709 Checks, Plan, EPI.EpilogueVF,
710 EPI.EpilogueVF, EPI.EpilogueUF) {}
711 /// Implements the interface for creating a vectorized skeleton using the
712 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
713 BasicBlock *createVectorizedLoopSkeleton() final;
714
715protected:
716 void printDebugTracesAtStart() override;
717 void printDebugTracesAtEnd() override;
718};
719} // end namespace llvm
720
721/// Look for a meaningful debug location on the instruction or its operands.
722static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
723 if (!I)
724 return DebugLoc::getUnknown();
725
726 DebugLoc Empty;
727 if (I->getDebugLoc() != Empty)
728 return I->getDebugLoc();
729
730 for (Use &Op : I->operands()) {
731 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
732 if (OpInst->getDebugLoc() != Empty)
733 return OpInst->getDebugLoc();
734 }
735
736 return I->getDebugLoc();
737}
738
739/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
740/// is passed, the message relates to that particular instruction.
741#ifndef NDEBUG
742static void debugVectorizationMessage(const StringRef Prefix,
743 const StringRef DebugMsg,
744 Instruction *I) {
745 dbgs() << "LV: " << Prefix << DebugMsg;
746 if (I != nullptr)
747 dbgs() << " " << *I;
748 else
749 dbgs() << '.';
750 dbgs() << '\n';
751}
752#endif
753
754/// Create an analysis remark that explains why vectorization failed
755///
756/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
757/// RemarkName is the identifier for the remark. If \p I is passed it is an
758/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
759/// the location of the remark. If \p DL is passed, use it as debug location for
760/// the remark. \return the remark object that can be streamed to.
761static OptimizationRemarkAnalysis
762createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
763 Instruction *I, DebugLoc DL = {}) {
764 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
765 // If debug location is attached to the instruction, use it. Otherwise if DL
766 // was not provided, use the loop's.
767 if (I && I->getDebugLoc())
768 DL = I->getDebugLoc();
769 else if (!DL)
770 DL = TheLoop->getStartLoc();
771
772 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
773}
774
775namespace llvm {
776
777/// Return a value for Step multiplied by VF.
778Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
779 int64_t Step) {
780 assert(Ty->isIntegerTy() && "Expected an integer step");
781 ElementCount VFxStep = VF.multiplyCoefficientBy(RHS: Step);
782 assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
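  // For scalable VFs with a power-of-2 step, emit a shift of vscale instead
  // of a multiply; e.g. VF = vscale x 4 with Step = 2 becomes shl(vscale, 3).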
783 if (VF.isScalable() && isPowerOf2_64(Value: Step)) {
784 return B.CreateShl(
785 LHS: B.CreateVScale(Ty),
786 RHS: ConstantInt::get(Ty, V: Log2_64(Value: VFxStep.getKnownMinValue())), Name: "", HasNUW: true);
787 }
788 return B.CreateElementCount(Ty, EC: VFxStep);
789}
790
791/// Return the runtime value for VF.
792Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
793 return B.CreateElementCount(Ty, EC: VF);
794}
795
796void reportVectorizationFailure(const StringRef DebugMsg,
797 const StringRef OREMsg, const StringRef ORETag,
798 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
799 Instruction *I) {
800 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
801 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
802 ORE->emit(
803 OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
804 << "loop not vectorized: " << OREMsg);
805}
806
/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
811static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
812 OptimizationRemarkEmitter *ORE,
813 Loop *TheLoop, Instruction *I = nullptr,
814 DebugLoc DL = {}) {
815 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
816 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
817 ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
818 I, DL)
819 << Msg);
820}
821
822/// Report successful vectorization of the loop. In case an outer loop is
823/// vectorized, prepend "outer" to the vectorization remark.
824static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
825 VectorizationFactor VF, unsigned IC) {
826 LLVM_DEBUG(debugVectorizationMessage(
827 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
828 nullptr));
829 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
830 ORE->emit(RemarkBuilder: [&]() {
831 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
832 TheLoop->getHeader())
833 << "vectorized " << LoopType << "loop (vectorization width: "
834 << ore::NV("VectorizationFactor", VF.Width)
835 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
836 });
837}
838
839} // end namespace llvm
840
841namespace llvm {
842
843// Loop vectorization cost-model hints how the scalar epilogue loop should be
844// lowered.
845enum ScalarEpilogueLowering {
846
847 // The default: allowing scalar epilogues.
848 CM_ScalarEpilogueAllowed,
849
850 // Vectorization with OptForSize: don't allow epilogues.
851 CM_ScalarEpilogueNotAllowedOptSize,
852
  // A special case of vectorization with OptForSize: loops with a very small
854 // trip count are considered for vectorization under OptForSize, thereby
855 // making sure the cost of their loop body is dominant, free of runtime
856 // guards and scalar iteration overheads.
857 CM_ScalarEpilogueNotAllowedLowTripLoop,
858
859 // Loop hint predicate indicating an epilogue is undesired.
860 CM_ScalarEpilogueNotNeededUsePredicate,
861
862 // Directive indicating we must either tail fold or not vectorize
863 CM_ScalarEpilogueNotAllowedUsePredicate
864};
865
866/// LoopVectorizationCostModel - estimates the expected speedups due to
867/// vectorization.
868/// In many cases vectorization is not profitable. This can happen because of
869/// a number of reasons. In this class we mainly attempt to predict the
870/// expected speedup/slowdowns due to the supported instruction set. We use the
871/// TargetTransformInfo to query the different backends for the cost of
872/// different operations.
873class LoopVectorizationCostModel {
874 friend class LoopVectorizationPlanner;
875
876public:
877 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
878 PredicatedScalarEvolution &PSE, LoopInfo *LI,
879 LoopVectorizationLegality *Legal,
880 const TargetTransformInfo &TTI,
881 const TargetLibraryInfo *TLI, DemandedBits *DB,
882 AssumptionCache *AC,
883 OptimizationRemarkEmitter *ORE,
884 std::function<BlockFrequencyInfo &()> GetBFI,
885 const Function *F, const LoopVectorizeHints *Hints,
886 InterleavedAccessInfo &IAI, bool OptForSize)
887 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
888 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
889 TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
890 OptForSize(OptForSize) {
891 if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
892 initializeVScaleForTuning();
893 CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
894 }
895
896 /// \return An upper bound for the vectorization factors (both fixed and
897 /// scalable). If the factors are 0, vectorization and interleaving should be
898 /// avoided up front.
899 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
900
901 /// \return True if runtime checks are required for vectorization, and false
902 /// otherwise.
903 bool runtimeChecksRequired();
904
905 /// Setup cost-based decisions for user vectorization factor.
906 /// \return true if the UserVF is a feasible VF to be chosen.
907 bool selectUserVectorizationFactor(ElementCount UserVF) {
908 collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
909 return expectedCost(VF: UserVF).isValid();
910 }
911
912 /// \return True if maximizing vector bandwidth is enabled by the target or
913 /// user options, for the given register kind.
914 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
915
916 /// \return True if register pressure should be considered for the given VF.
917 bool shouldConsiderRegPressureForVF(ElementCount VF);
918
919 /// \return The size (in bits) of the smallest and widest types in the code
920 /// that needs to be vectorized. We ignore values that remain scalar such as
921 /// 64 bit loop indices.
922 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
923
924 /// Memory access instruction may be vectorized in more than one way.
925 /// Form of instruction after vectorization depends on cost.
926 /// This function takes cost-based decisions for Load/Store instructions
927 /// and collects them in a map. This decisions map is used for building
928 /// the lists of loop-uniform and loop-scalar instructions.
929 /// The calculated cost is saved with widening decision in order to
930 /// avoid redundant calculations.
931 void setCostBasedWideningDecision(ElementCount VF);
932
933 /// A call may be vectorized in different ways depending on whether we have
934 /// vectorized variants available and whether the target supports masking.
935 /// This function analyzes all calls in the function at the supplied VF,
936 /// makes a decision based on the costs of available options, and stores that
937 /// decision in a map for use in planning and plan execution.
938 void setVectorizedCallDecision(ElementCount VF);
939
940 /// Collect values we want to ignore in the cost model.
941 void collectValuesToIgnore();
942
943 /// Collect all element types in the loop for which widening is needed.
944 void collectElementTypesForWidening();
945
946 /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
948 void collectInLoopReductions();
949
950 /// Returns true if we should use strict in-order reductions for the given
951 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
952 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
953 /// of FP operations.
954 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
955 return !Hints->allowReordering() && RdxDesc.isOrdered();
956 }
957
958 /// \returns The smallest bitwidth each instruction can be represented with.
959 /// The vector equivalents of these instructions should be truncated to this
960 /// type.
961 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
962 return MinBWs;
963 }
964
965 /// \returns True if it is more profitable to scalarize instruction \p I for
966 /// vectorization factor \p VF.
967 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
968 assert(VF.isVector() &&
969 "Profitable to scalarize relevant only for VF > 1.");
970 assert(
971 TheLoop->isInnermost() &&
972 "cost-model should not be used for outer loops (in VPlan-native path)");
973
974 auto Scalars = InstsToScalarize.find(Key: VF);
975 assert(Scalars != InstsToScalarize.end() &&
976 "VF not yet analyzed for scalarization profitability");
977 return Scalars->second.contains(Key: I);
978 }
979
980 /// Returns true if \p I is known to be uniform after vectorization.
981 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
982 assert(
983 TheLoop->isInnermost() &&
984 "cost-model should not be used for outer loops (in VPlan-native path)");
985 // Pseudo probe needs to be duplicated for each unrolled iteration and
986 // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
988 if (isa<PseudoProbeInst>(Val: I))
989 return false;
990
991 if (VF.isScalar())
992 return true;
993
994 auto UniformsPerVF = Uniforms.find(Val: VF);
995 assert(UniformsPerVF != Uniforms.end() &&
996 "VF not yet analyzed for uniformity");
997 return UniformsPerVF->second.count(Ptr: I);
998 }
999
1000 /// Returns true if \p I is known to be scalar after vectorization.
1001 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1002 assert(
1003 TheLoop->isInnermost() &&
1004 "cost-model should not be used for outer loops (in VPlan-native path)");
1005 if (VF.isScalar())
1006 return true;
1007
1008 auto ScalarsPerVF = Scalars.find(Val: VF);
1009 assert(ScalarsPerVF != Scalars.end() &&
1010 "Scalar values are not calculated for VF");
1011 return ScalarsPerVF->second.count(Ptr: I);
1012 }
1013
1014 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1015 /// for vectorization factor \p VF.
1016 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1017 // Truncs must truncate at most to their destination type.
1018 if (isa_and_nonnull<TruncInst>(Val: I) && MinBWs.contains(Key: I) &&
1019 I->getType()->getScalarSizeInBits() < MinBWs.lookup(Key: I))
1020 return false;
1021 return VF.isVector() && MinBWs.contains(Key: I) &&
1022 !isProfitableToScalarize(I, VF) &&
1023 !isScalarAfterVectorization(I, VF);
1024 }
1025
1026 /// Decision that was taken during cost calculation for memory instruction.
1027 enum InstWidening {
1028 CM_Unknown,
1029 CM_Widen, // For consecutive accesses with stride +1.
1030 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1031 CM_Interleave,
1032 CM_GatherScatter,
1033 CM_Scalarize,
1034 CM_VectorCall,
1035 CM_IntrinsicCall
1036 };
1037
1038 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1039 /// instruction \p I and vector width \p VF.
1040 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1041 InstructionCost Cost) {
1042 assert(VF.isVector() && "Expected VF >=2");
1043 WideningDecisions[{I, VF}] = {W, Cost};
1044 }
1045
1046 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1047 /// interleaving group \p Grp and vector width \p VF.
1048 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1049 ElementCount VF, InstWidening W,
1050 InstructionCost Cost) {
1051 assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
    /// When interleaving, the cost will only be assigned to one instruction,
    /// the insert position. For other cases, add the appropriate fraction of
    /// the total cost to each instruction. This ensures accurate costs are
    /// used, even if the insert position instruction is not used.
    InstructionCost InsertPosCost = Cost;
    InstructionCost OtherMemberCost = 0;
    if (W != CM_Interleave)
      OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
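    // For example, a 4-member group widened with a total cost of 8 assigns a
    // cost of 2 to every member, whereas an interleaved group assigns the full
    // cost to the insert position and 0 to the other members.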
1062 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1063 if (auto *I = Grp->getMember(Index: Idx)) {
1064 if (Grp->getInsertPos() == I)
1065 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1066 else
1067 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1068 }
1069 }
1070 }
1071
1072 /// Return the cost model decision for the given instruction \p I and vector
1073 /// width \p VF. Return CM_Unknown if this instruction did not pass
1074 /// through the cost modeling.
1075 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1076 assert(VF.isVector() && "Expected VF to be a vector VF");
1077 assert(
1078 TheLoop->isInnermost() &&
1079 "cost-model should not be used for outer loops (in VPlan-native path)");
1080
1081 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1082 auto Itr = WideningDecisions.find(Val: InstOnVF);
1083 if (Itr == WideningDecisions.end())
1084 return CM_Unknown;
1085 return Itr->second.first;
1086 }
1087
1088 /// Return the vectorization cost for the given instruction \p I and vector
1089 /// width \p VF.
1090 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1091 assert(VF.isVector() && "Expected VF >=2");
1092 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1093 assert(WideningDecisions.contains(InstOnVF) &&
1094 "The cost is not calculated");
1095 return WideningDecisions[InstOnVF].second;
1096 }
1097
1098 struct CallWideningDecision {
1099 InstWidening Kind;
1100 Function *Variant;
1101 Intrinsic::ID IID;
1102 std::optional<unsigned> MaskPos;
1103 InstructionCost Cost;
1104 };
1105
1106 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1107 Function *Variant, Intrinsic::ID IID,
1108 std::optional<unsigned> MaskPos,
1109 InstructionCost Cost) {
1110 assert(!VF.isScalar() && "Expected vector VF");
1111 CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost};
1112 }
1113
1114 CallWideningDecision getCallWideningDecision(CallInst *CI,
1115 ElementCount VF) const {
1116 assert(!VF.isScalar() && "Expected vector VF");
1117 auto I = CallWideningDecisions.find(Val: {CI, VF});
1118 if (I == CallWideningDecisions.end())
1119 return {.Kind: CM_Unknown, .Variant: nullptr, .IID: Intrinsic::not_intrinsic, .MaskPos: std::nullopt, .Cost: 0};
1120 return I->second;
1121 }
1122
1123 /// Return True if instruction \p I is an optimizable truncate whose operand
1124 /// is an induction variable. Such a truncate will be removed by adding a new
1125 /// induction variable with the destination type.
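  /// For example, a 'trunc i64 %iv to i32' of a 64-bit induction variable can
  /// instead be modeled as a separate 32-bit induction when the truncate is
  /// not free for the target.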
1126 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1127 // If the instruction is not a truncate, return false.
1128 auto *Trunc = dyn_cast<TruncInst>(Val: I);
1129 if (!Trunc)
1130 return false;
1131
1132 // Get the source and destination types of the truncate.
1133 Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
1134 Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);
1135
1136 // If the truncate is free for the given types, return false. Replacing a
1137 // free truncate with an induction variable would add an induction variable
1138 // update instruction to each iteration of the loop. We exclude from this
1139 // check the primary induction variable since it will need an update
1140 // instruction regardless.
1141 Value *Op = Trunc->getOperand(i_nocapture: 0);
1142 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
1143 return false;
1144
1145 // If the truncated value is not an induction variable, return false.
1146 return Legal->isInductionPhi(V: Op);
1147 }
1148
1149 /// Collects the instructions to scalarize for each predicated instruction in
1150 /// the loop.
1151 void collectInstsToScalarize(ElementCount VF);
1152
1153 /// Collect values that will not be widened, including Uniforms, Scalars, and
1154 /// Instructions to Scalarize for the given \p VF.
1155 /// The sets depend on CM decision for Load/Store instructions
1156 /// that may be vectorized as interleave, gather-scatter or scalarized.
1157 /// Also make a decision on what to do about call instructions in the loop
1158 /// at that VF -- scalarize, call a known vector routine, or call a
1159 /// vector intrinsic.
1160 void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
1161 // Do the analysis once.
1162 if (VF.isScalar() || Uniforms.contains(Val: VF))
1163 return;
1164 setCostBasedWideningDecision(VF);
1165 collectLoopUniforms(VF);
1166 setVectorizedCallDecision(VF);
1167 collectLoopScalars(VF);
1168 collectInstsToScalarize(VF);
1169 }
1170
1171 /// Returns true if the target machine supports masked store operation
1172 /// for the given \p DataType and kind of access to \p Ptr.
1173 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
1174 unsigned AddressSpace) const {
1175 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1176 TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
1177 }
1178
1179 /// Returns true if the target machine supports masked load operation
1180 /// for the given \p DataType and kind of access to \p Ptr.
1181 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
1182 unsigned AddressSpace) const {
1183 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1184 TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
1185 }
1186
1187 /// Returns true if the target machine can represent \p V as a masked gather
1188 /// or scatter operation.
1189 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1190 bool LI = isa<LoadInst>(Val: V);
1191 bool SI = isa<StoreInst>(Val: V);
1192 if (!LI && !SI)
1193 return false;
1194 auto *Ty = getLoadStoreType(I: V);
1195 Align Align = getLoadStoreAlignment(I: V);
1196 if (VF.isVector())
1197 Ty = VectorType::get(ElementType: Ty, EC: VF);
1198 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1199 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1200 }
1201
1202 /// Returns true if the target machine supports all of the reduction
1203 /// variables found for the given VF.
1204 bool canVectorizeReductions(ElementCount VF) const {
1205 return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
1206 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1207 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1208 }));
1209 }
1210
1211 /// Given costs for both strategies, return true if the scalar predication
1212 /// lowering should be used for div/rem. This incorporates an override
1213 /// option so it is not simply a cost comparison.
1214 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1215 InstructionCost SafeDivisorCost) const {
1216 switch (ForceSafeDivisor) {
1217 case cl::BOU_UNSET:
1218 return ScalarCost < SafeDivisorCost;
1219 case cl::BOU_TRUE:
1220 return false;
1221 case cl::BOU_FALSE:
1222 return true;
1223 }
1224 llvm_unreachable("impossible case value");
1225 }
1226
1227 /// Returns true if \p I is an instruction which requires predication and
1228 /// for which our chosen predication strategy is scalarization (i.e. we
1229 /// don't have an alternate strategy such as masking available).
1230 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1231 bool isScalarWithPredication(Instruction *I, ElementCount VF);
1232
1233 /// Returns true if \p I is an instruction that needs to be predicated
1234 /// at runtime. The result is independent of the predication mechanism.
1235 /// Superset of instructions that return true for isScalarWithPredication.
1236 bool isPredicatedInst(Instruction *I) const;
1237
1238 /// A helper function that returns how much we should divide the cost of a
1239 /// predicated block by. Typically this is the reciprocal of the block
1240 /// probability, i.e. if we return X we are assuming the predicated block will
1241 /// execute once for every X iterations of the loop header so the block should
1242 /// only contribute 1/X of its cost to the total cost calculation, but when
1243 /// optimizing for code size it will just be 1 as code size costs don't depend
1244 /// on execution probabilities.
1245 ///
1246 /// Note that if a block wasn't originally predicated but was predicated due
1247 /// to tail folding, the divisor will still be 1 because it will execute for
1248 /// every iteration of the loop header.
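  /// For example, a predicated block expected to execute once every two loop
  /// iterations gets a divisor of 2, so only half of its cost is counted.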
1249 inline uint64_t
1250 getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1251 const BasicBlock *BB);
1252
1253 /// Return the costs for our two available strategies for lowering a
1254 /// div/rem operation which requires speculating at least one lane.
1255 /// First result is for scalarization (will be invalid for scalable
1256 /// vectors); second is for the safe-divisor strategy.
1257 std::pair<InstructionCost, InstructionCost>
1258 getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1259
1260 /// Returns true if \p I is a memory instruction with consecutive memory
1261 /// access that can be widened.
1262 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1263
1264 /// Returns true if \p I is a memory instruction in an interleaved-group
1265 /// of memory accesses that can be vectorized with wide vector loads/stores
1266 /// and shuffles.
1267 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1268
1269 /// Check if \p Instr belongs to any interleaved access group.
1270 bool isAccessInterleaved(Instruction *Instr) const {
1271 return InterleaveInfo.isInterleaved(Instr);
1272 }
1273
1274 /// Get the interleaved access group that \p Instr belongs to.
1275 const InterleaveGroup<Instruction> *
1276 getInterleavedAccessGroup(Instruction *Instr) const {
1277 return InterleaveInfo.getInterleaveGroup(Instr);
1278 }
1279
1280 /// Returns true if we're required to use a scalar epilogue for at least
1281 /// the final iteration of the original loop.
1282 bool requiresScalarEpilogue(bool IsVectorizing) const {
1283 if (!isScalarEpilogueAllowed()) {
1284 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1285 return false;
1286 }
1287 // If we might exit from anywhere but the latch and early exit vectorization
1288 // is disabled, we must run the exiting iteration in scalar form.
1289 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1290 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1291 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1292 "from latch block\n");
1293 return true;
1294 }
1295 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1296 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1297 "interleaved group requires scalar epilogue\n");
1298 return true;
1299 }
1300 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1301 return false;
1302 }
1303
1304 /// Returns true if a scalar epilogue is allowed, i.e. it has not been
1305 /// disallowed due to optsize or a loop hint annotation.
1306 bool isScalarEpilogueAllowed() const {
1307 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1308 }
1309
1310 /// Returns true if tail-folding is preferred over a scalar epilogue.
1311 bool preferPredicatedLoop() const {
1312 return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
1313 ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
1314 }
1315
1316 /// Returns the TailFoldingStyle that is best for the current loop.
1317 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1318 if (!ChosenTailFoldingStyle)
1319 return TailFoldingStyle::None;
1320 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1321 : ChosenTailFoldingStyle->second;
1322 }
1323
1324 /// Selects and saves the TailFoldingStyle for two cases: whether the IV
1325 /// update may overflow or not.
1326 /// \param IsScalableVF True if scalable vector factors are enabled.
1327 /// \param UserIC User-specified interleave count.
1328 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1329 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1330 if (!Legal->canFoldTailByMasking()) {
1331 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1332 return;
1333 }
1334
1335 // Default to TTI preference, but allow command line override.
1336 ChosenTailFoldingStyle = {
1337 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1338 TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
1339 if (ForceTailFoldingStyle.getNumOccurrences())
1340 ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
1341 ForceTailFoldingStyle.getValue()};
1342
1343 if (ChosenTailFoldingStyle->first != TailFoldingStyle::DataWithEVL &&
1344 ChosenTailFoldingStyle->second != TailFoldingStyle::DataWithEVL)
1345 return;
1346 // Override EVL styles if needed.
1347 // FIXME: Investigate opportunity for fixed vector factor.
1348 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1349 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1350 if (EVLIsLegal)
1351 return;
1352 // If for some reason EVL mode is unsupported, fall back to a scalar epilogue
1353 // if it's allowed, or to DataWithoutLaneMask otherwise.
1354 if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
1355 ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
1356 ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
1357 else
1358 ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
1359 TailFoldingStyle::DataWithoutLaneMask};
1360
1361 LLVM_DEBUG(
1362 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1363 "not try to generate VP Intrinsics "
1364 << (UserIC > 1
1365 ? "since interleave count specified is greater than 1.\n"
1366 : "due to non-interleaving reasons.\n"));
1367 }
1368
1369 /// Returns true if all loop blocks should be masked to fold tail loop.
1370 bool foldTailByMasking() const {
1371 // TODO: check if it is possible to check for None style independent of
1372 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1373 return getTailFoldingStyle() != TailFoldingStyle::None;
1374 }
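// Illustrative example (assumed numbers): with a trip count of 10 and VF=4,
// folding the tail by masking runs 3 masked vector iterations with lane
// masks <1,1,1,1>, <1,1,1,1> and <1,1,0,0>, instead of 2 unmasked vector
// iterations followed by a 2-iteration scalar epilogue.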
1375
1376 /// Returns true if the use of wide lane masks is requested and the loop is
1377 /// using tail-folding with a lane mask for control flow.
1378 bool useWideActiveLaneMask() const {
1379 if (!EnableWideActiveLaneMask)
1380 return false;
1381
1382 TailFoldingStyle TF = getTailFoldingStyle();
1383 return TF == TailFoldingStyle::DataAndControlFlow ||
1384 TF == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
1385 }
1386
1387 /// Return the maximum safe number of elements to be processed per vector
1388 /// iteration, i.e. a bound that does not prevent store-load forwarding and is
1389 /// safe with regard to the memory dependencies. Required for EVL-based VPlans
1390 /// to correctly calculate AVL (application vector length) as min(remaining
1391 /// AVL, MaxSafeElements).
1392 /// TODO: need to consider adjusting cost model to use this value as a
1393 /// vectorization factor for EVL-based vectorization.
1394 std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
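// Worked example (hypothetical numbers): with MaxSafeElements = 8 and 16
// scalar iterations remaining, an EVL-based plan must clamp the requested
// AVL to min(16, 8) = 8, even if the hardware vector registers could process
// all 16 elements at once, so the memory dependence distance is respected.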
1395
1396 /// Returns true if the instructions in block \p BB require predication
1397 /// for any reason, e.g. because tail folding now requires a predicate
1398 /// or because the block in the original loop was predicated.
1399 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1400 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1401 }
1402
1403 /// Returns true if VP intrinsics with explicit vector length support should
1404 /// be generated in the tail folded loop.
1405 bool foldTailWithEVL() const {
1406 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1407 }
1408
1409 /// Returns true if the Phi is part of an inloop reduction.
1410 bool isInLoopReduction(PHINode *Phi) const {
1411 return InLoopReductions.contains(Ptr: Phi);
1412 }
1413
1414 /// Returns the set of in-loop reduction PHIs.
1415 const SmallPtrSetImpl<PHINode *> &getInLoopReductions() const {
1416 return InLoopReductions;
1417 }
1418
1419 /// Returns true if the predicated reduction select should be used to set the
1420 /// incoming value for the reduction phi.
1421 bool usePredicatedReductionSelect() const {
1422 // Force the use of the predicated reduction select since the EVL of the
1423 // second-to-last iteration might not be VF*UF.
1424 if (foldTailWithEVL())
1425 return true;
1426 return PreferPredicatedReductionSelect ||
1427 TTI.preferPredicatedReductionSelect();
1428 }
1429
1430 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1431 /// with factor VF. Return the cost of the instruction, including
1432 /// scalarization overhead if it's needed.
1433 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1434
1435 /// Estimate cost of a call instruction CI if it were vectorized with factor
1436 /// VF. Return the cost of the instruction, including scalarization overhead
1437 /// if it's needed.
1438 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1439
1440 /// Invalidates decisions already taken by the cost model.
1441 void invalidateCostModelingDecisions() {
1442 WideningDecisions.clear();
1443 CallWideningDecisions.clear();
1444 Uniforms.clear();
1445 Scalars.clear();
1446 }
1447
1448 /// Returns the expected execution cost. The unit of the cost does
1449 /// not matter because we use the 'cost' units to compare different
1450 /// vector widths. The cost that is returned is *not* normalized by
1451 /// the vectorization factor.
1452 InstructionCost expectedCost(ElementCount VF);
1453
1454 bool hasPredStores() const { return NumPredStores > 0; }
1455
1456 /// Returns true if epilogue vectorization is considered profitable, and
1457 /// false otherwise.
1458 /// \p VF is the vectorization factor chosen for the original loop.
1459 /// \p IC is the interleave count, an additional scaling factor applied to VF
1460 /// before comparing to EpilogueVectorizationMinVF.
1461 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1462 const unsigned IC) const;
1463
1464 /// Returns the execution time cost of an instruction for a given vector
1465 /// width. Vector width of one means scalar.
1466 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1467
1468 /// Return the cost of instructions in an inloop reduction pattern, if I is
1469 /// part of that pattern.
1470 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1471 ElementCount VF,
1472 Type *VectorTy) const;
1473
1474 /// Returns true if \p Op should be considered invariant and if it is
1475 /// trivially hoistable.
1476 bool shouldConsiderInvariant(Value *Op);
1477
1478 /// Return the value of vscale used for tuning the cost model.
1479 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1480
1481private:
1482 unsigned NumPredStores = 0;
1483
1484 /// Used to store the value of vscale used for tuning the cost model. It is
1485 /// initialized during object construction.
1486 std::optional<unsigned> VScaleForTuning;
1487
1488 /// Initializes the value of vscale used for tuning the cost model. If
1489 /// vscale_range.min == vscale_range.max then use vscale_range.max, else
1490 /// use the value returned by the corresponding TTI method.
1491 void initializeVScaleForTuning() {
1492 const Function *Fn = TheLoop->getHeader()->getParent();
1493 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1494 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1495 auto Min = Attr.getVScaleRangeMin();
1496 auto Max = Attr.getVScaleRangeMax();
1497 if (Max && Min == Max) {
1498 VScaleForTuning = Max;
1499 return;
1500 }
1501 }
1502
1503 VScaleForTuning = TTI.getVScaleForTuning();
1504 }
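// For illustration (assumed attribute values): a function carrying
// vscale_range(2,2) yields VScaleForTuning = 2, so a VF of <vscale x 4> is
// costed as if it had 4 * 2 = 8 lanes; with vscale_range(1,16) the bounds
// differ and the TTI hook decides instead.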
1505
1506 /// \return An upper bound for the vectorization factors for both
1507 /// fixed and scalable vectorization, where the minimum-known number of
1508 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1509 /// disabled or unsupported, then the scalable part will be equal to
1510 /// ElementCount::getScalable(0).
1511 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1512 ElementCount UserVF, unsigned UserIC,
1513 bool FoldTailByMasking);
1514
1515 /// If \p VF * \p UserIC > \p MaxTripCount, clamps VF to the next lower VF that
1516 /// results in VF * UserIC <= MaxTripCount.
1517 ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
1518 unsigned UserIC,
1519 bool FoldTailByMasking) const;
1520
1521 /// \return the maximized element count based on the target's vector
1522 /// registers and the loop trip-count, but limited to a maximum safe VF.
1523 /// This is a helper function of computeFeasibleMaxVF.
1524 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1525 unsigned SmallestType,
1526 unsigned WidestType,
1527 ElementCount MaxSafeVF, unsigned UserIC,
1528 bool FoldTailByMasking);
1529
1530 /// Checks if scalable vectorization is supported and enabled. Caches the
1531 /// result to avoid repeated debug dumps for repeated queries.
1532 bool isScalableVectorizationAllowed();
1533
1534 /// \return the maximum legal scalable VF, based on the safe max number
1535 /// of elements.
1536 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1537
1538 /// Calculate vectorization cost of memory instruction \p I.
1539 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1540
1541 /// The cost computation for scalarized memory instruction.
1542 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1543
1544 /// The cost computation for interleaving group of memory instructions.
1545 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1546
1547 /// The cost computation for Gather/Scatter instruction.
1548 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1549
1550 /// The cost computation for widening instruction \p I with consecutive
1551 /// memory access.
1552 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1553
1554 /// The cost calculation for Load/Store instruction \p I with a uniform pointer:
1555 /// Load: scalar load + broadcast.
1556 /// Store: scalar store + (loop-invariant value stored ? 0 : extract of last
1557 /// element).
1558 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1559
1560 /// Estimate the overhead of scalarizing an instruction. This is a
1561 /// convenience wrapper for the type-based getScalarizationOverhead API.
1562 InstructionCost getScalarizationOverhead(Instruction *I,
1563 ElementCount VF) const;
1564
1565 /// Returns true if an artificially high cost for emulated masked memrefs
1566 /// should be used.
1567 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1568
1569 /// Map of scalar integer values to the smallest bitwidth they can be legally
1570 /// represented as. The vector equivalents of these values should be truncated
1571 /// to this type.
1572 MapVector<Instruction *, uint64_t> MinBWs;
1573
1574 /// A type representing the costs for instructions if they were to be
1575 /// scalarized rather than vectorized. The entries are Instruction-Cost
1576 /// pairs.
1577 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1578
1579 /// Per-VF sets containing all BasicBlocks that are known to be present
1580 /// after vectorization as predicated blocks.
1581 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1582 PredicatedBBsAfterVectorization;
1583
1584 /// Records whether it is allowed to have the original scalar loop execute at
1585 /// least once. This may be needed as a fallback loop in case runtime
1586 /// aliasing/dependence checks fail, or to handle the tail/remainder
1587 /// iterations when the trip count is unknown or doesn't divide by the VF,
1588 /// or as a peel-loop to handle gaps in interleave-groups.
1589 /// Under optsize and when the trip count is very small we don't allow any
1590 /// iterations to execute in the scalar loop.
1591 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1592
1593 /// The finally chosen tail-folding style. The first element is used if the
1594 /// IV update may overflow, the second element if it does not.
1595 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1596 ChosenTailFoldingStyle;
1597
1598 /// true if scalable vectorization is supported and enabled.
1599 std::optional<bool> IsScalableVectorizationAllowed;
1600
1601 /// Maximum safe number of elements to be processed per vector iteration,
1602 /// i.e. a bound that does not prevent store-load forwarding and is safe with
1603 /// regard to the memory dependencies. Required for EVL-based vectorization,
1604 /// where this value is used as the upper bound of the safe AVL.
1605 std::optional<unsigned> MaxSafeElements;
1606
1607 /// A map holding scalar costs for different vectorization factors. The
1608 /// presence of a cost for an instruction in the mapping indicates that the
1609 /// instruction will be scalarized when vectorizing with the associated
1610 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1611 MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
1612
1613 /// Holds the instructions known to be uniform after vectorization.
1614 /// The data is collected per VF.
1615 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1616
1617 /// Holds the instructions known to be scalar after vectorization.
1618 /// The data is collected per VF.
1619 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1620
1621 /// Holds the instructions (address computations) that are forced to be
1622 /// scalarized.
1623 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1624
1625 /// PHINodes of the reductions that should be expanded in-loop.
1626 SmallPtrSet<PHINode *, 4> InLoopReductions;
1627
1628 /// A map of in-loop reduction operations and their immediate chain operand.
1629 /// FIXME: This can be removed once reductions can be costed correctly in
1630 /// VPlan. This was added to allow quick lookup of the inloop operations.
1631 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1632
1633 /// Returns the expected difference in cost from scalarizing the expression
1634 /// feeding a predicated instruction \p PredInst. The instructions to
1635 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1636 /// non-negative return value implies the expression will be scalarized.
1637 /// Currently, only single-use chains are considered for scalarization.
1638 InstructionCost computePredInstDiscount(Instruction *PredInst,
1639 ScalarCostsTy &ScalarCosts,
1640 ElementCount VF);
1641
1642 /// Collect the instructions that are uniform after vectorization. An
1643 /// instruction is uniform if we represent it with a single scalar value in
1644 /// the vectorized loop corresponding to each vector iteration. Examples of
1645 /// uniform instructions include pointer operands of consecutive or
1646 /// interleaved memory accesses. Note that although uniformity implies an
1647 /// instruction will be scalar, the reverse is not true. In general, a
1648 /// scalarized instruction will be represented by VF scalar values in the
1649 /// vectorized loop, each corresponding to an iteration of the original
1650 /// scalar loop.
1651 void collectLoopUniforms(ElementCount VF);
1652
1653 /// Collect the instructions that are scalar after vectorization. An
1654 /// instruction is scalar if it is known to be uniform or will be scalarized
1655 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1656 /// to the list if they are used by a load/store instruction that is marked as
1657 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1658 /// VF values in the vectorized loop, each corresponding to an iteration of
1659 /// the original scalar loop.
1660 void collectLoopScalars(ElementCount VF);
1661
1662 /// Keeps cost model vectorization decision and cost for instructions.
1663 /// Right now it is used for memory instructions only.
1664 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1665 std::pair<InstWidening, InstructionCost>>;
1666
1667 DecisionList WideningDecisions;
1668
1669 using CallDecisionList =
1670 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1671
1672 CallDecisionList CallWideningDecisions;
1673
1674 /// Returns true if \p V is expected to be vectorized and it needs to be
1675 /// extracted.
1676 bool needsExtract(Value *V, ElementCount VF) const {
1677 Instruction *I = dyn_cast<Instruction>(Val: V);
1678 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1679 TheLoop->isLoopInvariant(V: I) ||
1680 getWideningDecision(I, VF) == CM_Scalarize ||
1681 (isa<CallInst>(Val: I) &&
1682 getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize))
1683 return false;
1684
1685 // Assume we can vectorize V (and hence we need extraction) if the
1686 // scalars are not computed yet. This can happen, because it is called
1687 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1688 // the scalars are collected. That should be a safe assumption in most
1689 // cases, because we check if the operands have vectorizable types
1690 // beforehand in LoopVectorizationLegality.
1691 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1692 };
1693
1694 /// Returns a range containing only operands needing to be extracted.
1695 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1696 ElementCount VF) const {
1697
1698 SmallPtrSet<const Value *, 4> UniqueOperands;
1699 SmallVector<Value *, 4> Res;
1700 for (Value *Op : Ops) {
1701 if (isa<Constant>(Val: Op) || !UniqueOperands.insert(Ptr: Op).second ||
1702 !needsExtract(V: Op, VF))
1703 continue;
1704 Res.push_back(Elt: Op);
1705 }
1706 return Res;
1707 }
1708
1709public:
1710 /// The loop that we evaluate.
1711 Loop *TheLoop;
1712
1713 /// Predicated scalar evolution analysis.
1714 PredicatedScalarEvolution &PSE;
1715
1716 /// Loop Info analysis.
1717 LoopInfo *LI;
1718
1719 /// Vectorization legality.
1720 LoopVectorizationLegality *Legal;
1721
1722 /// Vector target information.
1723 const TargetTransformInfo &TTI;
1724
1725 /// Target Library Info.
1726 const TargetLibraryInfo *TLI;
1727
1728 /// Demanded bits analysis.
1729 DemandedBits *DB;
1730
1731 /// Assumption cache.
1732 AssumptionCache *AC;
1733
1734 /// Interface to emit optimization remarks.
1735 OptimizationRemarkEmitter *ORE;
1736
1737 /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
1738 /// unless necessary, e.g. when the loop isn't legal to vectorize or when
1739 /// there is no predication.
1740 std::function<BlockFrequencyInfo &()> GetBFI;
1741 /// The BlockFrequencyInfo returned from GetBFI.
1742 BlockFrequencyInfo *BFI = nullptr;
1743 /// Returns the BlockFrequencyInfo for the function if cached, otherwise
1744 /// fetches it via GetBFI and caches it to avoid repeated indirect calls.
1745 BlockFrequencyInfo &getBFI() {
1746 if (!BFI)
1747 BFI = &GetBFI();
1748 return *BFI;
1749 }
1750
1751 const Function *TheFunction;
1752
1753 /// Loop Vectorize Hint.
1754 const LoopVectorizeHints *Hints;
1755
1756 /// The interleaved access information, containing groups of interleaved
1757 /// accesses with the same stride that are close to each other.
1758 InterleavedAccessInfo &InterleaveInfo;
1759
1760 /// Values to ignore in the cost model.
1761 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1762
1763 /// Values to ignore in the cost model when VF > 1.
1764 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1765
1766 /// All element types found in the loop.
1767 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1768
1769 /// The kind of cost that we are calculating
1770 TTI::TargetCostKind CostKind;
1771
1772 /// Whether this loop should be optimized for size based on function attribute
1773 /// or profile information.
1774 bool OptForSize;
1775
1776 /// The highest VF possible for this loop, without using MaxBandwidth.
1777 FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
1778};
1779} // end namespace llvm
1780
1781namespace {
1782/// Helper struct to manage generating runtime checks for vectorization.
1783///
1784 /// The runtime checks are created up-front in temporary blocks, un-linked from
1785 /// the existing IR, to allow estimating their cost more accurately. After
1786 /// deciding to vectorize, the checks are moved back into the IR. If deciding
1787 /// not to vectorize, the temporary blocks are removed completely.
1788class GeneratedRTChecks {
1789 /// Basic block which contains the generated SCEV checks, if any.
1790 BasicBlock *SCEVCheckBlock = nullptr;
1791
1792 /// The value representing the result of the generated SCEV checks. If it is
1793 /// nullptr no SCEV checks have been generated.
1794 Value *SCEVCheckCond = nullptr;
1795
1796 /// Basic block which contains the generated memory runtime checks, if any.
1797 BasicBlock *MemCheckBlock = nullptr;
1798
1799 /// The value representing the result of the generated memory runtime checks.
1800 /// If it is nullptr no memory runtime checks have been generated.
1801 Value *MemRuntimeCheckCond = nullptr;
1802
1803 DominatorTree *DT;
1804 LoopInfo *LI;
1805 TargetTransformInfo *TTI;
1806
1807 SCEVExpander SCEVExp;
1808 SCEVExpander MemCheckExp;
1809
1810 bool CostTooHigh = false;
1811
1812 Loop *OuterLoop = nullptr;
1813
1814 PredicatedScalarEvolution &PSE;
1815
1816 /// The kind of cost that we are calculating
1817 TTI::TargetCostKind CostKind;
1818
1819public:
1820 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1821 LoopInfo *LI, TargetTransformInfo *TTI,
1822 TTI::TargetCostKind CostKind)
1823 : DT(DT), LI(LI), TTI(TTI),
1824 SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1825 MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1826 PSE(PSE), CostKind(CostKind) {}
1827
1828 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1829 /// accurately estimate the cost of the runtime checks. The blocks are
1830 /// un-linked from the IR and are added back during vector code generation. If
1831 /// there is no vector code generation, the check blocks are removed
1832 /// completely.
1833 void create(Loop *L, const LoopAccessInfo &LAI,
1834 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
1835 OptimizationRemarkEmitter &ORE) {
1836
1837 // Hard cutoff to limit compile-time increase in case a very large number of
1838 // runtime checks needs to be generated.
1839 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1840 // profile info.
1841 CostTooHigh =
1842 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1843 if (CostTooHigh) {
1844 // Mark runtime checks as never succeeding when they exceed the threshold.
1845 MemRuntimeCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
1846 SCEVCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
1847 ORE.emit(RemarkBuilder: [&]() {
1848 return OptimizationRemarkAnalysisAliasing(
1849 DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
1850 L->getHeader())
1851 << "loop not vectorized: too many memory checks needed";
1852 });
1853 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1854 return;
1855 }
1856
1857 BasicBlock *LoopHeader = L->getHeader();
1858 BasicBlock *Preheader = L->getLoopPreheader();
1859
1860 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1861 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1862 // may be used by SCEVExpander. The blocks will be un-linked from their
1863 // predecessors and removed from LI & DT at the end of the function.
1864 if (!UnionPred.isAlwaysTrue()) {
1865 SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
1866 MSSAU: nullptr, BBName: "vector.scevcheck");
1867
1868 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1869 Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
1870 if (isa<Constant>(Val: SCEVCheckCond)) {
1871 // Clean up directly after expanding the predicate to a constant, to
1872 // avoid further expansions re-using anything left over from SCEVExp.
1873 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1874 SCEVCleaner.cleanup();
1875 }
1876 }
1877
1878 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1879 if (RtPtrChecking.Need) {
1880 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1881 MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
1882 BBName: "vector.memcheck");
1883
1884 auto DiffChecks = RtPtrChecking.getDiffChecks();
1885 if (DiffChecks) {
1886 Value *RuntimeVF = nullptr;
1887 MemRuntimeCheckCond = addDiffRuntimeChecks(
1888 Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
1889 GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1890 if (!RuntimeVF)
1891 RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
1892 return RuntimeVF;
1893 },
1894 IC);
1895 } else {
1896 MemRuntimeCheckCond = addRuntimeChecks(
1897 Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
1898 Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
1899 }
1900 assert(MemRuntimeCheckCond &&
1901 "no RT checks generated although RtPtrChecking "
1902 "claimed checks are required");
1903 }
1904
1905 SCEVExp.eraseDeadInstructions(Root: SCEVCheckCond);
1906
1907 if (!MemCheckBlock && !SCEVCheckBlock)
1908 return;
1909
1910 // Unhook the temporary block with the checks, update various places
1911 // accordingly.
1912 if (SCEVCheckBlock)
1913 SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
1914 if (MemCheckBlock)
1915 MemCheckBlock->replaceAllUsesWith(V: Preheader);
1916
1917 if (SCEVCheckBlock) {
1918 SCEVCheckBlock->getTerminator()->moveBefore(
1919 InsertPos: Preheader->getTerminator()->getIterator());
1920 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1921 UI->setDebugLoc(DebugLoc::getTemporary());
1922 Preheader->getTerminator()->eraseFromParent();
1923 }
1924 if (MemCheckBlock) {
1925 MemCheckBlock->getTerminator()->moveBefore(
1926 InsertPos: Preheader->getTerminator()->getIterator());
1927 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1928 UI->setDebugLoc(DebugLoc::getTemporary());
1929 Preheader->getTerminator()->eraseFromParent();
1930 }
1931
1932 DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
1933 if (MemCheckBlock) {
1934 DT->eraseNode(BB: MemCheckBlock);
1935 LI->removeBlock(BB: MemCheckBlock);
1936 }
1937 if (SCEVCheckBlock) {
1938 DT->eraseNode(BB: SCEVCheckBlock);
1939 LI->removeBlock(BB: SCEVCheckBlock);
1940 }
1941
1942 // Outer loop is used as part of the later cost calculations.
1943 OuterLoop = L->getParentLoop();
1944 }
1945
1946 InstructionCost getCost() {
1947 if (SCEVCheckBlock || MemCheckBlock)
1948 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1949
1950 if (CostTooHigh) {
1951 InstructionCost Cost;
1952 Cost.setInvalid();
1953 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1954 return Cost;
1955 }
1956
1957 InstructionCost RTCheckCost = 0;
1958 if (SCEVCheckBlock)
1959 for (Instruction &I : *SCEVCheckBlock) {
1960 if (SCEVCheckBlock->getTerminator() == &I)
1961 continue;
1962 InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
1963 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1964 RTCheckCost += C;
1965 }
1966 if (MemCheckBlock) {
1967 InstructionCost MemCheckCost = 0;
1968 for (Instruction &I : *MemCheckBlock) {
1969 if (MemCheckBlock->getTerminator() == &I)
1970 continue;
1971 InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
1972 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1973 MemCheckCost += C;
1974 }
1975
1976 // If the runtime memory checks are being created inside an outer loop
1977 // we should find out if these checks are outer-loop invariant. If so,
1978 // the checks will likely be hoisted out and so the effective cost will
1979 // be reduced in proportion to the outer loop's trip count.
1980 if (OuterLoop) {
1981 ScalarEvolution *SE = MemCheckExp.getSE();
1982 // TODO: If profitable, we could refine this further by analysing every
1983 // individual memory check, since there could be a mixture of loop
1984 // variant and invariant checks that mean the final condition is
1985 // variant.
1986 const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
1987 if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
1988 // It seems reasonable to assume that we can reduce the effective
1989 // cost of the checks even when we know nothing about the trip
1990 // count. Assume that the outer loop executes at least twice.
1991 unsigned BestTripCount = 2;
1992
1993 // Get the best known TC estimate.
1994 if (auto EstimatedTC = getSmallBestKnownTC(
1995 PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
1996 if (EstimatedTC->isFixed())
1997 BestTripCount = EstimatedTC->getFixedValue();
1998
1999 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
2000
2001 // Let's ensure the cost is always at least 1.
2002 NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
2003 b: (InstructionCost::CostType)1);
2004
2005 if (BestTripCount > 1)
2006 LLVM_DEBUG(dbgs()
2007 << "We expect runtime memory checks to be hoisted "
2008 << "out of the outer loop. Cost reduced from "
2009 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2010
2011 MemCheckCost = NewMemCheckCost;
2012 }
2013 }
2014
2015 RTCheckCost += MemCheckCost;
2016 }
2017
2018 if (SCEVCheckBlock || MemCheckBlock)
2019 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2020 << "\n");
2021
2022 return RTCheckCost;
2023 }
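// Worked example for the hoisting heuristic above (assumed numbers): if the
// memory checks cost 8 and are invariant in an outer loop with an estimated
// trip count of 4, the effective cost charged is 8 / 4 = 2; with no estimate
// available, the conservative assumption of 2 outer iterations halves the
// cost instead.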
2024
2025 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2026 /// unused.
2027 ~GeneratedRTChecks() {
2028 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2029 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2030 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
2031 bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
2032 if (SCEVChecksUsed)
2033 SCEVCleaner.markResultUsed();
2034
2035 if (MemChecksUsed) {
2036 MemCheckCleaner.markResultUsed();
2037 } else {
2038 auto &SE = *MemCheckExp.getSE();
2039 // Memory runtime check generation creates compares that use expanded
2040 // values. Remove them before running the SCEVExpanderCleaners.
2041 for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
2042 if (MemCheckExp.isInsertedInstruction(I: &I))
2043 continue;
2044 SE.forgetValue(V: &I);
2045 I.eraseFromParent();
2046 }
2047 }
2048 MemCheckCleaner.cleanup();
2049 SCEVCleaner.cleanup();
2050
2051 if (!SCEVChecksUsed)
2052 SCEVCheckBlock->eraseFromParent();
2053 if (!MemChecksUsed)
2054 MemCheckBlock->eraseFromParent();
2055 }
2056
2057 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
2058 /// outside VPlan.
2059 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
2060 using namespace llvm::PatternMatch;
2061 if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
2062 return {nullptr, nullptr};
2063
2064 return {SCEVCheckCond, SCEVCheckBlock};
2065 }
2066
2067 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
2068 /// outside VPlan.
2069 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
2070 using namespace llvm::PatternMatch;
2071 if (MemRuntimeCheckCond && match(V: MemRuntimeCheckCond, P: m_ZeroInt()))
2072 return {nullptr, nullptr};
2073 return {MemRuntimeCheckCond, MemCheckBlock};
2074 }
2075
2076 /// Return true if any runtime checks have been added
2077 bool hasChecks() const {
2078 return getSCEVChecks().first || getMemRuntimeChecks().first;
2079 }
2080};
2081} // namespace
2082
2083static bool useActiveLaneMask(TailFoldingStyle Style) {
2084 return Style == TailFoldingStyle::Data ||
2085 Style == TailFoldingStyle::DataAndControlFlow ||
2086 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2087}
2088
2089static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2090 return Style == TailFoldingStyle::DataAndControlFlow ||
2091 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2092}
2093
2094// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2095// vectorization. The loop needs to be annotated with #pragma omp simd
2096 // simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2097// vector length information is not provided, vectorization is not considered
2098// explicit. Interleave hints are not allowed either. These limitations will be
2099// relaxed in the future.
2100 // Please note that we are currently forced to abuse the pragma 'clang
2101// vectorize' semantics. This pragma provides *auto-vectorization hints*
2102// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2103// provides *explicit vectorization hints* (LV can bypass legal checks and
2104// assume that vectorization is legal). However, both hints are implemented
2105// using the same metadata (llvm.loop.vectorize, processed by
2106// LoopVectorizeHints). This will be fixed in the future when the native IR
2107// representation for pragma 'omp simd' is introduced.
2108static bool isExplicitVecOuterLoop(Loop *OuterLp,
2109 OptimizationRemarkEmitter *ORE) {
2110 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2111 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2112
2113 // Only outer loops with an explicit vectorization hint are supported.
2114 // Unannotated outer loops are ignored.
2115 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2116 return false;
2117
2118 Function *Fn = OuterLp->getHeader()->getParent();
2119 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2120 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2121 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2122 return false;
2123 }
2124
2125 if (Hints.getInterleave() > 1) {
2126 // TODO: Interleave support is future work.
2127 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2128 "outer loops.\n");
2129 Hints.emitRemarkWithHints();
2130 return false;
2131 }
2132
2133 return true;
2134}
2135
2136static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2137 OptimizationRemarkEmitter *ORE,
2138 SmallVectorImpl<Loop *> &V) {
2139 // Collect inner loops and outer loops without irreducible control flow. For
2140 // now, only collect outer loops that have explicit vectorization hints. If we
2141 // are stress testing the VPlan H-CFG construction, we collect the outermost
2142 // loop of every loop nest.
2143 if (L.isInnermost() || VPlanBuildStressTest ||
2144 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2145 LoopBlocksRPO RPOT(&L);
2146 RPOT.perform(LI);
2147 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2148 V.push_back(Elt: &L);
2149 // TODO: Collect inner loops inside marked outer loops in case
2150 // vectorization fails for the outer loop. Do not invoke
2151 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2152 // already known to be reducible. We can use an inherited attribute for
2153 // that.
2154 return;
2155 }
2156 }
2157 for (Loop *InnerL : L)
2158 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2159}
2160
2161//===----------------------------------------------------------------------===//
2162// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2163// LoopVectorizationCostModel and LoopVectorizationPlanner.
2164//===----------------------------------------------------------------------===//
2165
2166/// FIXME: The newly created binary instructions should contain nsw/nuw
2167/// flags, which can be found from the original scalar operations.
2168Value *
2169llvm::emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2170 Value *Step,
2171 InductionDescriptor::InductionKind InductionKind,
2172 const BinaryOperator *InductionBinOp) {
2173 using namespace llvm::PatternMatch;
2174 Type *StepTy = Step->getType();
2175 Value *CastedIndex = StepTy->isIntegerTy()
2176 ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
2177 : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
2178 if (CastedIndex != Index) {
2179 CastedIndex->setName(CastedIndex->getName() + ".cast");
2180 Index = CastedIndex;
2181 }
2182
2183 // Note: the IR at this point is broken. We cannot use SE to create any new
2184 // SCEV and then expand it, hoping that SCEV's simplification will give us
2185 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2186 // lead to various SCEV crashes. So all we can do is use the builder and rely
2187 // on InstCombine for future simplifications. Here we handle only some trivial
2188 // cases.
2189 auto CreateAdd = [&B](Value *X, Value *Y) {
2190 assert(X->getType() == Y->getType() && "Types don't match!");
2191 if (match(V: X, P: m_ZeroInt()))
2192 return Y;
2193 if (match(V: Y, P: m_ZeroInt()))
2194 return X;
2195 return B.CreateAdd(LHS: X, RHS: Y);
2196 };
2197
2198 // We allow X to be a vector type, in which case Y will potentially be
2199 // splatted into a vector with the same element count.
2200 auto CreateMul = [&B](Value *X, Value *Y) {
2201 assert(X->getType()->getScalarType() == Y->getType() &&
2202 "Types don't match!");
2203 if (match(V: X, P: m_One()))
2204 return Y;
2205 if (match(V: Y, P: m_One()))
2206 return X;
2207 VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
2208 if (XVTy && !isa<VectorType>(Val: Y->getType()))
2209 Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
2210 return B.CreateMul(LHS: X, RHS: Y);
2211 };
2212
2213 switch (InductionKind) {
2214 case InductionDescriptor::IK_IntInduction: {
2215 assert(!isa<VectorType>(Index->getType()) &&
2216 "Vector indices not supported for integer inductions yet");
2217 assert(Index->getType() == StartValue->getType() &&
2218 "Index type does not match StartValue type");
2219 if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
2220 return B.CreateSub(LHS: StartValue, RHS: Index);
2221 auto *Offset = CreateMul(Index, Step);
2222 return CreateAdd(StartValue, Offset);
2223 }
2224 case InductionDescriptor::IK_PtrInduction:
2225 return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
2226 case InductionDescriptor::IK_FpInduction: {
2227 assert(!isa<VectorType>(Index->getType()) &&
2228 "Vector indices not supported for FP inductions yet");
2229 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2230 assert(InductionBinOp &&
2231 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2232 InductionBinOp->getOpcode() == Instruction::FSub) &&
2233 "Original bin op should be defined for FP induction");
2234
2235 Value *MulExp = B.CreateFMul(L: Step, R: Index);
2236 return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
2237 Name: "induction");
2238 }
2239 case InductionDescriptor::IK_NoInduction:
2240 return nullptr;
2241 }
2242 llvm_unreachable("invalid enum");
2243}
2244
2245static std::optional<unsigned> getMaxVScale(const Function &F,
2246 const TargetTransformInfo &TTI) {
2247 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2248 return MaxVScale;
2249
2250 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2251 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2252
2253 return std::nullopt;
2254}
2255
2256 /// For the given VF and UF and maximum trip count computed for the loop, return
2257 /// true if the induction variable cannot overflow in the vectorized loop, in
2258 /// which case the runtime overflow check always evaluates to false and can be
2259 /// removed.
2260static bool isIndvarOverflowCheckKnownFalse(
2261 const LoopVectorizationCostModel *Cost,
2262 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2263 // Always be conservative if we don't know the exact unroll factor.
2264 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2265
2266 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
2267 APInt MaxUIntTripCount = IdxTy->getMask();
2268
2269 // We know the runtime overflow check is known false iff the (max) trip-count
2270 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2271 // the vector loop induction variable.
2272 if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
2273 uint64_t MaxVF = VF.getKnownMinValue();
2274 if (VF.isScalable()) {
2275 std::optional<unsigned> MaxVScale =
2276 getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
2277 if (!MaxVScale)
2278 return false;
2279 MaxVF *= *MaxVScale;
2280 }
2281
2282 return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
2283 }
2284
2285 return false;
2286}
2287
2288// Return whether we allow using masked interleave-groups (for dealing with
2289// strided loads/stores that reside in predicated blocks, or for dealing
2290// with gaps).
2291static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2292 // If an override option has been passed in for interleaved accesses, use it.
2293 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2294 return EnableMaskedInterleavedMemAccesses;
2295
2296 return TTI.enableMaskedInterleavedAccessVectorization();
2297}
2298
2299void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan(
2300 BasicBlock *CheckIRBB) {
2301 // Note: The block with the minimum trip-count check is already connected
2302 // during earlier VPlan construction.
2303 VPBlockBase *ScalarPH = Plan.getScalarPreheader();
2304 VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
2305 assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
2306 assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
2307 VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB);
2308 VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPBB, BlockPtr: CheckVPIRBB);
2309 PreVectorPH = CheckVPIRBB;
2310 VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH);
2311 PreVectorPH->swapSuccessors();
2312
2313 // We just connected a new block to the scalar preheader. Update all
2314 // VPPhis by adding an incoming value for it, replicating the last value.
2315 unsigned NumPredecessors = ScalarPH->getNumPredecessors();
2316 for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) {
2317 assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
2318 assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
2319 "must have incoming values for all operands");
2320 R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2));
2321 }
2322}
2323
2324Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
2325 BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
2326 // Generate code to check if the loop's trip count is less than VF * UF, or
2327 // equal to it in case a scalar epilogue is required; this implies that the
2328 // vector trip count is zero. This check also covers the case where adding one
2329 // to the backedge-taken count overflowed leading to an incorrect trip count
2330 // of zero. In this case we will also jump to the scalar loop.
2331 auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
2332 : ICmpInst::ICMP_ULT;
2333
2334 // Reuse the existing vector loop preheader for the TC checks.
2335 // Note that a new preheader block is generated for the vector loop.
2336 BasicBlock *const TCCheckBlock = VectorPH;
2337 IRBuilder<InstSimplifyFolder> Builder(
2338 TCCheckBlock->getContext(),
2339 InstSimplifyFolder(TCCheckBlock->getDataLayout()));
2340 Builder.SetInsertPoint(TCCheckBlock->getTerminator());
2341
2342 // If tail is to be folded, vector loop takes care of all iterations.
2343 Value *Count = getTripCount();
2344 Type *CountTy = Count->getType();
2345 Value *CheckMinIters = Builder.getFalse();
2346 auto CreateStep = [&]() -> Value * {
2347 // Create step with max(MinProTripCount, UF * VF).
2348 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2349 return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);
2350
2351 Value *MinProfTC =
2352 Builder.CreateElementCount(Ty: CountTy, EC: MinProfitableTripCount);
2353 if (!VF.isScalable())
2354 return MinProfTC;
2355 return Builder.CreateBinaryIntrinsic(
2356 ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
2357 };
2358
2359 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2360 if (Style == TailFoldingStyle::None) {
2361 Value *Step = CreateStep();
2362 ScalarEvolution &SE = *PSE.getSE();
2363 // TODO: Emit unconditional branch to vector preheader instead of
2364 // conditional branch with known condition.
2365 const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop);
2366 // Check if the trip count is < the step.
2367 if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
2368 // TODO: Ensure step is at most the trip count when determining max VF and
2369 // UF, w/o tail folding.
2370 CheckMinIters = Builder.getTrue();
2371 } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P),
2372 LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
2373 // Generate the minimum iteration check only if we cannot prove the
2374 // check is known to be true, or known to be false.
2375 CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check");
2376 } // else step known to be < trip count, use CheckMinIters preset to false.
2377 } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
2378 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2379 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2380 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2381 // an overflow to zero when updating induction variables and so an
2382 // additional overflow check is required before entering the vector loop.
2383
2384 // Get the maximum unsigned value for the type.
2385 Value *MaxUIntTripCount =
2386 ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
2387 Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);
2388
2389 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2390 CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
2391 }
2392 return CheckMinIters;
2393}
2394
2395/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2396/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
2397/// predecessors and successors of VPBB, if any, are rewired to the new
2398/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
2399static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
2400 BasicBlock *IRBB,
2401 VPlan *Plan = nullptr) {
2402 if (!Plan)
2403 Plan = VPBB->getPlan();
2404 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
2405 auto IP = IRVPBB->begin();
2406 for (auto &R : make_early_inc_range(Range: VPBB->phis()))
2407 R.moveBefore(BB&: *IRVPBB, I: IP);
2408
2409 for (auto &R :
2410 make_early_inc_range(Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end())))
2411 R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());
2412
2413 VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
2414 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2415 return IRVPBB;
2416}
2417
2418BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
2419 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2420 assert(VectorPH && "Invalid loop structure");
2421 assert((OrigLoop->getUniqueLatchExitBlock() ||
2422 Cost->requiresScalarEpilogue(VF.isVector())) &&
2423 "loops not exiting via the latch without required epilogue?");
2424
2425 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2426 // wrapping the newly created scalar preheader here at the moment, because the
2427 // Plan's scalar preheader may be unreachable at this point. Instead it is
2428 // replaced in executePlan.
2429 return SplitBlock(Old: VectorPH, SplitPt: VectorPH->getTerminator(), DT, LI, MSSAU: nullptr,
2430 BBName: Twine(Prefix) + "scalar.ph");
2431}
2432
2433/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2434/// expansion results.
2435static Value *getExpandedStep(const InductionDescriptor &ID,
2436 const SCEV2ValueTy &ExpandedSCEVs) {
2437 const SCEV *Step = ID.getStep();
2438 if (auto *C = dyn_cast<SCEVConstant>(Val: Step))
2439 return C->getValue();
2440 if (auto *U = dyn_cast<SCEVUnknown>(Val: Step))
2441 return U->getValue();
2442 Value *V = ExpandedSCEVs.lookup(Val: Step);
2443 assert(V && "SCEV must be expanded at this point");
2444 return V;
2445}
2446
2447 /// Knowing that loop \p L executes a single vector iteration, add to \p
2448 /// InstsToIgnore the instructions that will get simplified away and thus
2449 /// should not have any cost.
2450static void addFullyUnrolledInstructionsToIgnore(
2451 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2452 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2453 auto *Cmp = L->getLatchCmpInst();
2454 if (Cmp)
2455 InstsToIgnore.insert(Ptr: Cmp);
2456 for (const auto &KV : IL) {
2457 // Extract the key by hand so that it can be used in the lambda below. Note
2458 // that captured structured bindings are a C++20 extension.
2459 const PHINode *IV = KV.first;
2460
2461 // Get next iteration value of the induction variable.
2462 Instruction *IVInst =
2463 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2464 if (all_of(Range: IVInst->users(),
2465 P: [&](const User *U) { return U == IV || U == Cmp; }))
2466 InstsToIgnore.insert(Ptr: IVInst);
2467 }
2468}
2469
2470BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2471 // Create a new IR basic block for the scalar preheader.
2472 BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
2473 return ScalarPH->getSinglePredecessor();
2474}
2475
2476namespace {
2477
2478struct CSEDenseMapInfo {
2479 static bool canHandle(const Instruction *I) {
2480 return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
2481 isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
2482 }
2483
2484 static inline Instruction *getEmptyKey() {
2485 return DenseMapInfo<Instruction *>::getEmptyKey();
2486 }
2487
2488 static inline Instruction *getTombstoneKey() {
2489 return DenseMapInfo<Instruction *>::getTombstoneKey();
2490 }
2491
2492 static unsigned getHashValue(const Instruction *I) {
2493 assert(canHandle(I) && "Unknown instruction!");
2494 return hash_combine(args: I->getOpcode(),
2495 args: hash_combine_range(R: I->operand_values()));
2496 }
2497
2498 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2499 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2500 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2501 return LHS == RHS;
2502 return LHS->isIdenticalTo(I: RHS);
2503 }
2504};
2505
2506} // end anonymous namespace
2507
2508/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2509/// removal, in favor of the VPlan-based one.
2510static void legacyCSE(BasicBlock *BB) {
2511 // Perform simple cse.
2512 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2513 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2514 if (!CSEDenseMapInfo::canHandle(I: &In))
2515 continue;
2516
2517 // Check if we can replace this instruction with any of the
2518 // visited instructions.
2519 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2520 In.replaceAllUsesWith(V);
2521 In.eraseFromParent();
2522 continue;
2523 }
2524
2525 CSEMap[&In] = &In;
2526 }
2527}
2528
2529/// This function attempts to return a value that represents the ElementCount
2530/// at runtime. For fixed-width VFs we know this precisely at compile
2531/// time, but for scalable VFs we calculate it based on an estimate of the
2532/// vscale value.
2533static unsigned estimateElementCount(ElementCount VF,
2534 std::optional<unsigned> VScale) {
2535 unsigned EstimatedVF = VF.getKnownMinValue();
2536 if (VF.isScalable())
2537 if (VScale)
2538 EstimatedVF *= *VScale;
2539 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2540 return EstimatedVF;
2541}
2542
2543InstructionCost
2544LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2545 ElementCount VF) const {
2546 // We only need to calculate a cost if the VF is scalar; for actual vectors
2547 // we should already have a pre-calculated cost at each VF.
2548 if (!VF.isScalar())
2549 return getCallWideningDecision(CI, VF).Cost;
2550
2551 Type *RetTy = CI->getType();
2552 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
2553 if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
2554 return *RedCost;
2555
2556 SmallVector<Type *, 4> Tys;
2557 for (auto &ArgOp : CI->args())
2558 Tys.push_back(Elt: ArgOp->getType());
2559
2560 InstructionCost ScalarCallCost =
2561 TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);
2562
2563 // If this is an intrinsic we may have a lower cost for it.
2564 if (getVectorIntrinsicIDForCall(CI, TLI)) {
2565 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2566 return std::min(a: ScalarCallCost, b: IntrinsicCost);
2567 }
2568 return ScalarCallCost;
2569}
2570
2571static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2572 if (VF.isScalar() || !canVectorizeTy(Ty))
2573 return Ty;
2574 return toVectorizedTy(Ty, EC: VF);
2575}
2576
2577InstructionCost
2578LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2579 ElementCount VF) const {
2580 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2581 assert(ID && "Expected intrinsic call!");
2582 Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
2583 FastMathFlags FMF;
2584 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
2585 FMF = FPMO->getFastMathFlags();
2586
2587 SmallVector<const Value *> Arguments(CI->args());
2588 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2589 SmallVector<Type *> ParamTys;
2590 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
2591 result: std::back_inserter(x&: ParamTys),
2592 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2593
2594 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2595 dyn_cast<IntrinsicInst>(Val: CI),
2596 InstructionCost::getInvalid(), TLI);
2597 return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
2598}
2599
2600void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2601 // Fix widened non-induction PHIs by setting up the PHI operands.
2602 fixNonInductionPHIs(State);
2603
2604 // Don't apply optimizations below when no (vector) loop remains, as they all
2605 // require one at the moment.
2606 VPBasicBlock *HeaderVPBB =
2607 vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
2608 if (!HeaderVPBB)
2609 return;
2610
2611 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2612
2613 // Remove redundant induction instructions.
2614 legacyCSE(BB: HeaderBB);
2615}
2616
2617void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
2618 auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
2619 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2620 for (VPRecipeBase &P : VPBB->phis()) {
2621 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
2622 if (!VPPhi)
2623 continue;
2624 PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
2625 // Make sure the builder has a valid insert point.
2626 Builder.SetInsertPoint(NewPhi);
2627 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2628 NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
2629 }
2630 }
2631}
2632
2633void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2634 // We should not collect Scalars more than once per VF. Right now, this
2635 // function is called from collectUniformsAndScalars(), which already does
2636 // this check. Collecting Scalars for VF=1 does not make any sense.
2637 assert(VF.isVector() && !Scalars.contains(VF) &&
2638 "This function should not be visited twice for the same VF");
2639
2640 // For scalable VFs, treat only the uniform values as scalar. This avoids
2641 // creating a REPLICATE recipe during planning, since that would generate
2642 // scalarized code during execution, which is unsupported for scalable VFs.
2643 if (VF.isScalable()) {
2644 Scalars[VF].insert_range(R&: Uniforms[VF]);
2645 return;
2646 }
2647
2648 SmallSetVector<Instruction *, 8> Worklist;
2649
2650 // These sets are used to seed the analysis with pointers used by memory
2651 // accesses that will remain scalar.
2652 SmallSetVector<Instruction *, 8> ScalarPtrs;
2653 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2654 auto *Latch = TheLoop->getLoopLatch();
2655
2656 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2657 // The pointer operands of loads and stores will be scalar as long as the
2658 // memory access is not a gather or scatter operation. The value operand of a
2659 // store will remain scalar if the store is scalarized.
2660 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2661 InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
2662 assert(WideningDecision != CM_Unknown &&
2663 "Widening decision should be ready at this moment");
2664 if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
2665 if (Ptr == Store->getValueOperand())
2666 return WideningDecision == CM_Scalarize;
2667 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2668 "Ptr is neither a value or pointer operand");
2669 return WideningDecision != CM_GatherScatter;
2670 };
2671
2672 // A helper that returns true if the given value is a loop-varying
2673 // getelementptr instruction.
2674 auto IsLoopVaryingGEP = [&](Value *V) {
2675 return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
2676 };
2677
2678 // A helper that evaluates a memory access's use of a pointer. If the use will
2679 // be a scalar use and the pointer is only used by memory accesses, we place
2680 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2681 // PossibleNonScalarPtrs.
2682 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2683 // We only care about getelementptr instructions that vary inside the
2684 // loop.
2685 if (!IsLoopVaryingGEP(Ptr))
2686 return;
2687
2688 // If the pointer has already been identified as scalar (e.g., if it was
2689 // also identified as uniform), there's nothing to do.
2690 auto *I = cast<Instruction>(Val: Ptr);
2691 if (Worklist.count(key: I))
2692 return;
2693
2694 // If the use of the pointer will be a scalar use, and all users of the
2695 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2696 // place the pointer in PossibleNonScalarPtrs.
2697 if (IsScalarUse(MemAccess, Ptr) &&
2698 all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
2699 ScalarPtrs.insert(X: I);
2700 else
2701 PossibleNonScalarPtrs.insert(Ptr: I);
2702 };
2703
2704 // We seed the scalars analysis with two classes of instructions: (1)
2705 // instructions marked uniform-after-vectorization and (2) getelementptr
2706 // instructions used by memory accesses requiring a scalar use (such as the
2707 // pointer operand of a non-gather/scatter load or store).
2708 //
2709 // (1) Add to the worklist all instructions that have been identified as
2710 // uniform-after-vectorization.
2711 Worklist.insert_range(R&: Uniforms[VF]);
2712
2713 // (2) Add to the worklist all getelementptr instructions used by memory
2714 // accesses requiring a scalar use. The pointer operands of loads and
2715 // stores will be scalar unless the operation is a gather or scatter.
2716 // The value operand of a store will remain scalar if the store is scalarized.
2717 for (auto *BB : TheLoop->blocks())
2718 for (auto &I : *BB) {
2719 if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
2720 EvaluatePtrUse(Load, Load->getPointerOperand());
2721 } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
2722 EvaluatePtrUse(Store, Store->getPointerOperand());
2723 EvaluatePtrUse(Store, Store->getValueOperand());
2724 }
2725 }
2726 for (auto *I : ScalarPtrs)
2727 if (!PossibleNonScalarPtrs.count(Ptr: I)) {
2728 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2729 Worklist.insert(X: I);
2730 }
2731
2732 // Insert the forced scalars.
2733 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2734 // induction variable when the PHI user is scalarized.
2735 auto ForcedScalar = ForcedScalars.find(Val: VF);
2736 if (ForcedScalar != ForcedScalars.end())
2737 for (auto *I : ForcedScalar->second) {
2738 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2739 Worklist.insert(X: I);
2740 }
2741
2742 // Expand the worklist by looking through any getelementptr instructions we
2743 // have already identified as scalar. This is similar to the expansion step
2744 // in collectLoopUniforms(); however, here we're only expanding to include
2745 // additional getelementptr instructions.
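// For example, a loop-varying getelementptr whose only in-loop users are an
// already-scalar getelementptr and loads/stores that use it as a scalar
// address is itself added to the worklist.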
2746 unsigned Idx = 0;
2747 while (Idx != Worklist.size()) {
2748 Instruction *Dst = Worklist[Idx++];
2749 if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
2750 continue;
2751 auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
2752 if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
2753 auto *J = cast<Instruction>(Val: U);
2754 return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
2755 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
2756 IsScalarUse(J, Src));
2757 })) {
2758 Worklist.insert(X: Src);
2759 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2760 }
2761 }
2762
2763 // An induction variable will remain scalar if all users of the induction
2764 // variable and induction variable update remain scalar.
2765 for (const auto &Induction : Legal->getInductionVars()) {
2766 auto *Ind = Induction.first;
2767 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
2768
2769 // If tail-folding is applied, the primary induction variable will be used
2770 // to feed a vector compare.
2771 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2772 continue;
2773
2774 // Returns true if \p Indvar is a pointer induction that is used directly by
2775 // load/store instruction \p I.
2776 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2777 Instruction *I) {
2778 return Induction.second.getKind() ==
2779 InductionDescriptor::IK_PtrInduction &&
2780 (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
2781 Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
2782 };
2783
2784 // Determine if all users of the induction variable are scalar after
2785 // vectorization.
2786 bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
2787 auto *I = cast<Instruction>(Val: U);
2788 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
2789 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2790 });
2791 if (!ScalarInd)
2792 continue;
2793
2794 // If the induction variable update is a fixed-order recurrence, neither the
2795 // induction variable nor its update should be marked scalar after
2796 // vectorization.
2797 auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
2798 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
2799 continue;
2800
2801 // Determine if all users of the induction variable update instruction are
2802 // scalar after vectorization.
2803 bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
2804 auto *I = cast<Instruction>(Val: U);
2805 return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
2806 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2807 });
2808 if (!ScalarIndUpdate)
2809 continue;
2810
2811 // The induction variable and its update instruction will remain scalar.
2812 Worklist.insert(X: Ind);
2813 Worklist.insert(X: IndUpdate);
2814 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2815 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2816 << "\n");
2817 }
2818
2819 Scalars[VF].insert_range(R&: Worklist);
2820}
2821
2822bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
2823 ElementCount VF) {
2824 if (!isPredicatedInst(I))
2825 return false;
2826
2827 // Do we have a non-scalar lowering for this predicated instruction? If
2828 // not, it is scalar with predication.
2829 switch (I->getOpcode()) {
2830 default:
2831 return true;
2832 case Instruction::Call:
2833 if (VF.isScalar())
2834 return true;
2835 return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
2836 case Instruction::Load:
2837 case Instruction::Store: {
2838 auto *Ptr = getLoadStorePointerOperand(V: I);
2839 auto *Ty = getLoadStoreType(I);
2840 unsigned AS = getLoadStoreAddressSpace(I);
2841 Type *VTy = Ty;
2842 if (VF.isVector())
2843 VTy = VectorType::get(ElementType: Ty, EC: VF);
2844 const Align Alignment = getLoadStoreAlignment(I);
2845 return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
2846 TTI.isLegalMaskedGather(DataType: VTy, Alignment))
2847 : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
2848 TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
2849 }
2850 case Instruction::UDiv:
2851 case Instruction::SDiv:
2852 case Instruction::SRem:
2853 case Instruction::URem: {
2854 // We have the option to use the safe-divisor idiom to avoid predication.
2855 // The cost based decision here will always select safe-divisor for
2856 // scalable vectors as scalarization isn't legal.
2857 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2858 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2859 }
2860 }
2861}
2862
2863// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2864bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
2865 // TODO: We can use the loop-preheader as context point here and get
2866 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2867 if (isSafeToSpeculativelyExecute(I) ||
2868 (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) ||
2869 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(Val: I))
2870 return false;
2871
2872 // If the instruction was executed conditionally in the original scalar loop,
2873 // predication is needed with a mask whose lanes are all possibly inactive.
2874 if (Legal->blockNeedsPredication(BB: I->getParent()))
2875 return true;
2876
2877 // If we're not folding the tail by masking, predication is unnecessary.
2878 if (!foldTailByMasking())
2879 return false;
2880
2881 // All that remain are instructions with side-effects originally executed in
2882 // the loop unconditionally, but now execute under a tail-fold mask (only)
2883 // having at least one active lane (the first). If the side-effects of the
2884 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2885 // - it will cause the same side-effects as when masked.
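// For example, a store of a loop-invariant value to a loop-invariant address
// writes the same value to the same location whether one lane or all lanes
// execute it, so it needs no mask (see the Store case below).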
2886 switch (I->getOpcode()) {
2887 default:
2888 llvm_unreachable(
2889 "instruction should have been considered by earlier checks");
2890 case Instruction::Call:
2891 // Side-effects of a Call are assumed to be non-invariant, needing a
2892 // (fold-tail) mask.
2893 assert(Legal->isMaskRequired(I) &&
2894 "should have returned earlier for calls not needing a mask");
2895 return true;
2896 case Instruction::Load:
2897 // If the address is loop invariant no predication is needed.
2898 return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
2899 case Instruction::Store: {
2900 // For stores, we must prove both speculation safety (which follows from
2901 // the same argument as loads) and that the value being stored is correct.
2902 // The easiest form of the latter is to require that all values stored are
2903 // the same.
2904 return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
2905 TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
2906 }
2907 case Instruction::UDiv:
2908 case Instruction::URem:
2909 // If the divisor is loop-invariant no predication is needed.
2910 return !Legal->isInvariant(V: I->getOperand(i: 1));
2911 case Instruction::SDiv:
2912 case Instruction::SRem:
2913 // Conservative for now, since masked-off lanes may be poison and could
2914 // trigger signed overflow.
2915 return true;
2916 }
2917}
2918
2919uint64_t LoopVectorizationCostModel::getPredBlockCostDivisor(
2920 TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
2921 if (CostKind == TTI::TCK_CodeSize)
2922 return 1;
2923 // If the block wasn't originally predicated then return early to avoid
2924 // computing BlockFrequencyInfo unnecessarily.
2925 if (!Legal->blockNeedsPredication(BB))
2926 return 1;
2927
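// The divisor is the ratio of header frequency to block frequency. For
// example, if the predicated block executes on roughly a quarter of the
// iterations, costs attributed to it are divided by 4.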
2928 uint64_t HeaderFreq =
2929 getBFI().getBlockFreq(BB: TheLoop->getHeader()).getFrequency();
2930 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2931 assert(HeaderFreq >= BBFreq &&
2932 "Header has smaller block freq than dominated BB?");
2933 return std::round(x: (double)HeaderFreq / BBFreq);
2934}
2935
2936std::pair<InstructionCost, InstructionCost>
2937LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
2938 ElementCount VF) {
2939 assert(I->getOpcode() == Instruction::UDiv ||
2940 I->getOpcode() == Instruction::SDiv ||
2941 I->getOpcode() == Instruction::SRem ||
2942 I->getOpcode() == Instruction::URem);
2943 assert(!isSafeToSpeculativelyExecute(I));
2944
2945 // Scalarization isn't legal for scalable vector types
2946 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2947 if (!VF.isScalable()) {
2948 // Get the scalarization cost and scale this amount by the probability of
2949 // executing the predicated block. If the instruction is not predicated,
2950 // we fall through to the next case.
2951 ScalarizationCost = 0;
2952
2953 // These instructions have a non-void type, so account for the phi nodes
2954 // that we will create. This cost is likely to be zero. The phi node
2955 // cost, if any, should be scaled by the block probability because it
2956 // models a copy at the end of each predicated block.
2957 ScalarizationCost +=
2958 VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
2959
2960 // The cost of the non-predicated instruction.
2961 ScalarizationCost +=
2962 VF.getFixedValue() *
2963 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);
2964
2965 // The cost of insertelement and extractelement instructions needed for
2966 // scalarization.
2967 ScalarizationCost += getScalarizationOverhead(I, VF);
2968
2969 // Scale the cost by the probability of executing the predicated blocks.
2970 // This assumes the predicated block for each vector lane is equally
2971 // likely.
2972 ScalarizationCost =
2973 ScalarizationCost / getPredBlockCostDivisor(CostKind, BB: I->getParent());
2974 }
2975
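// Conceptually, the safe-divisor idiom selects a harmless divisor (e.g. 1)
// for lanes that would otherwise be masked off, so the wide divide can execute
// unconditionally; its cost is the select below plus the wide div/rem.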
2976 InstructionCost SafeDivisorCost = 0;
2977 auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);
2978 // The cost of the select guard to ensure all lanes are well defined
2979 // after we speculate above any internal control flow.
2980 SafeDivisorCost +=
2981 TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
2982 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
2983 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
2984
2985 SmallVector<const Value *, 4> Operands(I->operand_values());
2986 SafeDivisorCost += TTI.getArithmeticInstrCost(
2987 Opcode: I->getOpcode(), Ty: VecTy, CostKind,
2988 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
2989 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
2990 Args: Operands, CxtI: I);
2991 return {ScalarizationCost, SafeDivisorCost};
2992}
2993
2994bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
2995 Instruction *I, ElementCount VF) const {
2996 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2997 assert(getWideningDecision(I, VF) == CM_Unknown &&
2998 "Decision should not be set yet.");
2999 auto *Group = getInterleavedAccessGroup(Instr: I);
3000 assert(Group && "Must have a group.");
3001 unsigned InterleaveFactor = Group->getFactor();
3002
3003 // If the instruction's allocated size doesn't equal its type size, it
3004 // requires padding and will be scalarized.
3005 auto &DL = I->getDataLayout();
3006 auto *ScalarTy = getLoadStoreType(I);
3007 if (hasIrregularType(Ty: ScalarTy, DL))
3008 return false;
3009
3010 // For scalable vectors, the interleave factors must be <= 8 since we require
3011 // the (de)interleaveN intrinsics instead of shufflevectors.
3012 if (VF.isScalable() && InterleaveFactor > 8)
3013 return false;
3014
3015 // If the group involves a non-integral pointer, we may not be able to
3016 // losslessly cast all values to a common type.
3017 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
3018 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3019 Instruction *Member = Group->getMember(Index: Idx);
3020 if (!Member)
3021 continue;
3022 auto *MemberTy = getLoadStoreType(I: Member);
3023 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
3024 // Don't coerce non-integral pointers to integers or vice versa.
3025 if (MemberNI != ScalarNI)
3026 // TODO: Consider adding special nullptr value case here
3027 return false;
3028 if (MemberNI && ScalarNI &&
3029 ScalarTy->getPointerAddressSpace() !=
3030 MemberTy->getPointerAddressSpace())
3031 return false;
3032 }
3033
3034 // Check if masking is required.
3035 // A Group may need masking for one of two reasons: it resides in a block that
3036 // needs predication, or it was decided to use masking to deal with gaps
3037 // (either a gap at the end of a load-access that may result in a speculative
3038 // load, or any gaps in a store-access).
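// For example, a factor-2 load group that only accesses member 0 has a gap
// after its last member, so the final wide load could speculatively read past
// the original scalar accesses unless it is masked (or a scalar epilogue is
// kept); a store group with any gap must be masked so unused lanes are not
// written.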
3039 bool PredicatedAccessRequiresMasking =
3040 blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
3041 Legal->isMaskRequired(I);
3042 bool LoadAccessWithGapsRequiresEpilogMasking =
3043 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
3044 !isScalarEpilogueAllowed();
3045 bool StoreAccessWithGapsRequiresMasking =
3046 isa<StoreInst>(Val: I) && !Group->isFull();
3047 if (!PredicatedAccessRequiresMasking &&
3048 !LoadAccessWithGapsRequiresEpilogMasking &&
3049 !StoreAccessWithGapsRequiresMasking)
3050 return true;
3051
3052 // If masked interleaving is required, we expect that the user/target had
3053 // enabled it, because otherwise it either wouldn't have been created or
3054 // it should have been invalidated by the CostModel.
3055 assert(useMaskedInterleavedAccesses(TTI) &&
3056 "Masked interleave-groups for predicated accesses are not enabled.");
3057
3058 if (Group->isReverse())
3059 return false;
3060
3061 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
3062 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
3063 StoreAccessWithGapsRequiresMasking;
3064 if (VF.isScalable() && NeedsMaskForGaps)
3065 return false;
3066
3067 auto *Ty = getLoadStoreType(I);
3068 const Align Alignment = getLoadStoreAlignment(I);
3069 unsigned AS = getLoadStoreAddressSpace(I);
3070 return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
3071 : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
3072}
3073
3074bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3075 Instruction *I, ElementCount VF) {
3076 // Get and ensure we have a valid memory instruction.
3077 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3078
3079 auto *Ptr = getLoadStorePointerOperand(V: I);
3080 auto *ScalarTy = getLoadStoreType(I);
3081
3082 // First of all, in order to be widened the pointer must be consecutive.
3083 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3084 return false;
3085
3086 // If the instruction is a store located in a predicated block, it will be
3087 // scalarized.
3088 if (isScalarWithPredication(I, VF))
3089 return false;
3090
3091 // If the instruction's allocated size doesn't equal its type size, it
3092 // requires padding and will be scalarized.
3093 auto &DL = I->getDataLayout();
3094 if (hasIrregularType(Ty: ScalarTy, DL))
3095 return false;
3096
3097 return true;
3098}
3099
3100void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3101 // We should not collect Uniforms more than once per VF. Right now,
3102 // this function is called from collectUniformsAndScalars(), which
3103 // already does this check. Collecting Uniforms for VF=1 does not make any
3104 // sense.
3105
3106 assert(VF.isVector() && !Uniforms.contains(VF) &&
3107 "This function should not be visited twice for the same VF");
3108
3109 // Initialize the entry for this VF. Even if we find no uniform values, we
3110 // won't analyze it again; Uniforms.count(VF) will return 1.
3111 Uniforms[VF].clear();
3112
3113 // Now we know that the loop is vectorizable!
3114 // Collect instructions inside the loop that will remain uniform after
3115 // vectorization.
3116
3117 // Global values, params and instructions outside of current loop are out of
3118 // scope.
3119 auto IsOutOfScope = [&](Value *V) -> bool {
3120 Instruction *I = dyn_cast<Instruction>(Val: V);
3121 return (!I || !TheLoop->contains(Inst: I));
3122 };
3123
3124 // Worklist containing uniform instructions demanding lane 0.
3125 SetVector<Instruction *> Worklist;
3126
3127 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3128 // that require predication must not be considered uniform after
3129 // vectorization, because that would create an erroneous replicating region
3130 // where only a single instance out of VF should be formed.
3131 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
3132 if (IsOutOfScope(I)) {
3133 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3134 << *I << "\n");
3135 return;
3136 }
3137 if (isPredicatedInst(I)) {
3138 LLVM_DEBUG(
3139 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3140 << "\n");
3141 return;
3142 }
3143 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3144 Worklist.insert(X: I);
3145 };
3146
3147 // Start with the conditional branches exiting the loop. If the branch
3148 // condition is an instruction contained in the loop that is only used by the
3149 // branch, it is uniform. Note conditions from uncountable early exits are not
3150 // uniform.
3151 SmallVector<BasicBlock *> Exiting;
3152 TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
3153 for (BasicBlock *E : Exiting) {
3154 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
3155 continue;
3156 auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
3157 if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
3158 AddToWorklistIfAllowed(Cmp);
3159 }
3160
3161 auto PrevVF = VF.divideCoefficientBy(RHS: 2);
3162 // Return true if all lanes perform the same memory operation, and we can
3163 // thus choose to execute only one.
3164 auto IsUniformMemOpUse = [&](Instruction *I) {
3165 // If the value was already known to not be uniform for the previous
3166 // (smaller VF), it cannot be uniform for the larger VF.
3167 if (PrevVF.isVector()) {
3168 auto Iter = Uniforms.find(Val: PrevVF);
3169 if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
3170 return false;
3171 }
3172 if (!Legal->isUniformMemOp(I&: *I, VF))
3173 return false;
3174 if (isa<LoadInst>(Val: I))
3175 // Loading the same address always produces the same result - at least
3176 // assuming aliasing and ordering which have already been checked.
3177 return true;
3178 // Storing the same value on every iteration.
3179 return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
3180 };
3181
3182 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
3183 InstWidening WideningDecision = getWideningDecision(I, VF);
3184 assert(WideningDecision != CM_Unknown &&
3185 "Widening decision should be ready at this moment");
3186
3187 if (IsUniformMemOpUse(I))
3188 return true;
3189
3190 return (WideningDecision == CM_Widen ||
3191 WideningDecision == CM_Widen_Reverse ||
3192 WideningDecision == CM_Interleave);
3193 };
3194
3195 // Returns true if Ptr is the pointer operand of a memory access instruction
3196 // I, I is known to not require scalarization, and the pointer is not also
3197 // stored.
3198 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3199 if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
3200 return false;
3201 return getLoadStorePointerOperand(V: I) == Ptr &&
3202 (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
3203 };
3204
3205 // Holds a list of values which are known to have at least one uniform use.
3206 // Note that there may be other uses which aren't uniform. A "uniform use"
3207 // here is something which only demands lane 0 of the unrolled iterations;
3208 // it does not imply that all lanes produce the same value (i.e. this is not
3209 // the usual meaning of uniform).
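// For example, the address of a consecutive wide load has a uniform use:
// only its lane-0 value is needed to form the wide pointer, even though the
// address computation produces a different value for every lane.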
3210 SetVector<Value *> HasUniformUse;
3211
3212 // Scan the loop for instructions which are either a) known to have only
3213 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3214 for (auto *BB : TheLoop->blocks())
3215 for (auto &I : *BB) {
3216 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
3217 switch (II->getIntrinsicID()) {
3218 case Intrinsic::sideeffect:
3219 case Intrinsic::experimental_noalias_scope_decl:
3220 case Intrinsic::assume:
3221 case Intrinsic::lifetime_start:
3222 case Intrinsic::lifetime_end:
3223 if (TheLoop->hasLoopInvariantOperands(I: &I))
3224 AddToWorklistIfAllowed(&I);
3225 break;
3226 default:
3227 break;
3228 }
3229 }
3230
3231 if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
3232 if (IsOutOfScope(EVI->getAggregateOperand())) {
3233 AddToWorklistIfAllowed(EVI);
3234 continue;
3235 }
3236 // Only ExtractValue instructions where the aggregate value comes from a
3237 // call are allowed to be non-uniform.
3238 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
3239 "Expected aggregate value to be call return value");
3240 }
3241
3242 // If there's no pointer operand, there's nothing to do.
3243 auto *Ptr = getLoadStorePointerOperand(V: &I);
3244 if (!Ptr)
3245 continue;
3246
3247 // If the pointer can be proven to be uniform, always add it to the
3248 // worklist.
3249 if (isa<Instruction>(Val: Ptr) && Legal->isUniform(V: Ptr, VF))
3250 AddToWorklistIfAllowed(cast<Instruction>(Val: Ptr));
3251
3252 if (IsUniformMemOpUse(&I))
3253 AddToWorklistIfAllowed(&I);
3254
3255 if (IsVectorizedMemAccessUse(&I, Ptr))
3256 HasUniformUse.insert(X: Ptr);
3257 }
3258
3259 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3260 // demanding) users. Since loops are assumed to be in LCSSA form, this
3261 // disallows uses outside the loop as well.
3262 for (auto *V : HasUniformUse) {
3263 if (IsOutOfScope(V))
3264 continue;
3265 auto *I = cast<Instruction>(Val: V);
3266 bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
3267 auto *UI = cast<Instruction>(Val: U);
3268 return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
3269 });
3270 if (UsersAreMemAccesses)
3271 AddToWorklistIfAllowed(I);
3272 }
3273
3274 // Expand Worklist in topological order: whenever a new instruction
3275 // is added, its users should already be inside Worklist. This ensures
3276 // that a uniform instruction will only be used by uniform instructions.
3277 unsigned Idx = 0;
3278 while (Idx != Worklist.size()) {
3279 Instruction *I = Worklist[Idx++];
3280
3281 for (auto *OV : I->operand_values()) {
3282 // isOutOfScope operands cannot be uniform instructions.
3283 if (IsOutOfScope(OV))
3284 continue;
3285 // Fixed-order recurrence phis should typically be considered
3286 // non-uniform.
3287 auto *OP = dyn_cast<PHINode>(Val: OV);
3288 if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
3289 continue;
3290 // If all the users of the operand are uniform, then add the
3291 // operand into the uniform worklist.
3292 auto *OI = cast<Instruction>(Val: OV);
3293 if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
3294 auto *J = cast<Instruction>(Val: U);
3295 return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
3296 }))
3297 AddToWorklistIfAllowed(OI);
3298 }
3299 }
3300
3301 // For an instruction to be added into Worklist above, all its users inside
3302 // the loop should also be in Worklist. However, this condition cannot be
3303 // true for phi nodes that form a cyclic dependence. We must process phi
3304 // nodes separately. An induction variable will remain uniform if all users
3305 // of the induction variable and induction variable update remain uniform.
3306 // The code below handles both pointer and non-pointer induction variables.
3307 BasicBlock *Latch = TheLoop->getLoopLatch();
3308 for (const auto &Induction : Legal->getInductionVars()) {
3309 auto *Ind = Induction.first;
3310 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
3311
3312 // Determine if all users of the induction variable are uniform after
3313 // vectorization.
3314 bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
3315 auto *I = cast<Instruction>(Val: U);
3316 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3317 IsVectorizedMemAccessUse(I, Ind);
3318 });
3319 if (!UniformInd)
3320 continue;
3321
3322 // Determine if all users of the induction variable update instruction are
3323 // uniform after vectorization.
3324 bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
3325 auto *I = cast<Instruction>(Val: U);
3326 return I == Ind || Worklist.count(key: I) ||
3327 IsVectorizedMemAccessUse(I, IndUpdate);
3328 });
3329 if (!UniformIndUpdate)
3330 continue;
3331
3332 // The induction variable and its update instruction will remain uniform.
3333 AddToWorklistIfAllowed(Ind);
3334 AddToWorklistIfAllowed(IndUpdate);
3335 }
3336
3337 Uniforms[VF].insert_range(R&: Worklist);
3338}
3339
3340bool LoopVectorizationCostModel::runtimeChecksRequired() {
3341 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3342
3343 if (Legal->getRuntimePointerChecking()->Need) {
3344 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3345 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3346 "loop with '#pragma clang loop vectorize(enable)' when "
3347 "compiling with -Os/-Oz",
3348 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3349 return true;
3350 }
3351
3352 if (!PSE.getPredicate().isAlwaysTrue()) {
3353 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3354 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3355 "loop with '#pragma clang loop vectorize(enable)' when "
3356 "compiling with -Os/-Oz",
3357 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3358 return true;
3359 }
3360
3361 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3362 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3363 reportVectorizationFailure(DebugMsg: "Runtime stride check is required with -Os/-Oz",
3364 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of this "
3365 "loop with '#pragma clang loop vectorize(enable)' when compiling with -Os/-Oz",
3366 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3367 return true;
3368 }
3369
3370 return false;
3371}
3372
3373bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3374 if (IsScalableVectorizationAllowed)
3375 return *IsScalableVectorizationAllowed;
3376
3377 IsScalableVectorizationAllowed = false;
3378 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3379 return false;
3380
3381 if (Hints->isScalableVectorizationDisabled()) {
3382 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
3383 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
3384 return false;
3385 }
3386
3387 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3388
3389 auto MaxScalableVF = ElementCount::getScalable(
3390 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3391
3392 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3393 // FIXME: While for scalable vectors this is currently sufficient, this should
3394 // be replaced by a more detailed mechanism that filters out specific VFs,
3395 // instead of invalidating vectorization for a whole set of VFs based on the
3396 // MaxVF.
3397
3398 // Disable scalable vectorization if the loop contains unsupported reductions.
3399 if (!canVectorizeReductions(VF: MaxScalableVF)) {
3400 reportVectorizationInfo(
3401 Msg: "Scalable vectorization not supported for the reduction "
3402 "operations found in this loop.",
3403 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3404 return false;
3405 }
3406
3407 // Disable scalable vectorization if the loop contains any instructions
3408 // with element types not supported for scalable vectors.
3409 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
3410 return !Ty->isVoidTy() &&
3411 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3412 })) {
3413 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
3414 "for all element types found in this loop.",
3415 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3416 return false;
3417 }
3418
3419 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
3420 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
3421 "for safe distance analysis.",
3422 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3423 return false;
3424 }
3425
3426 IsScalableVectorizationAllowed = true;
3427 return true;
3428}
3429
3430ElementCount
3431LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3432 if (!isScalableVectorizationAllowed())
3433 return ElementCount::getScalable(MinVal: 0);
3434
3435 auto MaxScalableVF = ElementCount::getScalable(
3436 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3437 if (Legal->isSafeForAnyVectorWidth())
3438 return MaxScalableVF;
3439
3440 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3441 // Limit MaxScalableVF by the maximum safe dependence distance.
3442 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
3443
3444 if (!MaxScalableVF)
3445 reportVectorizationInfo(
3446 Msg: "Max legal vector width too small, scalable vectorization "
3447 "unfeasible.",
3448 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3449
3450 return MaxScalableVF;
3451}
3452
3453FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3454 unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
3455 bool FoldTailByMasking) {
3456 MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
3457 unsigned SmallestType, WidestType;
3458 std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();
3459
3460 // Get the maximum safe dependence distance in bits computed by LAA.
3461 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3462 // the memory accesses that is most restrictive (involved in the smallest
3463 // dependence distance).
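// For example, a maximum safe width of 256 bits with a widest type of 32 bits
// allows at most bit_floor(256 / 32) = 8 elements per vector iteration.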
3464 unsigned MaxSafeElementsPowerOf2 =
3465 bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
3466 if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
3467 unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
3468 MaxSafeElementsPowerOf2 =
3469 std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
3470 }
3471 auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
3472 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);
3473
3474 if (!Legal->isSafeForAnyVectorWidth())
3475 this->MaxSafeElements = MaxSafeElementsPowerOf2;
3476
3477 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3478 << ".\n");
3479 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3480 << ".\n");
3481
3482 // First analyze the UserVF, fall back if the UserVF should be ignored.
3483 if (UserVF) {
3484 auto MaxSafeUserVF =
3485 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3486
3487 if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
3488 // If `VF=vscale x N` is safe, then so is `VF=N`
3489 if (UserVF.isScalable())
3490 return FixedScalableVFPair(
3491 ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
3492
3493 return UserVF;
3494 }
3495
3496 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3497
3498 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3499 // is better to ignore the hint and let the compiler choose a suitable VF.
3500 if (!UserVF.isScalable()) {
3501 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3502 << " is unsafe, clamping to max safe VF="
3503 << MaxSafeFixedVF << ".\n");
3504 ORE->emit(RemarkBuilder: [&]() {
3505 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3506 TheLoop->getStartLoc(),
3507 TheLoop->getHeader())
3508 << "User-specified vectorization factor "
3509 << ore::NV("UserVectorizationFactor", UserVF)
3510 << " is unsafe, clamping to maximum safe vectorization factor "
3511 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3512 });
3513 return MaxSafeFixedVF;
3514 }
3515
3516 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3517 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3518 << " is ignored because scalable vectors are not "
3519 "available.\n");
3520 ORE->emit(RemarkBuilder: [&]() {
3521 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3522 TheLoop->getStartLoc(),
3523 TheLoop->getHeader())
3524 << "User-specified vectorization factor "
3525 << ore::NV("UserVectorizationFactor", UserVF)
3526 << " is ignored because the target does not support scalable "
3527 "vectors. The compiler will pick a more suitable value.";
3528 });
3529 } else {
3530 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3531 << " is unsafe. Ignoring scalable UserVF.\n");
3532 ORE->emit(RemarkBuilder: [&]() {
3533 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3534 TheLoop->getStartLoc(),
3535 TheLoop->getHeader())
3536 << "User-specified vectorization factor "
3537 << ore::NV("UserVectorizationFactor", UserVF)
3538 << " is unsafe. Ignoring the hint to let the compiler pick a "
3539 "more suitable value.";
3540 });
3541 }
3542 }
3543
3544 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3545 << " / " << WidestType << " bits.\n");
3546
3547 FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
3548 ElementCount::getScalable(MinVal: 0));
3549 if (auto MaxVF =
3550 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3551 MaxSafeVF: MaxSafeFixedVF, UserIC, FoldTailByMasking))
3552 Result.FixedVF = MaxVF;
3553
3554 if (auto MaxVF =
3555 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3556 MaxSafeVF: MaxSafeScalableVF, UserIC, FoldTailByMasking))
3557 if (MaxVF.isScalable()) {
3558 Result.ScalableVF = MaxVF;
3559 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3560 << "\n");
3561 }
3562
3563 return Result;
3564}
3565
3566FixedScalableVFPair
3567LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3568 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3569 // TODO: Vectorizing may still be useful, since the runtime-check branch is
3570 // likely to be dynamically uniform and the target could skip it.
3571 reportVectorizationFailure(
3572 DebugMsg: "Not inserting runtime ptr check for divergent target",
3573 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3574 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3575 return FixedScalableVFPair::getNone();
3576 }
3577
3578 ScalarEvolution *SE = PSE.getSE();
3579 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3580 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3581 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3582 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3583 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3584 if (TC.isScalar()) {
3585 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3586 OREMsg: "loop trip count is one, irrelevant for vectorization",
3587 ORETag: "SingleIterationLoop", ORE, TheLoop);
3588 return FixedScalableVFPair::getNone();
3589 }
3590
3591 // If BTC matches the widest induction type and is -1 then the trip count
3592 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3593 // to vectorize.
3594 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3595 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3596 BTC->getType()->getScalarSizeInBits() >=
3597 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3598 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3599 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3600 reportVectorizationFailure(
3601 DebugMsg: "Trip count computation wrapped",
3602 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3603 ORETag: "TripCountWrapped", ORE, TheLoop);
3604 return FixedScalableVFPair::getNone();
3605 }
3606
3607 switch (ScalarEpilogueStatus) {
3608 case CM_ScalarEpilogueAllowed:
3609 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: false);
3610 case CM_ScalarEpilogueNotAllowedUsePredicate:
3611 [[fallthrough]];
3612 case CM_ScalarEpilogueNotNeededUsePredicate:
3613 LLVM_DEBUG(
3614 dbgs() << "LV: vector predicate hint/switch found.\n"
3615 << "LV: Not allowing scalar epilogue, creating predicated "
3616 << "vector loop.\n");
3617 break;
3618 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3619 // fallthrough as a special case of OptForSize
3620 case CM_ScalarEpilogueNotAllowedOptSize:
3621 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3622 LLVM_DEBUG(
3623 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3624 else
3625 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3626 << "count.\n");
3627
3628 // Bail if runtime checks are required, which are not good when optimizing
3629 // for size.
3630 if (runtimeChecksRequired())
3631 return FixedScalableVFPair::getNone();
3632
3633 break;
3634 }
3635
3636 // Now try the tail folding
3637
3638 // Invalidate interleave groups that require an epilogue if we can't mask
3639 // the interleave-group.
3640 if (!useMaskedInterleavedAccesses(TTI)) {
3641 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3642 "No decisions should have been taken at this point");
3643 // Note: There is no need to invalidate any cost modeling decisions here, as
3644 // none were taken so far.
3645 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3646 }
3647
3648 FixedScalableVFPair MaxFactors =
3649 computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: true);
3650
3651 // Avoid tail folding if the trip count is known to be a multiple of any VF
3652 // we choose.
3653 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3654 MaxFactors.FixedVF.getFixedValue();
3655 if (MaxFactors.ScalableVF) {
3656 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3657 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3658 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3659 a: *MaxPowerOf2RuntimeVF,
3660 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3661 } else
3662 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3663 }
3664
3665 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3666 // Return false if the loop is neither a single-latch-exit loop nor an
3667 // early-exit loop as tail-folding is not supported in that case.
3668 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3669 !Legal->hasUncountableEarlyExit())
3670 return false;
3671 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3672 ScalarEvolution *SE = PSE.getSE();
3673 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3674 // with uncountable exits. For countable loops, the symbolic maximum must
3675 // remain identical to the known back-edge taken count.
3676 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3677 assert((Legal->hasUncountableEarlyExit() ||
3678 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3679 "Invalid loop count");
3680 const SCEV *ExitCount = SE->getAddExpr(
3681 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3682 const SCEV *Rem = SE->getURemExpr(
3683 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3684 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3685 return Rem->isZero();
3686 };
3687
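// For example, a loop with a known trip count of 1024 and MaxVF * UserIC == 8
// has a zero remainder, so the vector loop covers every iteration and no
// scalar epilogue (or tail folding) is needed.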
3688 if (MaxPowerOf2RuntimeVF > 0u) {
3689 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3690 "MaxFixedVF must be a power of 2");
3691 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3692 // Accept MaxFixedVF if we do not have a tail.
3693 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3694 return MaxFactors;
3695 }
3696 }
3697
3698 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3699 if (ExpectedTC && ExpectedTC->isFixed() &&
3700 ExpectedTC->getFixedValue() <=
3701 TTI.getMinTripCountTailFoldingThreshold()) {
3702 if (MaxPowerOf2RuntimeVF > 0u) {
3703 // If we have a low-trip-count, and the fixed-width VF is known to divide
3704 // the trip count but the scalable factor does not, use the fixed-width
3705 // factor in preference to allow the generation of a non-predicated loop.
3706 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3707 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3708 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3709 "remain for any chosen VF.\n");
3710 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3711 return MaxFactors;
3712 }
3713 }
3714
3715 reportVectorizationFailure(
3716 DebugMsg: "The trip count is below the minimal threshold value.",
3717 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3718 ORE, TheLoop);
3719 return FixedScalableVFPair::getNone();
3720 }
3721
3722 // If we don't know the precise trip count, or if the trip count that we
3723 // found modulo the vectorization factor is not zero, try to fold the tail
3724 // by masking.
3725 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3726 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3727 setTailFoldingStyles(IsScalableVF: ContainsScalableVF, UserIC);
3728 if (foldTailByMasking()) {
3729 if (foldTailWithEVL()) {
3730 LLVM_DEBUG(
3731 dbgs()
3732 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3733 "try to generate VP Intrinsics with scalable vector "
3734 "factors only.\n");
3735 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3736 // for now.
3737 // TODO: extend it for fixed vectors, if required.
3738 assert(ContainsScalableVF && "Expected scalable vector factor.");
3739
3740 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3741 }
3742 return MaxFactors;
3743 }
3744
3745 // If there was a tail-folding hint/switch, but we can't fold the tail by
3746 // masking, fallback to a vectorization with a scalar epilogue.
3747 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3748 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3749 "scalar epilogue instead.\n");
3750 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3751 return MaxFactors;
3752 }
3753
3754 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3755 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3756 return FixedScalableVFPair::getNone();
3757 }
3758
3759 if (TC.isZero()) {
3760 reportVectorizationFailure(
3761 DebugMsg: "unable to calculate the loop count due to complex control flow",
3762 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3763 return FixedScalableVFPair::getNone();
3764 }
3765
3766 reportVectorizationFailure(
3767 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3768 OREMsg: "cannot optimize for size and vectorize at the same time. "
3769 "Enable vectorization of this loop with '#pragma clang loop "
3770 "vectorize(enable)' when compiling with -Os/-Oz",
3771 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3772 return FixedScalableVFPair::getNone();
3773}
3774
3775bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3776 ElementCount VF) {
3777 if (ConsiderRegPressure.getNumOccurrences())
3778 return ConsiderRegPressure;
3779
3780 // TODO: We should eventually consider register pressure for all targets. The
3781 // TTI hook is temporary whilst target-specific issues are being fixed.
3782 if (TTI.shouldConsiderVectorizationRegPressure())
3783 return true;
3784
3785 if (!useMaxBandwidth(RegKind: VF.isScalable()
3786 ? TargetTransformInfo::RGK_ScalableVector
3787 : TargetTransformInfo::RGK_FixedWidthVector))
3788 return false;
3789 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3790 return ElementCount::isKnownGT(
3791 LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3792 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3793}
3794
3795bool LoopVectorizationCostModel::useMaxBandwidth(
3796 TargetTransformInfo::RegisterKind RegKind) {
3797 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3798 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3799 (UseWiderVFIfCallVariantsPresent &&
3800 Legal->hasVectorCallVariants())));
3801}
3802
3803ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3804 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
3805 bool FoldTailByMasking) const {
3806 unsigned EstimatedVF = VF.getKnownMinValue();
3807 if (VF.isScalable() && TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
3808 auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
3809 auto Min = Attr.getVScaleRangeMin();
3810 EstimatedVF *= Min;
3811 }
3812
3813 // When a scalar epilogue is required, at least one iteration of the scalar
3814 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3815 // max VF that results in a dead vector loop.
3816 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
3817 MaxTripCount -= 1;
3818
3819 // When the user specifies an interleave count, we need to ensure that
3820 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
3821 unsigned IC = UserIC > 0 ? UserIC : 1;
3822 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
3823
3824 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
3825 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
3826 // If the upper bound on the loop trip count (TC) is known at compile time,
3827 // there is no point in choosing a VF greater than TC / IC. Select the
3828 // maximum power of two which doesn't exceed TC / IC. If VF is
3829 // scalable, we only fall back on a fixed VF when the TC is less than or
3830 // equal to the known number of lanes.
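// For example, with MaxTripCount = 100 and IC = 3 the clamped value is
// bit_floor(100 / 3) = bit_floor(33) = 32 lanes.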
3831 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC);
3832 if (ClampedUpperTripCount == 0)
3833 ClampedUpperTripCount = 1;
3834 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3835 "exceeding the constant trip count"
3836 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
3837 << ClampedUpperTripCount << "\n");
3838 return ElementCount::get(MinVal: ClampedUpperTripCount,
3839 Scalable: FoldTailByMasking ? VF.isScalable() : false);
3840 }
3841 return VF;
3842}
3843
3844ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3845 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3846 ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
3847 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3848 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3849 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3850 : TargetTransformInfo::RGK_FixedWidthVector);
3851
3852 // Convenience function to return the minimum of two ElementCounts.
3853 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3854 assert((LHS.isScalable() == RHS.isScalable()) &&
3855 "Scalable flags must match");
3856 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3857 };
3858
3859 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3860 // Note that both WidestRegister and WidestType may not be powers of 2.
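// For example, a 384-bit widest register with a 32-bit widest type yields
// bit_floor(384 / 32) = bit_floor(12) = 8 lanes.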
3861 auto MaxVectorElementCount = ElementCount::get(
3862 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
3863 Scalable: ComputeScalableMaxVF);
3864 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3865 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3866 << (MaxVectorElementCount * WidestType) << " bits.\n");
3867
3868 if (!MaxVectorElementCount) {
3869 LLVM_DEBUG(dbgs() << "LV: The target has no "
3870 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3871 << " vector registers.\n");
3872 return ElementCount::getFixed(MinVal: 1);
3873 }
3874
3875 ElementCount MaxVF = clampVFByMaxTripCount(
3876 VF: MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking);
3877 // If the MaxVF was already clamped, there's no point in trying to pick a
3878 // larger one.
3879 if (MaxVF != MaxVectorElementCount)
3880 return MaxVF;
3881
3882 TargetTransformInfo::RegisterKind RegKind =
3883 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3884 : TargetTransformInfo::RGK_FixedWidthVector;
3885
3886 if (MaxVF.isScalable())
3887 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3888 else
3889 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3890
3891 if (useMaxBandwidth(RegKind)) {
3892 auto MaxVectorElementCountMaxBW = ElementCount::get(
3893 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
3894 Scalable: ComputeScalableMaxVF);
3895 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3896
3897 if (ElementCount MinVF =
3898 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
3899 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
3900 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3901 << ") with target's minimum: " << MinVF << '\n');
3902 MaxVF = MinVF;
3903 }
3904 }
3905
3906 MaxVF =
3907 clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC, FoldTailByMasking);
3908
3909 if (MaxVectorElementCount != MaxVF) {
3910 // Invalidate any widening decisions we might have made, in case the loop
3911 // requires predication (decided later), but we have already made some
3912 // load/store widening decisions.
3913 invalidateCostModelingDecisions();
3914 }
3915 }
3916 return MaxVF;
3917}
3918
3919bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3920 const VectorizationFactor &B,
3921 const unsigned MaxTripCount,
3922 bool HasTail,
3923 bool IsEpilogue) const {
3924 InstructionCost CostA = A.Cost;
3925 InstructionCost CostB = B.Cost;
3926
3927 // Improve estimate for the vector width if it is scalable.
3928 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3929 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3930 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3931 if (A.Width.isScalable())
3932 EstimatedWidthA *= *VScale;
3933 if (B.Width.isScalable())
3934 EstimatedWidthB *= *VScale;
3935 }
3936
// When optimizing for size, choose whichever has the smaller cost for the
// whole loop. On a tie, pick the larger vector width, on the assumption that
// throughput will be greater.
3940 if (CM.CostKind == TTI::TCK_CodeSize)
3941 return CostA < CostB ||
3942 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3943
3944 // Assume vscale may be larger than 1 (or the value being tuned for),
3945 // so that scalable vectorization is slightly favorable over fixed-width
3946 // vectorization.
3947 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3948 A.Width.isScalable() && !B.Width.isScalable();
3949
3950 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3951 const InstructionCost &RHS) {
3952 return PreferScalable ? LHS <= RHS : LHS < RHS;
3953 };
3954
3955 // To avoid the need for FP division:
3956 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3957 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
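// For example, if CostA = 8 at EstimatedWidthA = 4 and CostB = 6 at
// EstimatedWidthB = 2, we compare 8 * 2 = 16 against 6 * 4 = 24, so A (the
// wider factor) wins despite its larger absolute cost.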
3958 if (!MaxTripCount)
3959 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3960
3961 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3962 InstructionCost VectorCost,
3963 InstructionCost ScalarCost) {
// If the trip count is a known (possibly small) constant, the number of
// vector iterations is rounded up under FoldTailByMasking, and the total
// cost in that case will be VecCost*ceil(TripCount/VF). When not folding
// the tail, the total cost will be VecCost*floor(TC/VF) +
// ScalarCost*(TC%VF). There will be some extra overheads, but for the
// purpose of comparing the costs of different VFs we can use this to
// compare the total loop-body cost expected after vectorization.
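// Worked example: with MaxTripCount = 10 and VF = 4, folding the tail costs
// VectorCost * 3, while keeping a scalar tail costs
// VectorCost * 2 + ScalarCost * 2.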
3972 if (HasTail)
3973 return VectorCost * (MaxTripCount / VF) +
3974 ScalarCost * (MaxTripCount % VF);
3975 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
3976 };
3977
3978 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3979 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3980 return CmpFn(RTCostA, RTCostB);
3981}
3982
3983bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3984 const VectorizationFactor &B,
3985 bool HasTail,
3986 bool IsEpilogue) const {
3987 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3988 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3989 IsEpilogue);
3990}
3991
3992void LoopVectorizationPlanner::emitInvalidCostRemarks(
3993 OptimizationRemarkEmitter *ORE) {
3994 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3995 SmallVector<RecipeVFPair> InvalidCosts;
3996 for (const auto &Plan : VPlans) {
3997 for (ElementCount VF : Plan->vectorFactors()) {
// The VPlan-based cost model is designed for computing vector costs.
// Querying it with a scalar VF would cause errors, because most widen
// recipes expect a vector VF.
4002 if (VF.isScalar())
4003 continue;
4004
4005 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
4006 OrigLoop);
4007 precomputeCosts(Plan&: *Plan, VF, CostCtx);
4008 auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
4009 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
4010 for (auto &R : *VPBB) {
4011 if (!R.cost(VF, Ctx&: CostCtx).isValid())
4012 InvalidCosts.emplace_back(Args: &R, Args&: VF);
4013 }
4014 }
4015 }
4016 }
4017 if (InvalidCosts.empty())
4018 return;
4019
4020 // Emit a report of VFs with invalid costs in the loop.
4021
4022 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4023 DenseMap<VPRecipeBase *, unsigned> Numbering;
4024 unsigned I = 0;
4025 for (auto &Pair : InvalidCosts)
4026 if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
4027 ++I;
4028
4029 // Sort the list, first on recipe(number) then on VF.
4030 sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4031 unsigned NA = Numbering[A.first];
4032 unsigned NB = Numbering[B.first];
4033 if (NA != NB)
4034 return NA < NB;
4035 return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
4036 });
4037
4038 // For a list of ordered recipe-VF pairs:
4039 // [(load, VF1), (load, VF2), (store, VF1)]
4040 // group the recipes together to emit separate remarks for:
4041 // load (VF1, VF2)
4042 // store (VF1)
4043 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4044 auto Subset = ArrayRef<RecipeVFPair>();
4045 do {
4046 if (Subset.empty())
4047 Subset = Tail.take_front(N: 1);
4048
4049 VPRecipeBase *R = Subset.front().first;
4050
4051 unsigned Opcode =
4052 TypeSwitch<const VPRecipeBase *, unsigned>(R)
4053 .Case(caseFn: [](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
4054 .Case(
4055 caseFn: [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
4056 .Case(caseFn: [](const VPWidenLoadRecipe *R) { return Instruction::Load; })
4057 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4058 caseFn: [](const auto *R) { return Instruction::Call; })
4059 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4060 VPWidenCastRecipe>(
4061 caseFn: [](const auto *R) { return R->getOpcode(); })
4062 .Case(caseFn: [](const VPInterleaveRecipe *R) {
4063 return R->getStoredValues().empty() ? Instruction::Load
4064 : Instruction::Store;
4065 })
4066 .Case(caseFn: [](const VPReductionRecipe *R) {
4067 return RecurrenceDescriptor::getOpcode(Kind: R->getRecurrenceKind());
4068 });
4069
4070 // If the next recipe is different, or if there are no other pairs,
4071 // emit a remark for the collated subset. e.g.
4072 // [(load, VF1), (load, VF2))]
4073 // to emit:
4074 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4075 if (Subset == Tail || Tail[Subset.size()].first != R) {
4076 std::string OutString;
4077 raw_string_ostream OS(OutString);
4078 assert(!Subset.empty() && "Unexpected empty range");
4079 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4080 for (const auto &Pair : Subset)
4081 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4082 OS << "):";
4083 if (Opcode == Instruction::Call) {
4084 StringRef Name = "";
4085 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
4086 Name = Int->getIntrinsicName();
4087 } else {
4088 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
4089 Function *CalledFn =
4090 WidenCall ? WidenCall->getCalledScalarFunction()
4091 : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
4092 ->getLiveInIRValue());
4093 Name = CalledFn->getName();
4094 }
4095 OS << " call to " << Name;
4096 } else
4097 OS << " " << Instruction::getOpcodeName(Opcode);
4098 reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
4099 DL: R->getDebugLoc());
4100 Tail = Tail.drop_front(N: Subset.size());
4101 Subset = {};
4102 } else
4103 // Grow the subset by one element
4104 Subset = Tail.take_front(N: Subset.size() + 1);
4105 } while (!Tail.empty());
4106}
4107
4108/// Check if any recipe of \p Plan will generate a vector value, which will be
4109/// assigned a vector register.
4110static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4111 const TargetTransformInfo &TTI) {
4112 assert(VF.isVector() && "Checking a scalar VF?");
4113 VPTypeAnalysis TypeInfo(Plan);
4114 DenseSet<VPRecipeBase *> EphemeralRecipes;
4115 collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
4116 // Set of already visited types.
4117 DenseSet<Type *> Visited;
4118 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4119 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
4120 for (VPRecipeBase &R : *VPBB) {
4121 if (EphemeralRecipes.contains(V: &R))
4122 continue;
// Continue early if the recipe is considered not to produce a vector
// result. Note that this includes VPInstruction, even though some of its
// opcodes may produce a vector; this preserves existing behavior, as
// VPInstructions model aspects not directly mapped to existing IR
// instructions.
4127 switch (R.getVPRecipeID()) {
4128 case VPRecipeBase::VPDerivedIVSC:
4129 case VPRecipeBase::VPScalarIVStepsSC:
4130 case VPRecipeBase::VPReplicateSC:
4131 case VPRecipeBase::VPInstructionSC:
4132 case VPRecipeBase::VPCanonicalIVPHISC:
4133 case VPRecipeBase::VPVectorPointerSC:
4134 case VPRecipeBase::VPVectorEndPointerSC:
4135 case VPRecipeBase::VPExpandSCEVSC:
4136 case VPRecipeBase::VPEVLBasedIVPHISC:
4137 case VPRecipeBase::VPPredInstPHISC:
4138 case VPRecipeBase::VPBranchOnMaskSC:
4139 continue;
4140 case VPRecipeBase::VPReductionSC:
4141 case VPRecipeBase::VPActiveLaneMaskPHISC:
4142 case VPRecipeBase::VPWidenCallSC:
4143 case VPRecipeBase::VPWidenCanonicalIVSC:
4144 case VPRecipeBase::VPWidenCastSC:
4145 case VPRecipeBase::VPWidenGEPSC:
4146 case VPRecipeBase::VPWidenIntrinsicSC:
4147 case VPRecipeBase::VPWidenSC:
4148 case VPRecipeBase::VPBlendSC:
4149 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
4150 case VPRecipeBase::VPHistogramSC:
4151 case VPRecipeBase::VPWidenPHISC:
4152 case VPRecipeBase::VPWidenIntOrFpInductionSC:
4153 case VPRecipeBase::VPWidenPointerInductionSC:
4154 case VPRecipeBase::VPReductionPHISC:
4155 case VPRecipeBase::VPInterleaveEVLSC:
4156 case VPRecipeBase::VPInterleaveSC:
4157 case VPRecipeBase::VPWidenLoadEVLSC:
4158 case VPRecipeBase::VPWidenLoadSC:
4159 case VPRecipeBase::VPWidenStoreEVLSC:
4160 case VPRecipeBase::VPWidenStoreSC:
4161 break;
4162 default:
4163 llvm_unreachable("unhandled recipe");
4164 }
4165
4166 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4167 unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
4168 if (!NumLegalParts)
4169 return false;
4170 if (VF.isScalable()) {
4171 // <vscale x 1 x iN> is assumed to be profitable over iN because
4172 // scalable registers are a distinct register class from scalar
4173 // ones. If we ever find a target which wants to lower scalable
4174 // vectors back to scalars, we'll need to update this code to
4175 // explicitly ask TTI about the register class uses for each part.
4176 return NumLegalParts <= VF.getKnownMinValue();
4177 }
// If two or more elements share a single register, the type is genuinely
// vectorized.
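// E.g., for a fixed VF of 4 that legalizes to 2 parts, 2 < 4 and the recipe
// counts as producing vectors; if it legalized to 4 parts (one element per
// part), it would not.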
4179 return NumLegalParts < VF.getFixedValue();
4180 };
4181
// If the recipe defines no value and is not a store (e.g., a branch),
// continue - there is no value to check.
4183 if (R.getNumDefinedValues() == 0 &&
4184 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(Val: &R))
4185 continue;
// For multi-def recipes (currently only interleaved loads), it suffices to
// check the first defined value only.
// For stores check their stored value; for interleaved stores it suffices
// to check the first stored value only. In all cases this is the second
// operand.
4191 VPValue *ToCheck =
4192 R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
4193 Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
4194 if (!Visited.insert(V: {ScalarTy}).second)
4195 continue;
4196 Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
4197 if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
4198 return true;
4199 }
4200 }
4201
4202 return false;
4203}
4204
4205static bool hasReplicatorRegion(VPlan &Plan) {
4206 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4207 G: Plan.getVectorLoopRegion()->getEntry())),
4208 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4209}
4210
4211#ifndef NDEBUG
4212VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4213 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4214 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4215 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4216 assert(
4217 any_of(VPlans,
4218 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4219 "Expected Scalar VF to be a candidate");
4220
4221 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4222 ExpectedCost);
4223 VectorizationFactor ChosenFactor = ScalarCost;
4224
4225 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4226 if (ForceVectorization &&
4227 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4228 // Ignore scalar width, because the user explicitly wants vectorization.
4229 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4230 // evaluation.
4231 ChosenFactor.Cost = InstructionCost::getMax();
4232 }
4233
4234 for (auto &P : VPlans) {
4235 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4236 P->vectorFactors().end());
4237
4238 SmallVector<VPRegisterUsage, 8> RUs;
4239 if (any_of(VFs, [this](ElementCount VF) {
4240 return CM.shouldConsiderRegPressureForVF(VF);
4241 }))
4242 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4243
4244 for (unsigned I = 0; I < VFs.size(); I++) {
4245 ElementCount VF = VFs[I];
4246 // The cost for scalar VF=1 is already calculated, so ignore it.
4247 if (VF.isScalar())
4248 continue;
4249
// If the register pressure needs to be considered for this VF, reject the
// VF if it exceeds the number of registers available on the target.
4253 if (CM.shouldConsiderRegPressureForVF(VF) &&
4254 RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
4255 continue;
4256
4257 InstructionCost C = CM.expectedCost(VF);
4258
4259 // Add on other costs that are modelled in VPlan, but not in the legacy
4260 // cost model.
4261 VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, CM.PSE,
4262 OrigLoop);
4263 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4264 assert(VectorRegion && "Expected to have a vector region!");
4265 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4266 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4267 for (VPRecipeBase &R : *VPBB) {
4268 auto *VPI = dyn_cast<VPInstruction>(&R);
4269 if (!VPI)
4270 continue;
4271 switch (VPI->getOpcode()) {
4272 // Selects are only modelled in the legacy cost model for safe
4273 // divisors.
4274 case Instruction::Select: {
4275 if (auto *WR =
4276 dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
4277 switch (WR->getOpcode()) {
4278 case Instruction::UDiv:
4279 case Instruction::SDiv:
4280 case Instruction::URem:
4281 case Instruction::SRem:
4282 continue;
4283 default:
4284 break;
4285 }
4286 }
4287 C += VPI->cost(VF, CostCtx);
4288 break;
4289 }
4290 case VPInstruction::ActiveLaneMask: {
4291 unsigned Multiplier =
4292 cast<VPConstantInt>(VPI->getOperand(2))->getZExtValue();
4293 C += VPI->cost(VF * Multiplier, CostCtx);
4294 break;
4295 }
4296 case VPInstruction::ExplicitVectorLength:
4297 C += VPI->cost(VF, CostCtx);
4298 break;
4299 default:
4300 break;
4301 }
4302 }
4303 }
4304
4305 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4306 unsigned Width =
4307 estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
4308 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4309 << " costs: " << (Candidate.Cost / Width));
4310 if (VF.isScalable())
4311 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4312 << CM.getVScaleForTuning().value_or(1) << ")");
4313 LLVM_DEBUG(dbgs() << ".\n");
4314
4315 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4316 LLVM_DEBUG(
4317 dbgs()
4318 << "LV: Not considering vector loop of width " << VF
4319 << " because it will not generate any vector instructions.\n");
4320 continue;
4321 }
4322
4323 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4324 LLVM_DEBUG(
4325 dbgs()
4326 << "LV: Not considering vector loop of width " << VF
4327 << " because it would cause replicated blocks to be generated,"
4328 << " which isn't allowed when optimizing for size.\n");
4329 continue;
4330 }
4331
4332 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4333 ChosenFactor = Candidate;
4334 }
4335 }
4336
4337 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4338 reportVectorizationFailure(
4339 "There are conditional stores.",
4340 "store that is conditionally executed prevents vectorization",
4341 "ConditionalStore", ORE, OrigLoop);
4342 ChosenFactor = ScalarCost;
4343 }
4344
4345 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4346 !isMoreProfitable(ChosenFactor, ScalarCost,
4347 !CM.foldTailByMasking())) dbgs()
4348 << "LV: Vectorization seems to be not beneficial, "
4349 << "but was forced by a user.\n");
4350 return ChosenFactor;
4351}
4352#endif
4353
4354/// Returns true if the VPlan contains a VPReductionPHIRecipe with
4355/// FindLast recurrence kind.
4356static bool hasFindLastReductionPhi(VPlan &Plan) {
4357 return any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4358 P: [](VPRecipeBase &R) {
4359 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4360 return RedPhi &&
4361 RecurrenceDescriptor::isFindLastRecurrenceKind(
4362 Kind: RedPhi->getRecurrenceKind());
4363 });
4364}
4365
4366bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4367 ElementCount VF) const {
4368 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4369 // reductions need special handling and are currently unsupported.
4370 if (any_of(Range: OrigLoop->getHeader()->phis(), P: [&](PHINode &Phi) {
4371 if (!Legal->isReductionVariable(PN: &Phi))
4372 return Legal->isFixedOrderRecurrence(Phi: &Phi);
4373 RecurKind Kind =
4374 Legal->getRecurrenceDescriptor(PN: &Phi).getRecurrenceKind();
4375 return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
4376 }))
4377 return false;
4378
4379 // FindLast reductions require special handling for the synthesized mask PHI
4380 // and are currently unsupported for epilogue vectorization.
4381 if (hasFindLastReductionPhi(Plan&: getPlanFor(VF)))
4382 return false;
4383
4384 // Phis with uses outside of the loop require special handling and are
4385 // currently unsupported.
4386 for (const auto &Entry : Legal->getInductionVars()) {
4387 // Look for uses of the value of the induction at the last iteration.
4388 Value *PostInc =
4389 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
4390 for (User *U : PostInc->users())
4391 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4392 return false;
// Look for uses of the penultimate value of the induction.
4394 for (User *U : Entry.first->users())
4395 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4396 return false;
4397 }
4398
// Epilogue vectorization code has not been audited to ensure it handles
// non-latch exits properly. It may be fine, but it needs to be audited and
// tested.
4402 // TODO: Add support for loops with an early exit.
4403 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4404 return false;
4405
4406 return true;
4407}
4408
4409bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4410 const ElementCount VF, const unsigned IC) const {
4411 // FIXME: We need a much better cost-model to take different parameters such
4412 // as register pressure, code size increase and cost of extra branches into
4413 // account. For now we apply a very crude heuristic and only consider loops
4414 // with vectorization factors larger than a certain value.
4415
4416 // Allow the target to opt out entirely.
4417 if (!TTI.preferEpilogueVectorization())
4418 return false;
4419
// We also consider epilogue vectorization unprofitable for targets that
// don't consider interleaving beneficial (e.g., MVE).
4422 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4423 return false;
4424
4425 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4426 ? EpilogueVectorizationMinVF
4427 : TTI.getEpilogueVectorizationMinVF();
4428 return estimateElementCount(VF: VF * IC, VScale: VScaleForTuning) >= MinVFThreshold;
4429}
4430
4431VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4432 const ElementCount MainLoopVF, unsigned IC) {
4433 VectorizationFactor Result = VectorizationFactor::Disabled();
4434 if (!EnableEpilogueVectorization) {
4435 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4436 return Result;
4437 }
4438
4439 if (!CM.isScalarEpilogueAllowed()) {
4440 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4441 "epilogue is allowed.\n");
4442 return Result;
4443 }
4444
4445 // Not really a cost consideration, but check for unsupported cases here to
4446 // simplify the logic.
4447 if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
4448 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4449 "is not a supported candidate.\n");
4450 return Result;
4451 }
4452
4453 if (EpilogueVectorizationForceVF > 1) {
4454 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4455 ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
4456 if (hasPlanWithVF(VF: ForcedEC))
4457 return {ForcedEC, 0, 0};
4458
4459 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4460 "viable.\n");
4461 return Result;
4462 }
4463
4464 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4465 LLVM_DEBUG(
4466 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4467 return Result;
4468 }
4469
4470 if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
4471 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4472 "this loop\n");
4473 return Result;
4474 }
4475
4476 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4477 // the main loop handles 8 lanes per iteration. We could still benefit from
4478 // vectorizing the epilogue loop with VF=4.
4479 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4480 MinVal: estimateElementCount(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));
4481
4482 Type *TCType = Legal->getWidestInductionType();
4483 const SCEV *RemainingIterations = nullptr;
4484 unsigned MaxTripCount = 0;
4485 const SCEV *TC = vputils::getSCEVExprForVPValue(
4486 V: getPlanFor(VF: MainLoopVF).getTripCount(), PSE);
4487 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4488 const SCEV *KnownMinTC;
4489 bool ScalableTC = match(S: TC, P: m_scev_c_Mul(Op0: m_SCEV(V&: KnownMinTC), Op1: m_SCEVVScale()));
4490 bool ScalableRemIter = false;
4491 ScalarEvolution &SE = *PSE.getSE();
4492 // Use versions of TC and VF in which both are either scalable or fixed.
4493 if (ScalableTC == MainLoopVF.isScalable()) {
4494 ScalableRemIter = ScalableTC;
4495 RemainingIterations =
4496 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
4497 } else if (ScalableTC) {
4498 const SCEV *EstimatedTC = SE.getMulExpr(
4499 LHS: KnownMinTC,
4500 RHS: SE.getConstant(Ty: TCType, V: CM.getVScaleForTuning().value_or(u: 1)));
4501 RemainingIterations = SE.getURemExpr(
4502 LHS: EstimatedTC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
4503 } else
4504 RemainingIterations =
4505 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: EstimatedRuntimeVF * IC));
4506
4507 // No iterations left to process in the epilogue.
4508 if (RemainingIterations->isZero())
4509 return Result;
4510
4511 if (MainLoopVF.isFixed()) {
4512 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4513 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
4514 RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
4515 MaxTripCount = SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
4516 }
4517 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4518 << MaxTripCount << "\n");
4519 }
4520
4521 auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
4522 return SE.isKnownPredicate(Pred: CmpInst::ICMP_UGT, LHS: VF, RHS: RemIter);
4523 };
4524 for (auto &NextVF : ProfitableVFs) {
4525 // Skip candidate VFs without a corresponding VPlan.
4526 if (!hasPlanWithVF(VF: NextVF.Width))
4527 continue;
4528
4529 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4530 // vectors) or > the VF of the main loop (fixed vectors).
4531 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4532 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
4533 (NextVF.Width.isScalable() &&
4534 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) ||
4535 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4536 ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF)))
4537 continue;
4538
4539 // If NextVF is greater than the number of remaining iterations, the
4540 // epilogue loop would be dead. Skip such factors.
// TODO: We should also consider comparing against a scalable
// RemainingIterations once SCEV is able to evaluate non-canonical
// vscale-based expressions.
4544 if (!ScalableRemIter) {
4545 // Handle the case where NextVF and RemainingIterations are in different
4546 // numerical spaces.
4547 ElementCount EC = NextVF.Width;
4548 if (NextVF.Width.isScalable())
4549 EC = ElementCount::getFixed(
4550 MinVal: estimateElementCount(VF: NextVF.Width, VScale: CM.getVScaleForTuning()));
4551 if (SkipVF(SE.getElementCount(Ty: TCType, EC), RemainingIterations))
4552 continue;
4553 }
4554
4555 if (Result.Width.isScalar() ||
4556 isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking(),
4557 /*IsEpilogue*/ true))
4558 Result = NextVF;
4559 }
4560
4561 if (Result != VectorizationFactor::Disabled())
4562 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4563 << Result.Width << "\n");
4564 return Result;
4565}
4566
4567std::pair<unsigned, unsigned>
4568LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4569 unsigned MinWidth = -1U;
4570 unsigned MaxWidth = 8;
4571 const DataLayout &DL = TheFunction->getDataLayout();
4572 // For in-loop reductions, no element types are added to ElementTypesInLoop
4573 // if there are no loads/stores in the loop. In this case, check through the
4574 // reduction variables to determine the maximum width.
4575 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4576 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4577 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4578 // When finding the min width used by the recurrence we need to account
4579 // for casts on the input operands of the recurrence.
4580 MinWidth = std::min(
4581 a: MinWidth,
4582 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4583 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4584 MaxWidth = std::max(a: MaxWidth,
4585 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4586 }
4587 } else {
4588 for (Type *T : ElementTypesInLoop) {
4589 MinWidth = std::min<unsigned>(
4590 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4591 MaxWidth = std::max<unsigned>(
4592 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4593 }
4594 }
4595 return {MinWidth, MaxWidth};
4596}
4597
4598void LoopVectorizationCostModel::collectElementTypesForWidening() {
4599 ElementTypesInLoop.clear();
4600 // For each block.
4601 for (BasicBlock *BB : TheLoop->blocks()) {
4602 // For each instruction in the loop.
4603 for (Instruction &I : BB->instructionsWithoutDebug()) {
4604 Type *T = I.getType();
4605
4606 // Skip ignored values.
4607 if (ValuesToIgnore.count(Ptr: &I))
4608 continue;
4609
4610 // Only examine Loads, Stores and PHINodes.
4611 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4612 continue;
4613
4614 // Examine PHI nodes that are reduction variables. Update the type to
4615 // account for the recurrence type.
4616 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4617 if (!Legal->isReductionVariable(PN))
4618 continue;
4619 const RecurrenceDescriptor &RdxDesc =
4620 Legal->getRecurrenceDescriptor(PN);
4621 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4622 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4623 Ty: RdxDesc.getRecurrenceType()))
4624 continue;
4625 T = RdxDesc.getRecurrenceType();
4626 }
4627
4628 // Examine the stored values.
4629 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4630 T = ST->getValueOperand()->getType();
4631
4632 assert(T->isSized() &&
4633 "Expected the load/store/recurrence type to be sized");
4634
4635 ElementTypesInLoop.insert(Ptr: T);
4636 }
4637 }
4638}
4639
4640unsigned
4641LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
4642 InstructionCost LoopCost) {
4643 // -- The interleave heuristics --
4644 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4645 // There are many micro-architectural considerations that we can't predict
4646 // at this level. For example, frontend pressure (on decode or fetch) due to
4647 // code size, or the number and capabilities of the execution ports.
4648 //
4649 // We use the following heuristics to select the interleave count:
4650 // 1. If the code has reductions, then we interleave to break the cross
4651 // iteration dependency.
4652 // 2. If the loop is really small, then we interleave to reduce the loop
4653 // overhead.
4654 // 3. We don't interleave if we think that we will spill registers to memory
4655 // due to the increased register pressure.
4656
// Only interleave tail-folded loops if wide lane masks are requested, as
// otherwise the overhead of the extra instructions needed to compute the
// predicate likely outweighs the benefit. If a scalar epilogue is not
// allowed for any other reason, do not interleave.
4661 if (!CM.isScalarEpilogueAllowed() &&
4662 !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
4663 return 1;
4664
4665 if (any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4666 P: IsaPred<VPEVLBasedIVPHIRecipe>)) {
4667 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4668 "Unroll factor forced to be 1.\n");
4669 return 1;
4670 }
4671
// The maximum safe dependence distance has already been used to bound the
// VF, so do not interleave on top of it.
4673 if (!Legal->isSafeForAnyVectorWidth())
4674 return 1;
4675
4676 // We don't attempt to perform interleaving for loops with uncountable early
4677 // exits because the VPInstruction::AnyOf code cannot currently handle
4678 // multiple parts.
4679 if (Plan.hasEarlyExit())
4680 return 1;
4681
4682 const bool HasReductions =
4683 any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4684 P: IsaPred<VPReductionPHIRecipe>);
4685
4686 // FIXME: implement interleaving for FindLast transform correctly.
4687 if (hasFindLastReductionPhi(Plan))
4688 return 1;
4689
4690 // If we did not calculate the cost for VF (because the user selected the VF)
4691 // then we calculate the cost of VF here.
4692 if (LoopCost == 0) {
4693 if (VF.isScalar())
4694 LoopCost = CM.expectedCost(VF);
4695 else
4696 LoopCost = cost(Plan, VF);
4697 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4698
4699 // Loop body is free and there is no need for interleaving.
4700 if (LoopCost == 0)
4701 return 1;
4702 }
4703
4704 VPRegisterUsage R =
4705 calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore: CM.ValuesToIgnore)[0];
4706 // We divide by these constants so assume that we have at least one
4707 // instruction that uses at least one register.
4708 for (auto &Pair : R.MaxLocalUsers) {
4709 Pair.second = std::max(a: Pair.second, b: 1U);
4710 }
4711
4712 // We calculate the interleave count using the following formula.
4713 // Subtract the number of loop invariants from the number of available
4714 // registers. These registers are used by all of the interleaved instances.
// Next, divide the remaining registers by the number of registers that are
// required by the loop, in order to estimate how many parallel instances
// fit without causing spills. All of this is rounded down if necessary to
// be a power of two. We want a power-of-two interleave count to simplify
// any addressing operations or alignment considerations.
// We also want power-of-two interleave counts to ensure that the induction
// variable of the vector loop wraps to zero when the tail is folded by
// masking; this currently happens when optimizing for size, in which case
// IC is set to 1 above.
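// Worked example (illustrative numbers only): with 32 registers in a class,
// 4 loop-invariant values, and a maximum local usage of 7 registers,
// bit_floor((32 - 4) / 7) = bit_floor(4) = 4 interleaved instances fit.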
4723 unsigned IC = UINT_MAX;
4724
4725 for (const auto &Pair : R.MaxLocalUsers) {
4726 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
4727 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4728 << " registers of "
4729 << TTI.getRegisterClassName(Pair.first)
4730 << " register class\n");
4731 if (VF.isScalar()) {
4732 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4733 TargetNumRegisters = ForceTargetNumScalarRegs;
4734 } else {
4735 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4736 TargetNumRegisters = ForceTargetNumVectorRegs;
4737 }
4738 unsigned MaxLocalUsers = Pair.second;
4739 unsigned LoopInvariantRegs = 0;
4740 if (R.LoopInvariantRegs.contains(Key: Pair.first))
4741 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
4742
4743 unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
4744 MaxLocalUsers);
4745 // Don't count the induction variable as interleaved.
4746 if (EnableIndVarRegisterHeur) {
4747 TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
4748 std::max(a: 1U, b: (MaxLocalUsers - 1)));
4749 }
4750
4751 IC = std::min(a: IC, b: TmpIC);
4752 }
4753
4754 // Clamp the interleave ranges to reasonable counts.
4755 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4756
4757 // Check if the user has overridden the max.
4758 if (VF.isScalar()) {
4759 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4760 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4761 } else {
4762 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4763 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4764 }
4765
4766 // Try to get the exact trip count, or an estimate based on profiling data or
4767 // ConstantMax from PSE, failing that.
4768 auto BestKnownTC = getSmallBestKnownTC(PSE, L: OrigLoop);
4769
4770 // For fixed length VFs treat a scalable trip count as unknown.
4771 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
4772 // Re-evaluate trip counts and VFs to be in the same numerical space.
4773 unsigned AvailableTC =
4774 estimateElementCount(VF: *BestKnownTC, VScale: CM.getVScaleForTuning());
4775 unsigned EstimatedVF = estimateElementCount(VF, VScale: CM.getVScaleForTuning());
4776
// At least one iteration must remain scalar when this constraint holds, so
// the maximum number of iterations available for interleaving is one less.
4779 if (CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()))
4780 --AvailableTC;
4781
4782 unsigned InterleaveCountLB = bit_floor(Value: std::max(
4783 a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));
4784
4785 if (getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop).isNonZero()) {
4786 // If the best known trip count is exact, we select between two
4787 // prospective ICs, where
4788 //
4789 // 1) the aggressive IC is capped by the trip count divided by VF
4790 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4791 //
4792 // The final IC is selected in a way that the epilogue loop trip count is
4793 // minimized while maximizing the IC itself, so that we either run the
4794 // vector loop at least once if it generates a small epilogue loop, or
4795 // else we run the vector loop at least twice.
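// For example, with AvailableTC = 32, EstimatedVF = 4 and
// MaxInterleaveCount = 8: the conservative IC is bit_floor(32 / 8) = 4 and
// the aggressive IC is bit_floor(32 / 4) = 8; both leave a scalar tail of
// 0, so the aggressive IC (8) is chosen.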
4796
4797 unsigned InterleaveCountUB = bit_floor(Value: std::max(
4798 a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
4799 MaxInterleaveCount = InterleaveCountLB;
4800
4801 if (InterleaveCountUB != InterleaveCountLB) {
4802 unsigned TailTripCountUB =
4803 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4804 unsigned TailTripCountLB =
4805 (AvailableTC % (EstimatedVF * InterleaveCountLB));
// If both produce the same scalar tail, maximize the IC to do the same
// work in fewer vector loop iterations.
4808 if (TailTripCountUB == TailTripCountLB)
4809 MaxInterleaveCount = InterleaveCountUB;
4810 }
4811 } else {
// If the trip count is only an estimated compile-time constant, cap the
// IC at the trip count divided by (VF * 2), so that the vector loop runs
// at least twice to make interleaving seem profitable when there is an
// epilogue loop present. Since the exact trip count is not known, we
// choose to be conservative in our IC estimate.
4817 MaxInterleaveCount = InterleaveCountLB;
4818 }
4819 }
4820
4821 assert(MaxInterleaveCount > 0 &&
4822 "Maximum interleave count must be greater than 0");
4823
4824 // Clamp the calculated IC to be between the 1 and the max interleave count
4825 // that the target and trip count allows.
4826 if (IC > MaxInterleaveCount)
4827 IC = MaxInterleaveCount;
4828 else
4829 // Make sure IC is greater than 0.
4830 IC = std::max(a: 1u, b: IC);
4831
4832 assert(IC > 0 && "Interleave count must be greater than 0.");
4833
4834 // Interleave if we vectorized this loop and there is a reduction that could
4835 // benefit from interleaving.
4836 if (VF.isVector() && HasReductions) {
4837 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
4838 return IC;
4839 }
4840
4841 // For any scalar loop that either requires runtime checks or predication we
4842 // are better off leaving this to the unroller. Note that if we've already
4843 // vectorized the loop we will have done the runtime check and so interleaving
4844 // won't require further checks.
4845 bool ScalarInterleavingRequiresPredication =
4846 (VF.isScalar() && any_of(Range: OrigLoop->blocks(), P: [this](BasicBlock *BB) {
4847 return Legal->blockNeedsPredication(BB);
4848 }));
4849 bool ScalarInterleavingRequiresRuntimePointerCheck =
4850 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
4851
4852 // We want to interleave small loops in order to reduce the loop overhead and
4853 // potentially expose ILP opportunities.
4854 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
4855 << "LV: IC is " << IC << '\n'
4856 << "LV: VF is " << VF << '\n');
4857 const bool AggressivelyInterleaveReductions =
4858 TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
4859 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
4860 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
4861 // We assume that the cost overhead is 1 and we use the cost model
4862 // to estimate the cost of the loop and interleave until the cost of the
4863 // loop overhead is about 5% of the cost of the loop.
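// E.g., if SmallLoopCost were 20 and LoopCost were 4, this would give
// SmallIC = min(IC, bit_floor(20 / 4)) = min(IC, 4).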
4864 unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
4865 Value: SmallLoopCost / LoopCost.getValue()));
4866
4867 // Interleave until store/load ports (estimated by max interleave count) are
4868 // saturated.
4869 unsigned NumStores = 0;
4870 unsigned NumLoads = 0;
4871 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4872 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
4873 for (VPRecipeBase &R : *VPBB) {
4874 if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: &R)) {
4875 NumLoads++;
4876 continue;
4877 }
4878 if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(Val: &R)) {
4879 NumStores++;
4880 continue;
4881 }
4882
4883 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R)) {
4884 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
4885 NumStores += StoreOps;
4886 else
4887 NumLoads += InterleaveR->getNumDefinedValues();
4888 continue;
4889 }
4890 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
4891 NumLoads += isa<LoadInst>(Val: RepR->getUnderlyingInstr());
4892 NumStores += isa<StoreInst>(Val: RepR->getUnderlyingInstr());
4893 continue;
4894 }
4895 if (isa<VPHistogramRecipe>(Val: &R)) {
4896 NumLoads++;
4897 NumStores++;
4898 continue;
4899 }
4900 }
4901 }
4902 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
4903 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
4904
4905 // There is little point in interleaving for reductions containing selects
4906 // and compares when VF=1 since it may just create more overhead than it's
4907 // worth for loops with small trip counts. This is because we still have to
4908 // do the final reduction after the loop.
4909 bool HasSelectCmpReductions =
4910 HasReductions &&
4911 any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4912 P: [](VPRecipeBase &R) {
4913 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4914 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
4915 Kind: RedR->getRecurrenceKind()) ||
4916 RecurrenceDescriptor::isFindIVRecurrenceKind(
4917 Kind: RedR->getRecurrenceKind()));
4918 });
4919 if (HasSelectCmpReductions) {
4920 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
4921 return 1;
4922 }
4923
4924 // If we have a scalar reduction (vector reductions are already dealt with
4925 // by this point), we can increase the critical path length if the loop
4926 // we're interleaving is inside another loop. For tree-wise reductions
4927 // set the limit to 2, and for ordered reductions it's best to disable
4928 // interleaving entirely.
4929 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
4930 bool HasOrderedReductions =
4931 any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4932 P: [](VPRecipeBase &R) {
4933 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4934
4935 return RedR && RedR->isOrdered();
4936 });
4937 if (HasOrderedReductions) {
4938 LLVM_DEBUG(
4939 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
4940 return 1;
4941 }
4942
4943 unsigned F = MaxNestedScalarReductionIC;
4944 SmallIC = std::min(a: SmallIC, b: F);
4945 StoresIC = std::min(a: StoresIC, b: F);
4946 LoadsIC = std::min(a: LoadsIC, b: F);
4947 }
4948
4949 if (EnableLoadStoreRuntimeInterleave &&
4950 std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
4951 LLVM_DEBUG(
4952 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
4953 return std::max(a: StoresIC, b: LoadsIC);
4954 }
4955
4956 // If there are scalar reductions and TTI has enabled aggressive
4957 // interleaving for reductions, we will interleave to expose ILP.
4958 if (VF.isScalar() && AggressivelyInterleaveReductions) {
4959 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
// Interleave no less than SmallIC, but not as aggressively as the normal
// IC, to handle the rare situation where resources are too limited.
4962 return std::max(a: IC / 2, b: SmallIC);
4963 }
4964
4965 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
4966 return SmallIC;
4967 }
4968
4969 // Interleave if this is a large loop (small loops are already dealt with by
4970 // this point) that could benefit from interleaving.
4971 if (AggressivelyInterleaveReductions) {
4972 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
4973 return IC;
4974 }
4975
4976 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
4977 return 1;
4978}
4979
4980bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4981 ElementCount VF) {
4982 // TODO: Cost model for emulated masked load/store is completely
4983 // broken. This hack guides the cost model to use an artificially
4984 // high enough value to practically disable vectorization with such
// operations, except where the previously deployed legality hack allowed
// using very low cost values. This is to avoid regressions coming simply
// from moving the "masked load/store" check from legality to the cost
// model. Masked Load/Gather emulation was previously never allowed.
// Only a limited number of emulated Masked Stores/Scatters was allowed.
4990 assert((isPredicatedInst(I)) &&
4991 "Expecting a scalar emulated instruction");
4992 return isa<LoadInst>(Val: I) ||
4993 (isa<StoreInst>(Val: I) &&
4994 NumPredStores > NumberOfStoresToPredicate);
4995}
4996
4997void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4998 assert(VF.isVector() && "Expected VF >= 2");
4999
5000 // If we've already collected the instructions to scalarize or the predicated
5001 // BBs after vectorization, there's nothing to do. Collection may already have
5002 // occurred if we have a user-selected VF and are now computing the expected
5003 // cost for interleaving.
5004 if (InstsToScalarize.contains(Key: VF) ||
5005 PredicatedBBsAfterVectorization.contains(Val: VF))
5006 return;
5007
// Initialize a mapping for VF in InstsToScalarize. If we find that it's
5009 // not profitable to scalarize any instructions, the presence of VF in the
5010 // map will indicate that we've analyzed it already.
5011 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5012
5013 // Find all the instructions that are scalar with predication in the loop and
// determine if it would be better not to if-convert the blocks they are in.
5015 // If so, we also record the instructions to scalarize.
5016 for (BasicBlock *BB : TheLoop->blocks()) {
5017 if (!blockNeedsPredicationForAnyReason(BB))
5018 continue;
5019 for (Instruction &I : *BB)
5020 if (isScalarWithPredication(I: &I, VF)) {
5021 ScalarCostsTy ScalarCosts;
5022 // Do not apply discount logic for:
5023 // 1. Scalars after vectorization, as there will only be a single copy
5024 // of the instruction.
5025 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5026 // 3. Emulated masked memrefs, if a hacked cost is needed.
5027 if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
5028 !useEmulatedMaskMemRefHack(I: &I, VF) &&
5029 computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
5030 for (const auto &[I, IC] : ScalarCosts)
5031 ScalarCostsVF.insert(KV: {I, IC});
5032 // Check if we decided to scalarize a call. If so, update the widening
5033 // decision of the call to CM_Scalarize with the computed scalar cost.
5034 for (const auto &[I, Cost] : ScalarCosts) {
5035 auto *CI = dyn_cast<CallInst>(Val: I);
5036 if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
5037 continue;
5038 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5039 CallWideningDecisions[{CI, VF}].Cost = Cost;
5040 }
5041 }
5042 // Remember that BB will remain after vectorization.
5043 PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
5044 for (auto *Pred : predecessors(BB)) {
5045 if (Pred->getSingleSuccessor() == BB)
5046 PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
5047 }
5048 }
5049 }
5050}
5051
5052InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5053 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5054 assert(!isUniformAfterVectorization(PredInst, VF) &&
5055 "Instruction marked uniform-after-vectorization will be predicated");
5056
5057 // Initialize the discount to zero, meaning that the scalar version and the
5058 // vector version cost the same.
5059 InstructionCost Discount = 0;
5060
5061 // Holds instructions to analyze. The instructions we visit are mapped in
5062 // ScalarCosts. Those instructions are the ones that would be scalarized if
5063 // we find that the scalar version costs less.
5064 SmallVector<Instruction *, 8> Worklist;
5065
5066 // Returns true if the given instruction can be scalarized.
5067 auto CanBeScalarized = [&](Instruction *I) -> bool {
5068 // We only attempt to scalarize instructions forming a single-use chain
5069 // from the original predicated block that would otherwise be vectorized.
5070 // Although not strictly necessary, we give up on instructions we know will
5071 // already be scalar to avoid traversing chains that are unlikely to be
5072 // beneficial.
5073 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5074 isScalarAfterVectorization(I, VF))
5075 return false;
5076
5077 // If the instruction is scalar with predication, it will be analyzed
5078 // separately. We ignore it within the context of PredInst.
5079 if (isScalarWithPredication(I, VF))
5080 return false;
5081
5082 // If any of the instruction's operands are uniform after vectorization,
5083 // the instruction cannot be scalarized. This prevents, for example, a
5084 // masked load from being scalarized.
5085 //
5086 // We assume we will only emit a value for lane zero of an instruction
5087 // marked uniform after vectorization, rather than VF identical values.
5088 // Thus, if we scalarize an instruction that uses a uniform, we would
5089 // create uses of values corresponding to the lanes we aren't emitting code
5090 // for. This behavior can be changed by allowing getScalarValue to clone
5091 // the lane zero values for uniforms rather than asserting.
5092 for (Use &U : I->operands())
5093 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
5094 if (isUniformAfterVectorization(I: J, VF))
5095 return false;
5096
5097 // Otherwise, we can scalarize the instruction.
5098 return true;
5099 };
5100
5101 // Compute the expected cost discount from scalarizing the entire expression
5102 // feeding the predicated instruction. We currently only consider expressions
5103 // that are single-use instruction chains.
5104 Worklist.push_back(Elt: PredInst);
5105 while (!Worklist.empty()) {
5106 Instruction *I = Worklist.pop_back_val();
5107
5108 // If we've already analyzed the instruction, there's nothing to do.
5109 if (ScalarCosts.contains(Key: I))
5110 continue;
5111
5112 // Cannot scalarize fixed-order recurrence phis at the moment.
5113 if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
5114 continue;
5115
5116 // Compute the cost of the vector instruction. Note that this cost already
5117 // includes the scalarization overhead of the predicated instruction.
5118 InstructionCost VectorCost = getInstructionCost(I, VF);
5119
5120 // Compute the cost of the scalarized instruction. This cost is the cost of
5121 // the instruction as if it wasn't if-converted and instead remained in the
5122 // predicated block. We will scale this cost by block probability after
5123 // computing the scalarization overhead.
5124 InstructionCost ScalarCost =
5125 VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));
5126
5127 // Compute the scalarization overhead of needed insertelement instructions
5128 // and phi nodes.
5129 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5130 Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5131 for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
5132 ScalarCost += TTI.getScalarizationOverhead(
5133 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5134 /*Insert=*/true,
5135 /*Extract=*/false, CostKind);
5136 }
5137 ScalarCost +=
5138 VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
5139 }
5140
5141 // Compute the scalarization overhead of needed extractelement
5142 // instructions. For each of the instruction's operands, if the operand can
5143 // be scalarized, add it to the worklist; otherwise, account for the
5144 // overhead.
5145 for (Use &U : I->operands())
5146 if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
5147 assert(canVectorizeTy(J->getType()) &&
5148 "Instruction has non-scalar type");
5149 if (CanBeScalarized(J))
5150 Worklist.push_back(Elt: J);
5151 else if (needsExtract(V: J, VF)) {
5152 Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
5153 for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
5154 ScalarCost += TTI.getScalarizationOverhead(
5155 Ty: cast<VectorType>(Val: VectorTy),
5156 DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
5157 /*Extract*/ true, CostKind);
5158 }
5159 }
5160 }
5161
5162 // Scale the total scalar cost by block probability.
5163 ScalarCost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());
5164
5165 // Compute the discount. A non-negative discount means the vector version
5166 // of the instruction costs more, and scalarizing would be beneficial.
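// E.g., if VectorCost = 12 and the probability-scaled ScalarCost = 8, the
// running discount increases by 4, making scalarization more attractive.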
5167 Discount += VectorCost - ScalarCost;
5168 ScalarCosts[I] = ScalarCost;
5169 }
5170
5171 return Discount;
5172}
5173
5174InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5175 InstructionCost Cost;
5176
5177 // If the vector loop gets executed exactly once with the given VF, ignore the
5178 // costs of comparison and induction instructions, as they'll get simplified
5179 // away.
5180 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5181 auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
5182 if (TC == VF && !foldTailByMasking())
5183 addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
5184 InstsToIgnore&: ValuesToIgnoreForVF);
5185
5186 // For each block.
5187 for (BasicBlock *BB : TheLoop->blocks()) {
5188 InstructionCost BlockCost;
5189
5190 // For each instruction in the old loop.
5191 for (Instruction &I : BB->instructionsWithoutDebug()) {
5192 // Skip ignored values.
5193 if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
5194 (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
5195 continue;
5196
5197 InstructionCost C = getInstructionCost(I: &I, VF);
5198
5199 // Check if we should override the cost.
5200 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
5201 // For interleave groups, use ForceTargetInstructionCost once for the
5202 // whole group.
5203 if (VF.isVector() && getWideningDecision(I: &I, VF) == CM_Interleave) {
5204 if (getInterleavedAccessGroup(Instr: &I)->getInsertPos() == &I)
5205 C = InstructionCost(ForceTargetInstructionCost);
5206 else
5207 C = InstructionCost(0);
5208 } else {
5209 C = InstructionCost(ForceTargetInstructionCost);
5210 }
5211 }
5212
5213 BlockCost += C;
5214 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5215 << VF << " For instruction: " << I << '\n');
5216 }
5217
5218 // If we are vectorizing a predicated block, it will have been
5219 // if-converted. This means that the block's instructions (aside from
5220 // stores and instructions that may divide by zero) will now be
5221 // unconditionally executed. For the scalar case, we may not always execute
5222 // the predicated block, if it is an if-else block. Thus, scale the block's
5223 // cost by the probability of executing it.
5224 // getPredBlockCostDivisor will return 1 for blocks that are only predicated
5225 // by the header mask when folding the tail.
5226 if (VF.isScalar())
5227 BlockCost /= getPredBlockCostDivisor(CostKind, BB);
5228
5229 Cost += BlockCost;
5230 }
5231
5232 return Cost;
5233}
5234
5235/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
5236/// according to isAddressSCEVForCost.
5237///
5238/// This SCEV can be sent to the Target in order to estimate the address
5239/// calculation cost.
5240static const SCEV *getAddressAccessSCEV(
5241 Value *Ptr,
5242 PredicatedScalarEvolution &PSE,
5243 const Loop *TheLoop) {
5244 const SCEV *Addr = PSE.getSCEV(V: Ptr);
5245 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L: TheLoop) ? Addr
5246 : nullptr;
5247}
5248
5249InstructionCost
5250LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5251 ElementCount VF) {
5252 assert(VF.isVector() &&
5253 "Scalarization cost of instruction implies vectorization.");
5254 if (VF.isScalable())
5255 return InstructionCost::getInvalid();
5256
5257 Type *ValTy = getLoadStoreType(I);
5258 auto *SE = PSE.getSE();
5259
5260 unsigned AS = getLoadStoreAddressSpace(I);
5261 Value *Ptr = getLoadStorePointerOperand(V: I);
5262 Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
5263 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5264 // that it is being called from this specific place.
5265
// Figure out whether the access is strided and get the stride value if it
// is known at compile time.
5268 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);
5269
5270 // Get the cost of the scalar memory instruction and address computation.
5271 InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
5272 PtrTy, SE, Ptr: PtrSCEV, CostKind);
5273
5274 // Don't pass *I here, since it is scalar but will actually be part of a
5275 // vectorized loop where the user of it is a vectorized instruction.
5276 const Align Alignment = getLoadStoreAlignment(I);
5277 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5278 Cost += VF.getFixedValue() *
5279 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy->getScalarType(), Alignment,
5280 AddressSpace: AS, CostKind, OpdInfo: OpInfo);
5281
5282 // Get the overhead of the extractelement and insertelement instructions
5283 // we might create due to scalarization.
5284 Cost += getScalarizationOverhead(I, VF);
5285
5286 // If we have a predicated load/store, it will need extra i1 extracts and
5287 // conditional branches, but may not be executed for each vector lane. Scale
5288 // the cost by the probability of executing the predicated block.
5289 if (isPredicatedInst(I)) {
5290 Cost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());
5291
5292 // Add the cost of an i1 extract and a branch
5293 auto *VecI1Ty =
5294 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
5295 Cost += TTI.getScalarizationOverhead(
5296 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5297 /*Insert=*/false, /*Extract=*/true, CostKind);
5298 Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
5299
5300 if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially set the cost to a value high enough to practically
      // disable vectorization with such operations.
5303 Cost = 3000000;
5304 }
5305
5306 return Cost;
5307}
5308
5309InstructionCost
5310LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5311 ElementCount VF) {
5312 Type *ValTy = getLoadStoreType(I);
5313 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5314 Value *Ptr = getLoadStorePointerOperand(V: I);
5315 unsigned AS = getLoadStoreAddressSpace(I);
5316 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5317
5318 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5319 "Stride should be 1 or -1 for consecutive memory access");
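  // For illustration: a load of A[i] inside the loop has stride 1 and becomes
  // a single wide (or masked) load, while a load of A[N - i] has stride -1 and
  // additionally pays for the SK_Reverse shuffle added below.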
5320 const Align Alignment = getLoadStoreAlignment(I);
5321 InstructionCost Cost = 0;
5322 if (Legal->isMaskRequired(I)) {
5323 unsigned IID = I->getOpcode() == Instruction::Load
5324 ? Intrinsic::masked_load
5325 : Intrinsic::masked_store;
5326 Cost += TTI.getMemIntrinsicInstrCost(
5327 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS), CostKind);
5328 } else {
5329 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5330 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5331 CostKind, OpdInfo: OpInfo, I);
5332 }
5333
5334 bool Reverse = ConsecutiveStride < 0;
5335 if (Reverse)
5336 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5337 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5338 return Cost;
5339}
5340
5341InstructionCost
5342LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5343 ElementCount VF) {
5344 assert(Legal->isUniformMemOp(*I, VF));
5345
5346 Type *ValTy = getLoadStoreType(I);
5347 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5348 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5349 const Align Alignment = getLoadStoreAlignment(I);
5350 unsigned AS = getLoadStoreAddressSpace(I);
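  // For illustration: a loop-invariant load such as `X = *P;` is modeled below
  // as roughly one scalar address computation, one scalar load and an
  // SK_Broadcast to splat the loaded value to its vector users; a uniform
  // store additionally needs an extract of the last lane unless the stored
  // value is itself loop-invariant.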
5351 if (isa<LoadInst>(Val: I)) {
5352 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5353 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5354 CostKind) +
5355 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5356 SrcTy: VectorTy, Mask: {}, CostKind);
5357 }
5358 StoreInst *SI = cast<StoreInst>(Val: I);
5359
5360 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5361 // TODO: We have existing tests that request the cost of extracting element
5362 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5363 // the actual generated code, which involves extracting the last element of
5364 // a scalable vector where the lane to extract is unknown at compile time.
5365 InstructionCost Cost =
5366 TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5367 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, CostKind);
5368 if (!IsLoopInvariantStoreValue)
5369 Cost += TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
5370 Val: VectorTy, CostKind, Index: 0);
5371 return Cost;
5372}
5373
5374InstructionCost
5375LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5376 ElementCount VF) {
5377 Type *ValTy = getLoadStoreType(I);
5378 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5379 const Align Alignment = getLoadStoreAlignment(I);
5380 Value *Ptr = getLoadStorePointerOperand(V: I);
5381 Type *PtrTy = Ptr->getType();
5382
5383 if (!Legal->isUniform(V: Ptr, VF))
5384 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
5385
5386 unsigned IID = I->getOpcode() == Instruction::Load
5387 ? Intrinsic::masked_gather
5388 : Intrinsic::masked_scatter;
5389 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5390 TTI.getMemIntrinsicInstrCost(
5391 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Ptr,
5392 Legal->isMaskRequired(I), Alignment, I),
5393 CostKind);
5394}
5395
5396InstructionCost
5397LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5398 ElementCount VF) {
5399 const auto *Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Failed to get an interleaved access group.");
5401
5402 Instruction *InsertPos = Group->getInsertPos();
5403 Type *ValTy = getLoadStoreType(I: InsertPos);
5404 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5405 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
5406
5407 unsigned InterleaveFactor = Group->getFactor();
5408 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
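  // For illustration: a factor-2 group loading both A[2*i] and A[2*i+1] is
  // costed as one wide access of <VF * 2 x ValTy> elements plus whatever
  // shuffling the target reports for de-interleaving the members, all folded
  // into the getInterleavedMemoryOpCost query below.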
5409
5410 // Holds the indices of existing members in the interleaved group.
5411 SmallVector<unsigned, 4> Indices;
5412 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5413 if (Group->getMember(Index: IF))
5414 Indices.push_back(Elt: IF);
5415
5416 // Calculate the cost of the whole interleaved group.
5417 bool UseMaskForGaps =
5418 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5419 (isa<StoreInst>(Val: I) && !Group->isFull());
5420 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5421 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
5422 Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I),
5423 UseMaskForGaps);
5424
5425 if (Group->isReverse()) {
5426 // TODO: Add support for reversed masked interleaved access.
5427 assert(!Legal->isMaskRequired(I) &&
5428 "Reverse masked interleaved access not supported.");
5429 Cost += Group->getNumMembers() *
5430 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5431 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5432 }
5433 return Cost;
5434}
5435
5436std::optional<InstructionCost>
5437LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
5438 ElementCount VF,
5439 Type *Ty) const {
5440 using namespace llvm::PatternMatch;
5441 // Early exit for no inloop reductions
5442 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
5443 return std::nullopt;
5444 auto *VectorTy = cast<VectorType>(Val: Ty);
5445
  // We are looking for one of the following patterns, and for the minimal
  // acceptable cost:
5447 // reduce(mul(ext(A), ext(B))) or
5448 // reduce(mul(A, B)) or
5449 // reduce(ext(A)) or
5450 // reduce(A).
5451 // The basic idea is that we walk down the tree to do that, finding the root
5452 // reduction instruction in InLoopReductionImmediateChains. From there we find
5453 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return std::nullopt, indicating that the original cost
  // modelling should be used instead.
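  // For example (a sketch): for a source loop like
  //   for (i) Sum += (int32_t)A[i] * (int32_t)B[i];   // A and B hold i8 data
  // we compare the cost of the separate ext + ext + mul + reduce.add
  // instructions against a single getMulAccReductionCost query; if the latter
  // is cheaper, it is returned for the reduction instruction and 0 is returned
  // for the mul/ext instructions feeding it.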
5458 Instruction *RetI = I;
5459 if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
5460 if (!RetI->hasOneUser())
5461 return std::nullopt;
5462 RetI = RetI->user_back();
5463 }
5464
5465 if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
5466 RetI->user_back()->getOpcode() == Instruction::Add) {
5467 RetI = RetI->user_back();
5468 }
5469
  // Test if the found instruction is a reduction. If it is not, return
  // std::nullopt so the caller falls back to the original cost modelling.
5472 Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
5473 if (!LastChain)
5474 return std::nullopt;
5475
5476 // Find the reduction this chain is a part of and calculate the basic cost of
5477 // the reduction on its own.
5478 Instruction *ReductionPhi = LastChain;
5479 while (!isa<PHINode>(Val: ReductionPhi))
5480 ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);
5481
5482 const RecurrenceDescriptor &RdxDesc =
5483 Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));
5484
5485 InstructionCost BaseCost;
5486 RecurKind RK = RdxDesc.getRecurrenceKind();
5487 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
5488 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5489 BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
5490 FMF: RdxDesc.getFastMathFlags(), CostKind);
5491 } else {
5492 BaseCost = TTI.getArithmeticReductionCost(
5493 Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
5494 }
5495
5496 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5497 // normal fmul instruction to the cost of the fadd reduction.
5498 if (RK == RecurKind::FMulAdd)
5499 BaseCost +=
5500 TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);
5501
5502 // If we're using ordered reductions then we can just return the base cost
5503 // here, since getArithmeticReductionCost calculates the full ordered
5504 // reduction cost when FP reassociation is not allowed.
5505 if (useOrderedReductions(RdxDesc))
5506 return BaseCost;
5507
5508 // Get the operand that was not the reduction chain and match it to one of the
5509 // patterns, returning the better cost if it is found.
5510 Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
5511 ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
5512 : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));
5513
5514 VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);
5515
5516 Instruction *Op0, *Op1;
5517 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5518 match(V: RedOp,
5519 P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
5520 match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
5521 Op0->getOpcode() == Op1->getOpcode() &&
5522 Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
5523 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
5524 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5525
    // Matched reduce.add(ext(mul(ext(A), ext(B)))).
    // Note that the extend opcodes all need to match; alternatively, if A==B,
    // they will have been converted to zext(mul(sext(A), sext(A))) as the
    // result is known positive, which is equally fine.
5530 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
5531 auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
5532 auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);
5533
5534 InstructionCost ExtCost =
5535 TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
5536 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
5537 InstructionCost MulCost =
5538 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
5539 InstructionCost Ext2Cost =
5540 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
5541 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
5542
5543 InstructionCost RedCost = TTI.getMulAccReductionCost(
5544 IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
5545 CostKind);
5546
5547 if (RedCost.isValid() &&
5548 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5549 return I == RetI ? RedCost : 0;
5550 } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
5551 !TheLoop->isLoopInvariant(V: RedOp)) {
5552 // Matched reduce(ext(A))
5553 bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
5554 auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
5555 InstructionCost RedCost = TTI.getExtendedReductionCost(
5556 Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
5557 FMF: RdxDesc.getFastMathFlags(), CostKind);
5558
5559 InstructionCost ExtCost =
5560 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
5561 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
5562 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5563 return I == RetI ? RedCost : 0;
5564 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5565 match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
5566 if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
5567 Op0->getOpcode() == Op1->getOpcode() &&
5568 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
5569 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
5570 Type *Op0Ty = Op0->getOperand(i: 0)->getType();
5571 Type *Op1Ty = Op1->getOperand(i: 0)->getType();
5572 Type *LargestOpTy =
5573 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5574 : Op0Ty;
5575 auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);
5576
      // Matched reduce.add(mul(ext(A), ext(B))), where the two exts may have
      // different sizes. We take the largest type as the ext to reduce, and
      // add the remaining cost as, for example, reduce(mul(ext(ext(A)),
      // ext(B))).
5580 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5581 Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
5582 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
5583 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5584 Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
5585 CCH: TTI::CastContextHint::None, CostKind, I: Op1);
5586 InstructionCost MulCost =
5587 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
5588
5589 InstructionCost RedCost = TTI.getMulAccReductionCost(
5590 IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
5591 CostKind);
5592 InstructionCost ExtraExtCost = 0;
5593 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5594 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5595 ExtraExtCost = TTI.getCastInstrCost(
5596 Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
5597 Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
5598 CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
5599 }
5600
5601 if (RedCost.isValid() &&
5602 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5603 return I == RetI ? RedCost : 0;
5604 } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
5605 // Matched reduce.add(mul())
5606 InstructionCost MulCost =
5607 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
5608
5609 InstructionCost RedCost = TTI.getMulAccReductionCost(
5610 IsUnsigned: true, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy,
5611 CostKind);
5612
5613 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5614 return I == RetI ? RedCost : 0;
5615 }
5616 }
5617
5618 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5619}
5620
5621InstructionCost
5622LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5623 ElementCount VF) {
5624 // Calculate scalar cost only. Vectorization cost should be ready at this
5625 // moment.
5626 if (VF.isScalar()) {
5627 Type *ValTy = getLoadStoreType(I);
5628 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5629 const Align Alignment = getLoadStoreAlignment(I);
5630 unsigned AS = getLoadStoreAddressSpace(I);
5631
5632 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5633 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5634 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5635 OpdInfo: OpInfo, I);
5636 }
5637 return getWideningCost(I, VF);
5638}
5639
5640InstructionCost
5641LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5642 ElementCount VF) const {
5643
5644 // There is no mechanism yet to create a scalable scalarization loop,
5645 // so this is currently Invalid.
5646 if (VF.isScalable())
5647 return InstructionCost::getInvalid();
5648
5649 if (VF.isScalar())
5650 return 0;
5651
5652 InstructionCost Cost = 0;
5653 Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5654 if (!RetTy->isVoidTy() &&
5655 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5656
5657 TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None;
5658 if (isa<LoadInst>(Val: I))
5659 VIC = TTI::VectorInstrContext::Load;
5660 else if (isa<StoreInst>(Val: I))
5661 VIC = TTI::VectorInstrContext::Store;
5662
5663 for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
5664 Cost += TTI.getScalarizationOverhead(
5665 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5666 /*Insert=*/true, /*Extract=*/false, CostKind,
5667 /*ForPoisonSrc=*/true, VL: {}, VIC);
5668 }
5669 }
5670
5671 // Some targets keep addresses scalar.
5672 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
5673 return Cost;
5674
5675 // Some targets support efficient element stores.
5676 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
5677 return Cost;
5678
5679 // Collect operands to consider.
5680 CallInst *CI = dyn_cast<CallInst>(Val: I);
5681 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5682
5683 // Skip operands that do not require extraction/scalarization and do not incur
5684 // any overhead.
5685 SmallVector<Type *> Tys;
5686 for (auto *V : filterExtractingOperands(Ops, VF))
5687 Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));
5688
5689 TTI::VectorInstrContext OperandVIC = isa<StoreInst>(Val: I)
5690 ? TTI::VectorInstrContext::Store
5691 : TTI::VectorInstrContext::None;
5692 return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC: OperandVIC);
5693}
5694
5695void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5696 if (VF.isScalar())
5697 return;
5698 NumPredStores = 0;
5699 for (BasicBlock *BB : TheLoop->blocks()) {
5700 // For each instruction in the old loop.
5701 for (Instruction &I : *BB) {
5702 Value *Ptr = getLoadStorePointerOperand(V: &I);
5703 if (!Ptr)
5704 continue;
5705
5706 // TODO: We should generate better code and update the cost model for
5707 // predicated uniform stores. Today they are treated as any other
5708 // predicated store (see added test cases in
5709 // invariant-store-vectorization.ll).
5710 if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
5711 NumPredStores++;
5712
5713 if (Legal->isUniformMemOp(I, VF)) {
5714 auto IsLegalToScalarize = [&]() {
5715 if (!VF.isScalable())
5716 // Scalarization of fixed length vectors "just works".
5717 return true;
5718
5719 // We have dedicated lowering for unpredicated uniform loads and
5720 // stores. Note that even with tail folding we know that at least
5721 // one lane is active (i.e. generalized predication is not possible
5722 // here), and the logic below depends on this fact.
5723 if (!foldTailByMasking())
5724 return true;
5725
5726 // For scalable vectors, a uniform memop load is always
5727 // uniform-by-parts and we know how to scalarize that.
5728 if (isa<LoadInst>(Val: I))
5729 return true;
5730
          // A uniform store isn't necessarily uniform-by-parts,
          // so we can't assume scalarization.
5733 auto &SI = cast<StoreInst>(Val&: I);
5734 return TheLoop->isLoopInvariant(V: SI.getValueOperand());
5735 };
5736
5737 const InstructionCost GatherScatterCost =
5738 isLegalGatherOrScatter(V: &I, VF) ?
5739 getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();
5740
5741 // Load: Scalar load + broadcast
5742 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
5743 // FIXME: This cost is a significant under-estimate for tail folded
5744 // memory ops.
5745 const InstructionCost ScalarizationCost =
5746 IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
5747 : InstructionCost::getInvalid();
5748
        // Choose the better solution for the current VF. Note that Invalid
        // costs compare as maximally large; if both are invalid, we end up
        // with an invalid cost, which signals a failure and a vectorization
        // abort.
5752 if (GatherScatterCost < ScalarizationCost)
5753 setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
5754 else
5755 setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
5756 continue;
5757 }
5758
5759 // We assume that widening is the best solution when possible.
5760 if (memoryInstructionCanBeWidened(I: &I, VF)) {
5761 InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
5762 int ConsecutiveStride = Legal->isConsecutivePtr(
5763 AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
5764 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5765 "Expected consecutive stride.");
5766 InstWidening Decision =
5767 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
5768 setWideningDecision(I: &I, VF, W: Decision, Cost);
5769 continue;
5770 }
5771
5772 // Choose between Interleaving, Gather/Scatter or Scalarization.
5773 InstructionCost InterleaveCost = InstructionCost::getInvalid();
5774 unsigned NumAccesses = 1;
5775 if (isAccessInterleaved(Instr: &I)) {
5776 const auto *Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Failed to get an interleaved access group.");
5778
5779 // Make one decision for the whole group.
5780 if (getWideningDecision(I: &I, VF) != CM_Unknown)
5781 continue;
5782
5783 NumAccesses = Group->getNumMembers();
5784 if (interleavedAccessCanBeWidened(I: &I, VF))
5785 InterleaveCost = getInterleaveGroupCost(I: &I, VF);
5786 }
5787
5788 InstructionCost GatherScatterCost =
5789 isLegalGatherOrScatter(V: &I, VF)
5790 ? getGatherScatterCost(I: &I, VF) * NumAccesses
5791 : InstructionCost::getInvalid();
5792
5793 InstructionCost ScalarizationCost =
5794 getMemInstScalarizationCost(I: &I, VF) * NumAccesses;
5795
      // Choose the better solution for the current VF, record the decision,
      // and use it during vectorization.
5798 InstructionCost Cost;
5799 InstWidening Decision;
5800 if (InterleaveCost <= GatherScatterCost &&
5801 InterleaveCost < ScalarizationCost) {
5802 Decision = CM_Interleave;
5803 Cost = InterleaveCost;
5804 } else if (GatherScatterCost < ScalarizationCost) {
5805 Decision = CM_GatherScatter;
5806 Cost = GatherScatterCost;
5807 } else {
5808 Decision = CM_Scalarize;
5809 Cost = ScalarizationCost;
5810 }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is computed for the whole group,
      // but it will actually be assigned to a single instruction.
5814 if (const auto *Group = getInterleavedAccessGroup(Instr: &I)) {
5815 if (Decision == CM_Scalarize) {
5816 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5817 if (auto *I = Group->getMember(Index: Idx)) {
5818 setWideningDecision(I, VF, W: Decision,
5819 Cost: getMemInstScalarizationCost(I, VF));
5820 }
5821 }
5822 } else {
5823 setWideningDecision(Grp: Group, VF, W: Decision, Cost);
5824 }
5825 } else
5826 setWideningDecision(I: &I, VF, W: Decision, Cost);
5827 }
5828 }
5829
5830 // Make sure that any load of address and any other address computation
5831 // remains scalar unless there is gather/scatter support. This avoids
5832 // inevitable extracts into address registers, and also has the benefit of
5833 // activating LSR more, since that pass can't optimize vectorized
5834 // addresses.
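  // For illustration: for an access such as `X = B[A[i]]` on a target without
  // efficient gathers, the load of A[i] only feeds address computation;
  // keeping it (and the instructions feeding it) scalar avoids extracting
  // every lane of a vectorized A[i] just to form scalar addresses.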
5835 if (TTI.prefersVectorizedAddressing())
5836 return;
5837
5838 // Start with all scalar pointer uses.
5839 SmallPtrSet<Instruction *, 8> AddrDefs;
5840 for (BasicBlock *BB : TheLoop->blocks())
5841 for (Instruction &I : *BB) {
5842 Instruction *PtrDef =
5843 dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
5844 if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
5845 getWideningDecision(I: &I, VF) != CM_GatherScatter)
5846 AddrDefs.insert(Ptr: PtrDef);
5847 }
5848
5849 // Add all instructions used to generate the addresses.
5850 SmallVector<Instruction *, 4> Worklist;
5851 append_range(C&: Worklist, R&: AddrDefs);
5852 while (!Worklist.empty()) {
5853 Instruction *I = Worklist.pop_back_val();
5854 for (auto &Op : I->operands())
5855 if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
5856 if (TheLoop->contains(Inst: InstOp) && !isa<PHINode>(Val: InstOp) &&
5857 AddrDefs.insert(Ptr: InstOp).second)
5858 Worklist.push_back(Elt: InstOp);
5859 }
5860
5861 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
5862 // If there are direct memory op users of the newly scalarized load,
5863 // their cost may have changed because there's no scalarization
5864 // overhead for the operand. Update it.
5865 for (User *U : LI->users()) {
5866 if (!isa<LoadInst, StoreInst>(Val: U))
5867 continue;
5868 if (getWideningDecision(I: cast<Instruction>(Val: U), VF) != CM_Scalarize)
5869 continue;
5870 setWideningDecision(
5871 I: cast<Instruction>(Val: U), VF, W: CM_Scalarize,
5872 Cost: getMemInstScalarizationCost(I: cast<Instruction>(Val: U), VF));
5873 }
5874 };
5875 for (auto *I : AddrDefs) {
5876 if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here when we know that is the case.
5881 InstWidening Decision = getWideningDecision(I, VF);
5882 if (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
5883 (!isPredicatedInst(I) && !Legal->isUniformMemOp(I&: *I, VF) &&
5884 Decision == CM_Scalarize)) {
5885 // Scalarize a widened load of address or update the cost of a scalar
5886 // load of an address.
5887 setWideningDecision(
5888 I, VF, W: CM_Scalarize,
5889 Cost: (VF.getKnownMinValue() *
5890 getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
5891 UpdateMemOpUserCost(cast<LoadInst>(Val: I));
5892 } else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
      // Scalarize all members of this interleaved group when any member
      // is used as an address. The address-used load skips the scalarization
      // overhead; the other members include it.
5896 for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
5897 if (Instruction *Member = Group->getMember(Index: Idx)) {
5898 InstructionCost Cost =
5899 AddrDefs.contains(Ptr: Member)
5900 ? (VF.getKnownMinValue() *
5901 getMemoryInstructionCost(I: Member,
5902 VF: ElementCount::getFixed(MinVal: 1)))
5903 : getMemInstScalarizationCost(I: Member, VF);
5904 setWideningDecision(I: Member, VF, W: CM_Scalarize, Cost);
5905 UpdateMemOpUserCost(cast<LoadInst>(Val: Member));
5906 }
5907 }
5908 }
5909 } else {
5910 // Cannot scalarize fixed-order recurrence phis at the moment.
5911 if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
5912 continue;
5913
5914 // Make sure I gets scalarized and a cost estimate without
5915 // scalarization overhead.
5916 ForcedScalars[VF].insert(Ptr: I);
5917 }
5918 }
5919}
5920
5921void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
5922 assert(!VF.isScalar() &&
5923 "Trying to set a vectorization decision for a scalar VF");
5924
5925 auto ForcedScalar = ForcedScalars.find(Val: VF);
5926 for (BasicBlock *BB : TheLoop->blocks()) {
5927 // For each instruction in the old loop.
5928 for (Instruction &I : *BB) {
5929 CallInst *CI = dyn_cast<CallInst>(Val: &I);
5930
5931 if (!CI)
5932 continue;
5933
5934 InstructionCost ScalarCost = InstructionCost::getInvalid();
5935 InstructionCost VectorCost = InstructionCost::getInvalid();
5936 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
5937 Function *ScalarFunc = CI->getCalledFunction();
5938 Type *ScalarRetTy = CI->getType();
5939 SmallVector<Type *, 4> Tys, ScalarTys;
5940 for (auto &ArgOp : CI->args())
5941 ScalarTys.push_back(Elt: ArgOp->getType());
5942
      // Estimate the cost of a scalarized vector call. The source operands
      // are assumed to be vectors, so we need to extract individual elements
      // from them, execute VF scalar calls, and then gather the results back
      // into the vector return value.
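      // For illustration, at VF = 4 a call `Y = Foo(X)` is modeled as roughly
      // 4 lane extracts of X, 4 scalar calls to Foo, and 4 lane inserts to
      // rebuild the vector result; the extract/insert part is what
      // getScalarizationOverhead accounts for below.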
5947 if (VF.isFixed()) {
5948 InstructionCost ScalarCallCost =
5949 TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);
5950
5951 // Compute costs of unpacking argument values for the scalar calls and
5952 // packing the return values to a vector.
5953 InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);
5954 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5955 } else {
5956 // There is no point attempting to calculate the scalar cost for a
5957 // scalable VF as we know it will be Invalid.
5958 assert(!getScalarizationOverhead(CI, VF).isValid() &&
5959 "Unexpected valid cost for scalarizing scalable vectors");
5960 ScalarCost = InstructionCost::getInvalid();
5961 }
5962
5963 // Honor ForcedScalars and UniformAfterVectorization decisions.
5964 // TODO: For calls, it might still be more profitable to widen. Use
5965 // VPlan-based cost model to compare different options.
5966 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5967 ForcedScalar->second.contains(Ptr: CI)) ||
5968 isUniformAfterVectorization(I: CI, VF))) {
5969 setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
5970 IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
5971 Cost: ScalarCost);
5972 continue;
5973 }
5974
5975 bool MaskRequired = Legal->isMaskRequired(I: CI);
5976 // Compute corresponding vector type for return value and arguments.
5977 Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
5978 for (Type *ScalarTy : ScalarTys)
5979 Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));
5980
5981 // An in-loop reduction using an fmuladd intrinsic is a special case;
5982 // we don't want the normal cost for that intrinsic.
5983 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
5984 if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
5985 setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
5986 IID: getVectorIntrinsicIDForCall(CI, TLI),
5987 MaskPos: std::nullopt, Cost: *RedCost);
5988 continue;
5989 }
5990
5991 // Find the cost of vectorizing the call, if we can find a suitable
5992 // vector variant of the function.
5993 VFInfo FuncInfo;
5994 Function *VecFunc = nullptr;
5995 // Search through any available variants for one we can use at this VF.
5996 for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
5997 // Must match requested VF.
5998 if (Info.Shape.VF != VF)
5999 continue;
6000
6001 // Must take a mask argument if one is required
6002 if (MaskRequired && !Info.isMasked())
6003 continue;
6004
6005 // Check that all parameter kinds are supported
6006 bool ParamsOk = true;
6007 for (VFParameter Param : Info.Shape.Parameters) {
6008 switch (Param.ParamKind) {
6009 case VFParamKind::Vector:
6010 break;
6011 case VFParamKind::OMP_Uniform: {
6012 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
6013 // Make sure the scalar parameter in the loop is invariant.
6014 if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
6015 L: TheLoop))
6016 ParamsOk = false;
6017 break;
6018 }
6019 case VFParamKind::OMP_Linear: {
6020 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
6021 // Find the stride for the scalar parameter in this loop and see if
6022 // it matches the stride for the variant.
6023 // TODO: do we need to figure out the cost of an extract to get the
6024 // first lane? Or do we hope that it will be folded away?
6025 ScalarEvolution *SE = PSE.getSE();
6026 if (!match(S: SE->getSCEV(V: ScalarParam),
6027 P: m_scev_AffineAddRec(
6028 Op0: m_SCEV(), Op1: m_scev_SpecificSInt(V: Param.LinearStepOrPos),
6029 L: m_SpecificLoop(L: TheLoop))))
6030 ParamsOk = false;
6031 break;
6032 }
6033 case VFParamKind::GlobalPredicate:
6034 break;
6035 default:
6036 ParamsOk = false;
6037 break;
6038 }
6039 }
6040
6041 if (!ParamsOk)
6042 continue;
6043
6044 // Found a suitable candidate, stop here.
6045 VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
6046 FuncInfo = Info;
6047 break;
6048 }
6049
6050 if (TLI && VecFunc && !CI->isNoBuiltin())
6051 VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);
6052
6053 // Find the cost of an intrinsic; some targets may have instructions that
6054 // perform the operation without needing an actual call.
6055 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6056 if (IID != Intrinsic::not_intrinsic)
6057 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6058
6059 InstructionCost Cost = ScalarCost;
6060 InstWidening Decision = CM_Scalarize;
6061
6062 if (VectorCost.isValid() && VectorCost <= Cost) {
6063 Cost = VectorCost;
6064 Decision = CM_VectorCall;
6065 }
6066
6067 if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
6068 Cost = IntrinsicCost;
6069 Decision = CM_IntrinsicCall;
6070 }
6071
6072 setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
6073 MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
6074 }
6075 }
6076}
6077
6078bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6079 if (!Legal->isInvariant(V: Op))
6080 return false;
  // Consider Op invariant only if neither it nor its operands are predicated
  // instructions in the loop; otherwise it is not trivially hoistable.
6083 auto *OpI = dyn_cast<Instruction>(Val: Op);
6084 return !OpI || !TheLoop->contains(Inst: OpI) ||
6085 (!isPredicatedInst(I: OpI) &&
6086 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6087 all_of(Range: OpI->operands(),
6088 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6089}
6090
6091InstructionCost
6092LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6093 ElementCount VF) {
6094 // If we know that this instruction will remain uniform, check the cost of
6095 // the scalar version.
6096 if (isUniformAfterVectorization(I, VF))
6097 VF = ElementCount::getFixed(MinVal: 1);
6098
6099 if (VF.isVector() && isProfitableToScalarize(I, VF))
6100 return InstsToScalarize[VF][I];
6101
6102 // Forced scalars do not have any scalarization overhead.
6103 auto ForcedScalar = ForcedScalars.find(Val: VF);
6104 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6105 auto InstSet = ForcedScalar->second;
6106 if (InstSet.count(Ptr: I))
6107 return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
6108 VF.getKnownMinValue();
6109 }
6110
6111 Type *RetTy = I->getType();
6112 if (canTruncateToMinimalBitwidth(I, VF))
6113 RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
6114 auto *SE = PSE.getSE();
6115
6116 Type *VectorTy;
6117 if (isScalarAfterVectorization(I, VF)) {
6118 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
6119 [this](Instruction *I, ElementCount VF) -> bool {
6120 if (VF.isScalar())
6121 return true;
6122
6123 auto Scalarized = InstsToScalarize.find(Key: VF);
6124 assert(Scalarized != InstsToScalarize.end() &&
6125 "VF not yet analyzed for scalarization profitability");
6126 return !Scalarized->second.count(Key: I) &&
6127 llvm::all_of(Range: I->users(), P: [&](User *U) {
6128 auto *UI = cast<Instruction>(Val: U);
6129 return !Scalarized->second.count(Key: UI);
6130 });
6131 };
6132
6133 // With the exception of GEPs and PHIs, after scalarization there should
6134 // only be one copy of the instruction generated in the loop. This is
6135 // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result, we
    // don't have to multiply the instruction cost by VF.
6138 assert(I->getOpcode() == Instruction::GetElementPtr ||
6139 I->getOpcode() == Instruction::PHI ||
6140 (I->getOpcode() == Instruction::BitCast &&
6141 I->getType()->isPointerTy()) ||
6142 HasSingleCopyAfterVectorization(I, VF));
6143 VectorTy = RetTy;
6144 } else
6145 VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);
6146
6147 if (VF.isVector() && VectorTy->isVectorTy() &&
6148 !TTI.getNumberOfParts(Tp: VectorTy))
6149 return InstructionCost::getInvalid();
6150
6151 // TODO: We need to estimate the cost of intrinsic calls.
6152 switch (I->getOpcode()) {
6153 case Instruction::GetElementPtr:
6154 // We mark this instruction as zero-cost because the cost of GEPs in
6155 // vectorized code depends on whether the corresponding memory instruction
6156 // is scalarized or not. Therefore, we handle GEPs with the memory
6157 // instruction cost.
6158 return 0;
6159 case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6163 // Note that the conditional branch from the loop latch will be replaced by
6164 // a single branch controlling the loop, so there is no extra overhead from
6165 // scalarization.
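    // For illustration: at VF = 4 a scalarized predicated access expands into
    // 4 guarded blocks, so below we charge 4 extracts of the i1 mask plus 4
    // branches.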
6166 bool ScalarPredicatedBB = false;
6167 BranchInst *BI = cast<BranchInst>(Val: I);
6168 if (VF.isVector() && BI->isConditional() &&
6169 (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
6170 PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
6171 BI->getParent() != TheLoop->getLoopLatch())
6172 ScalarPredicatedBB = true;
6173
6174 if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
6176 if (VF.isScalable())
6177 return InstructionCost::getInvalid();
6178 // Return cost for branches around scalarized and predicated blocks.
6179 auto *VecI1Ty =
6180 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
6181 return (
6182 TTI.getScalarizationOverhead(
6183 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
6184 /*Insert*/ false, /*Extract*/ true, CostKind) +
6185 (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
6186 }
6187
6188 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6189 // The back-edge branch will remain, as will all scalar branches.
6190 return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
6191
6192 // This branch will be eliminated by if-conversion.
6193 return 0;
6194 // Note: We currently assume zero cost for an unconditional branch inside
6195 // a predicated block since it will become a fall-through, although we
6196 // may decide in the future to call TTI for all branches.
6197 }
6198 case Instruction::Switch: {
6199 if (VF.isScalar())
6200 return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
6201 auto *Switch = cast<SwitchInst>(Val: I);
6202 return Switch->getNumCases() *
6203 TTI.getCmpSelInstrCost(
6204 Opcode: Instruction::ICmp,
6205 ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
6206 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
6207 VecPred: CmpInst::ICMP_EQ, CostKind);
6208 }
6209 case Instruction::PHI: {
6210 auto *Phi = cast<PHINode>(Val: I);
6211
6212 // First-order recurrences are replaced by vector shuffles inside the loop.
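    // For illustration: a phi carrying the previous iteration's value, e.g. T
    // in `B[i] = A[i] + T; T = A[i];`, is lowered to a splice of the previous
    // and current vector values, which is what the SK_Splice query below
    // models.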
6213 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6214 SmallVector<int> Mask(VF.getKnownMinValue());
6215 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
6216 return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
6217 DstTy: cast<VectorType>(Val: VectorTy),
6218 SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
6219 Index: VF.getKnownMinValue() - 1);
6220 }
6221
6222 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6223 // converted into select instructions. We require N - 1 selects per phi
6224 // node, where N is the number of incoming values.
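    // For illustration: after if-conversion a phi merging values from three
    // predecessors becomes, roughly, two vector selects on the incoming edge
    // masks, hence the (N - 1) * select-cost estimate below.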
6225 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
6226 Type *ResultTy = Phi->getType();
6227
6228 // All instructions in an Any-of reduction chain are narrowed to bool.
6229 // Check if that is the case for this phi node.
6230 auto *HeaderUser = cast_if_present<PHINode>(
6231 Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
6232 auto *Phi = dyn_cast<PHINode>(Val: U);
6233 if (Phi && Phi->getParent() == TheLoop->getHeader())
6234 return Phi;
6235 return nullptr;
6236 }));
6237 if (HeaderUser) {
6238 auto &ReductionVars = Legal->getReductionVars();
6239 auto Iter = ReductionVars.find(Key: HeaderUser);
6240 if (Iter != ReductionVars.end() &&
6241 RecurrenceDescriptor::isAnyOfRecurrenceKind(
6242 Kind: Iter->second.getRecurrenceKind()))
6243 ResultTy = Type::getInt1Ty(C&: Phi->getContext());
6244 }
6245 return (Phi->getNumIncomingValues() - 1) *
6246 TTI.getCmpSelInstrCost(
6247 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
6248 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
6249 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
6250 }
6251
6252 // When tail folding with EVL, if the phi is part of an out of loop
6253 // reduction then it will be transformed into a wide vp_merge.
6254 if (VF.isVector() && foldTailWithEVL() &&
6255 Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
6256 IntrinsicCostAttributes ICA(
6257 Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
6258 {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
6259 return TTI.getIntrinsicInstrCost(ICA, CostKind);
6260 }
6261
6262 return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
6263 }
6264 case Instruction::UDiv:
6265 case Instruction::SDiv:
6266 case Instruction::URem:
6267 case Instruction::SRem:
6268 if (VF.isVector() && isPredicatedInst(I)) {
6269 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6270 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6271 ScalarCost : SafeDivisorCost;
6272 }
6273 // We've proven all lanes safe to speculate, fall through.
6274 [[fallthrough]];
6275 case Instruction::Add:
6276 case Instruction::Sub: {
6277 auto Info = Legal->getHistogramInfo(I);
6278 if (Info && VF.isVector()) {
6279 const HistogramInfo *HGram = Info.value();
6280 // Assume that a non-constant update value (or a constant != 1) requires
6281 // a multiply, and add that into the cost.
6282 InstructionCost MulCost = TTI::TCC_Free;
6283 ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
6284 if (!RHS || RHS->getZExtValue() != 1)
6285 MulCost =
6286 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
6287
6288 // Find the cost of the histogram operation itself.
6289 Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
6290 Type *ScalarTy = I->getType();
6291 Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
6292 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
6293 Type::getVoidTy(C&: I->getContext()),
6294 {PtrTy, ScalarTy, MaskTy});
6295
6296 // Add the costs together with the add/sub operation.
6297 return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
6298 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
6299 }
6300 [[fallthrough]];
6301 }
6302 case Instruction::FAdd:
6303 case Instruction::FSub:
6304 case Instruction::Mul:
6305 case Instruction::FMul:
6306 case Instruction::FDiv:
6307 case Instruction::FRem:
6308 case Instruction::Shl:
6309 case Instruction::LShr:
6310 case Instruction::AShr:
6311 case Instruction::And:
6312 case Instruction::Or:
6313 case Instruction::Xor: {
6314 // If we're speculating on the stride being 1, the multiplication may
6315 // fold away. We can generalize this for all operations using the notion
6316 // of neutral elements. (TODO)
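    // For illustration: a multiply such as `I * Stride`, where the vectorizer
    // speculates Stride == 1 (guarded by a runtime check), folds away
    // entirely, so it is costed as free here.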
6317 if (I->getOpcode() == Instruction::Mul &&
6318 ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
6319 PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
6320 (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
6321 PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
6322 return 0;
6323
6324 // Detect reduction patterns
6325 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
6326 return *RedCost;
6327
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
6330 Value *Op2 = I->getOperand(i: 1);
6331 if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
6332 PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
6333 isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
6334 Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
6335 }
6336 auto Op2Info = TTI.getOperandInfo(V: Op2);
6337 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6338 shouldConsiderInvariant(Op: Op2))
6339 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6340
6341 SmallVector<const Value *, 4> Operands(I->operand_values());
6342 return TTI.getArithmeticInstrCost(
6343 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6344 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6345 Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
6346 }
6347 case Instruction::FNeg: {
6348 return TTI.getArithmeticInstrCost(
6349 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6350 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6351 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6352 Args: I->getOperand(i: 0), CxtI: I);
6353 }
6354 case Instruction::Select: {
6355 SelectInst *SI = cast<SelectInst>(Val: I);
6356 const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
6357 bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));
6358
6359 const Value *Op0, *Op1;
6360 using namespace llvm::PatternMatch;
6361 if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
6362 match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
6363 // select x, y, false --> x & y
6364 // select x, true, y --> x | y
6365 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
6366 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
6367 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6368 Op1->getType()->getScalarSizeInBits() == 1);
6369
6370 return TTI.getArithmeticInstrCost(
6371 Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And,
6372 Ty: VectorTy, CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: {Op0, Op1}, CxtI: I);
6373 }
6374
6375 Type *CondTy = SI->getCondition()->getType();
6376 if (!ScalarCond)
6377 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
6378
6379 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6380 if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
6381 Pred = Cmp->getPredicate();
6382 return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
6383 CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
6384 Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
6385 }
6386 case Instruction::ICmp:
6387 case Instruction::FCmp: {
6388 Type *ValTy = I->getOperand(i: 0)->getType();
6389
6390 if (canTruncateToMinimalBitwidth(I, VF)) {
6391 [[maybe_unused]] Instruction *Op0AsInstruction =
6392 dyn_cast<Instruction>(Val: I->getOperand(i: 0));
6393 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
6394 MinBWs[I] == MinBWs[Op0AsInstruction]) &&
6395 "if both the operand and the compare are marked for "
6396 "truncation, they must have the same bitwidth");
6397 ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
6398 }
6399
6400 VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
6401 return TTI.getCmpSelInstrCost(
6402 Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
6403 VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
6404 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
6405 }
6406 case Instruction::Store:
6407 case Instruction::Load: {
6408 ElementCount Width = VF;
6409 if (Width.isVector()) {
6410 InstWidening Decision = getWideningDecision(I, VF: Width);
6411 assert(Decision != CM_Unknown &&
6412 "CM decision should be taken at this point");
6413 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6414 return InstructionCost::getInvalid();
6415 if (Decision == CM_Scalarize)
6416 Width = ElementCount::getFixed(MinVal: 1);
6417 }
6418 VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
6419 return getMemoryInstructionCost(I, VF);
6420 }
6421 case Instruction::BitCast:
6422 if (I->getType()->isPointerTy())
6423 return 0;
6424 [[fallthrough]];
6425 case Instruction::ZExt:
6426 case Instruction::SExt:
6427 case Instruction::FPToUI:
6428 case Instruction::FPToSI:
6429 case Instruction::FPExt:
6430 case Instruction::PtrToInt:
6431 case Instruction::IntToPtr:
6432 case Instruction::SIToFP:
6433 case Instruction::UIToFP:
6434 case Instruction::Trunc:
6435 case Instruction::FPTrunc: {
6436 // Computes the CastContextHint from a Load/Store instruction.
6437 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6438 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6439 "Expected a load or a store!");
6440
6441 if (VF.isScalar() || !TheLoop->contains(Inst: I))
6442 return TTI::CastContextHint::Normal;
6443
6444 switch (getWideningDecision(I, VF)) {
6445 case LoopVectorizationCostModel::CM_GatherScatter:
6446 return TTI::CastContextHint::GatherScatter;
6447 case LoopVectorizationCostModel::CM_Interleave:
6448 return TTI::CastContextHint::Interleave;
6449 case LoopVectorizationCostModel::CM_Scalarize:
6450 case LoopVectorizationCostModel::CM_Widen:
6451 return isPredicatedInst(I) ? TTI::CastContextHint::Masked
6452 : TTI::CastContextHint::Normal;
6453 case LoopVectorizationCostModel::CM_Widen_Reverse:
6454 return TTI::CastContextHint::Reversed;
6455 case LoopVectorizationCostModel::CM_Unknown:
6456 llvm_unreachable("Instr did not go through cost modelling?");
6457 case LoopVectorizationCostModel::CM_VectorCall:
6458 case LoopVectorizationCostModel::CM_IntrinsicCall:
6459 llvm_unreachable_internal(msg: "Instr has invalid widening decision");
6460 }
6461
6462 llvm_unreachable("Unhandled case!");
6463 };
6464
6465 unsigned Opcode = I->getOpcode();
6466 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6467 // For Trunc, the context is the only user, which must be a StoreInst.
6468 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6469 if (I->hasOneUse())
6470 if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
6471 CCH = ComputeCCH(Store);
6472 }
6473 // For Z/Sext, the context is the operand, which must be a LoadInst.
6474 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6475 Opcode == Instruction::FPExt) {
6476 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
6477 CCH = ComputeCCH(Load);
6478 }
6479
6480 // We optimize the truncation of induction variables having constant
6481 // integer steps. The cost of these truncations is the same as the scalar
6482 // operation.
6483 if (isOptimizableIVTruncate(I, VF)) {
6484 auto *Trunc = cast<TruncInst>(Val: I);
6485 return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
6486 Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
6487 }
6488
6489 // Detect reduction patterns
6490 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
6491 return *RedCost;
6492
6493 Type *SrcScalarTy = I->getOperand(i: 0)->getType();
6494 Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
6495 if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
6496 SrcScalarTy =
6497 IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
6498 Type *SrcVecTy =
6499 VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;
6500
6501 if (canTruncateToMinimalBitwidth(I, VF)) {
6502 // If the result type is <= the source type, there will be no extend
6503 // after truncating the users to the minimal required bitwidth.
6504 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6505 (I->getOpcode() == Instruction::ZExt ||
6506 I->getOpcode() == Instruction::SExt))
6507 return 0;
6508 }
6509
6510 return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
6511 }
6512 case Instruction::Call:
6513 return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
6514 case Instruction::ExtractValue:
6515 return TTI.getInstructionCost(U: I, CostKind);
6516 case Instruction::Alloca:
6517 // We cannot easily widen alloca to a scalable alloca, as
6518 // the result would need to be a vector of pointers.
6519 if (VF.isScalable())
6520 return InstructionCost::getInvalid();
6521 return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: RetTy, CostKind);
6522 default:
6523 // This opcode is unknown. Assume that it is the same as 'mul'.
6524 return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
6525 } // end of switch.
6526}
6527
6528void LoopVectorizationCostModel::collectValuesToIgnore() {
6529 // Ignore ephemeral values.
6530 CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);
6531
6532 SmallVector<Value *, 4> DeadInterleavePointerOps;
6533 SmallVector<Value *, 4> DeadOps;
6534
6535 // If a scalar epilogue is required, users outside the loop won't use
6536 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
6537 // that is the case.
6538 bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
6539 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
6540 return RequiresScalarEpilogue &&
6541 !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
6542 };
6543
6544 LoopBlocksDFS DFS(TheLoop);
6545 DFS.perform(LI);
6546 for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
6547 for (Instruction &I : reverse(C&: *BB)) {
6548 if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
6549 continue;
6550
6551 // Add instructions that would be trivially dead and are only used by
6552 // values already ignored to DeadOps to seed worklist.
6553 if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
6554 all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
6555 return VecValuesToIgnore.contains(Ptr: U) ||
6556 ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
6557 }))
6558 DeadOps.push_back(Elt: &I);
6559
6560 // For interleave groups, we only create a pointer for the start of the
6561 // interleave group. Queue up addresses of group members except the insert
6562 // position for further processing.
6563 if (isAccessInterleaved(Instr: &I)) {
6564 auto *Group = getInterleavedAccessGroup(Instr: &I);
6565 if (Group->getInsertPos() == &I)
6566 continue;
6567 Value *PointerOp = getLoadStorePointerOperand(V: &I);
6568 DeadInterleavePointerOps.push_back(Elt: PointerOp);
6569 }
6570
      // Queue branches for analysis. They are dead if their successors only
      // contain dead instructions.
6573 if (auto *Br = dyn_cast<BranchInst>(Val: &I)) {
6574 if (Br->isConditional())
6575 DeadOps.push_back(Elt: &I);
6576 }
6577 }
6578
6579 // Mark ops feeding interleave group members as free, if they are only used
6580 // by other dead computations.
6581 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6582 auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
6583 if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
6584 Instruction *UI = cast<Instruction>(Val: U);
6585 return !VecValuesToIgnore.contains(Ptr: U) &&
6586 (!isAccessInterleaved(Instr: UI) ||
6587 getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
6588 }))
6589 continue;
6590 VecValuesToIgnore.insert(Ptr: Op);
6591 append_range(C&: DeadInterleavePointerOps, R: Op->operands());
6592 }
6593
6594 // Mark ops that would be trivially dead and are only used by ignored
6595 // instructions as free.
6596 BasicBlock *Header = TheLoop->getHeader();
6597
6598 // Returns true if the block contains only dead instructions. Such blocks will
6599 // be removed by VPlan-to-VPlan transforms and won't be considered by the
6600 // VPlan-based cost model, so skip them in the legacy cost-model as well.
6601 auto IsEmptyBlock = [this](BasicBlock *BB) {
6602 return all_of(Range&: *BB, P: [this](Instruction &I) {
6603 return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
6604 (isa<BranchInst>(Val: &I) && !cast<BranchInst>(Val: &I)->isConditional());
6605 });
6606 };
6607 for (unsigned I = 0; I != DeadOps.size(); ++I) {
6608 auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);
6609
6610 // Check if the branch should be considered dead.
6611 if (auto *Br = dyn_cast_or_null<BranchInst>(Val: Op)) {
6612 BasicBlock *ThenBB = Br->getSuccessor(i: 0);
6613 BasicBlock *ElseBB = Br->getSuccessor(i: 1);
6614 // Don't consider branches leaving the loop for simplification.
6615 if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
6616 continue;
6617 bool ThenEmpty = IsEmptyBlock(ThenBB);
6618 bool ElseEmpty = IsEmptyBlock(ElseBB);
6619 if ((ThenEmpty && ElseEmpty) ||
6620 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
6621 ElseBB->phis().empty()) ||
6622 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
6623 ThenBB->phis().empty())) {
6624 VecValuesToIgnore.insert(Ptr: Br);
6625 DeadOps.push_back(Elt: Br->getCondition());
6626 }
6627 continue;
6628 }
6629
6630 // Skip any op that shouldn't be considered dead.
6631 if (!Op || !TheLoop->contains(Inst: Op) ||
6632 (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
6633 !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
6634 any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
6635 return !VecValuesToIgnore.contains(Ptr: U) &&
6636 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
6637 }))
6638 continue;
6639
6640 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
6641 // which applies for both scalar and vector versions. Otherwise it is only
6642 // dead in vector versions, so only add it to VecValuesToIgnore.
6643 if (all_of(Range: Op->users(),
6644 P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
6645 ValuesToIgnore.insert(Ptr: Op);
6646
6647 VecValuesToIgnore.insert(Ptr: Op);
6648 append_range(C&: DeadOps, R: Op->operands());
6649 }
6650
6651 // Ignore type-promoting instructions we identified during reduction
6652 // detection.
6653 for (const auto &Reduction : Legal->getReductionVars()) {
6654 const RecurrenceDescriptor &RedDes = Reduction.second;
6655 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6656 VecValuesToIgnore.insert_range(R: Casts);
6657 }
6658 // Ignore type-casting instructions we identified during induction
6659 // detection.
6660 for (const auto &Induction : Legal->getInductionVars()) {
6661 const InductionDescriptor &IndDes = Induction.second;
6662 VecValuesToIgnore.insert_range(R: IndDes.getCastInsts());
6663 }
6664}
6665
6666void LoopVectorizationCostModel::collectInLoopReductions() {
6667 // Avoid duplicating work finding in-loop reductions.
6668 if (!InLoopReductions.empty())
6669 return;
6670
6671 for (const auto &Reduction : Legal->getReductionVars()) {
6672 PHINode *Phi = Reduction.first;
6673 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6674
6675 // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
6676 // separately and should not be considered for in-loop reductions.
6677 if (RdxDesc.hasUsesOutsideReductionChain())
6678 continue;
6679
6680 // We don't collect reductions that are type promoted (yet).
6681 if (RdxDesc.getRecurrenceType() != Phi->getType())
6682 continue;
6683
6684 // In-loop AnyOf and FindIV reductions are not yet supported.
6685 RecurKind Kind = RdxDesc.getRecurrenceKind();
6686 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
6687 RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) ||
6688 RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
6689 continue;
6690
6691 // If the target would prefer this reduction to happen "in-loop", then we
6692 // want to record it as such.
6693 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6694 !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
6695 continue;
6696
6697 // Check that we can correctly put the reductions into the loop, by
6698 // finding the chain of operations that leads from the phi to the loop
6699 // exit value.
6700 SmallVector<Instruction *, 4> ReductionOperations =
6701 RdxDesc.getReductionOpChain(Phi, L: TheLoop);
6702 bool InLoop = !ReductionOperations.empty();
6703
6704 if (InLoop) {
6705 InLoopReductions.insert(Ptr: Phi);
6706 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6707 Instruction *LastChain = Phi;
6708 for (auto *I : ReductionOperations) {
6709 InLoopReductionImmediateChains[I] = LastChain;
6710 LastChain = I;
6711 }
6712 }
6713 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6714 << " reduction for phi: " << *Phi << "\n");
6715 }
6716}
6717
6718// This function will select a scalable VF if the target supports scalable
6719// vectors and a fixed one otherwise.
6720// TODO: we could return a pair of values that specify the max VF and
6721// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6722 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan currently
6723 // doesn't have a cost model that can choose which plan to execute if
6724 // more than one is generated.
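// For example: with a 128-bit fixed-width vector register and a widest loop
// type of 32 bits, this returns a fixed VF of 4; if the target enables
// scalable vectorization, the same division is applied to the known minimum
// scalable register size, yielding e.g. vscale x 4 for a 128-bit minimum.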
6725static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6726 LoopVectorizationCostModel &CM) {
6727 unsigned WidestType;
6728 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6729
6730 TargetTransformInfo::RegisterKind RegKind =
6731 TTI.enableScalableVectorization()
6732 ? TargetTransformInfo::RGK_ScalableVector
6733 : TargetTransformInfo::RGK_FixedWidthVector;
6734
6735 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6736 unsigned N = RegSize.getKnownMinValue() / WidestType;
6737 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6738}
6739
6740VectorizationFactor
6741LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6742 ElementCount VF = UserVF;
6743 // Outer loop handling: outer loops may require CFG and instruction-level
6744 // transformations before even evaluating whether vectorization is profitable.
6745 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6746 // the vectorization pipeline.
6747 if (!OrigLoop->isInnermost()) {
6748 // If the user doesn't provide a vectorization factor, determine a
6749 // reasonable one.
6750 if (UserVF.isZero()) {
6751 VF = determineVPlanVF(TTI, CM);
6752 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6753
6754 // Make sure we have a VF > 1 for stress testing.
6755 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6756 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6757 << "overriding computed VF.\n");
6758 VF = ElementCount::getFixed(MinVal: 4);
6759 }
6760 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6761 !ForceTargetSupportsScalableVectors) {
6762 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6763 << "not supported by the target.\n");
6764 reportVectorizationFailure(
6765 DebugMsg: "Scalable vectorization requested but not supported by the target",
6766 OREMsg: "the scalable user-specified vectorization width for outer-loop "
6767 "vectorization cannot be used because the target does not support "
6768 "scalable vectors.",
6769 ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
6770 return VectorizationFactor::Disabled();
6771 }
6772 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6773 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6774 "VF needs to be a power of two");
6775 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6776 << "VF " << VF << " to build VPlans.\n");
6777 buildVPlans(MinVF: VF, MaxVF: VF);
6778
6779 if (VPlans.empty())
6780 return VectorizationFactor::Disabled();
6781
6782 // For VPlan build stress testing, we bail out after VPlan construction.
6783 if (VPlanBuildStressTest)
6784 return VectorizationFactor::Disabled();
6785
6786 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6787 }
6788
6789 LLVM_DEBUG(
6790 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6791 "VPlan-native path.\n");
6792 return VectorizationFactor::Disabled();
6793}
6794
6795void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6796 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6797 CM.collectValuesToIgnore();
6798 CM.collectElementTypesForWidening();
6799
6800 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6801 if (!MaxFactors) // Cases that should not be vectorized or interleaved.
6802 return;
6803
6804 // Invalidate interleave groups if all blocks of the loop will be predicated.
6805 if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
6806 !useMaskedInterleavedAccesses(TTI)) {
6807 LLVM_DEBUG(
6808 dbgs()
6809 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6810 "which requires masked-interleaved support.\n");
6811 if (CM.InterleaveInfo.invalidateGroups())
6812 // Invalidating interleave groups also requires invalidating all decisions
6813 // based on them, which includes widening decisions and uniform and scalar
6814 // values.
6815 CM.invalidateCostModelingDecisions();
6816 }
6817
6818 if (CM.foldTailByMasking())
6819 Legal->prepareToFoldTailByMasking();
6820
6821 ElementCount MaxUserVF =
6822 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6823 if (UserVF) {
6824 if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
6825 reportVectorizationInfo(
6826 Msg: "UserVF ignored because it may be larger than the maximal safe VF",
6827 ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
6828 } else {
6829 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6830 "VF needs to be a power of two");
6831 // Collect the instructions (and their associated costs) that will be more
6832 // profitable to scalarize.
6833 CM.collectInLoopReductions();
6834 if (CM.selectUserVectorizationFactor(UserVF)) {
6835 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6836 buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
6837 LLVM_DEBUG(printPlans(dbgs()));
6838 return;
6839 }
6840 reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
6841 ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
6842 }
6843 }
6844
6845 // Collect the Vectorization Factor Candidates.
6846 SmallVector<ElementCount> VFCandidates;
6847 for (auto VF = ElementCount::getFixed(MinVal: 1);
6848 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
6849 VFCandidates.push_back(Elt: VF);
6850 for (auto VF = ElementCount::getScalable(MinVal: 1);
6851 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
6852 VFCandidates.push_back(Elt: VF);
6853
6854 CM.collectInLoopReductions();
6855 for (const auto &VF : VFCandidates) {
6856 // Collect Uniform and Scalar instructions after vectorization with VF.
6857 CM.collectNonVectorizedAndSetWideningDecisions(VF);
6858 }
6859
6860 buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
6861 buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);
6862
6863 LLVM_DEBUG(printPlans(dbgs()));
6864}
6865
6866InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6867 ElementCount VF) const {
6868 InstructionCost Cost = CM.getInstructionCost(I: UI, VF);
6869 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6870 return InstructionCost(ForceTargetInstructionCost);
6871 return Cost;
6872}
6873
6874bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6875 ElementCount VF) const {
6876 return CM.isUniformAfterVectorization(I, VF);
6877}
6878
6879bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6880 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6881 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6882 SkipCostComputation.contains(Ptr: UI);
6883}
6884
6885unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
6886 return CM.getPredBlockCostDivisor(CostKind, BB);
6887}
6888
6889InstructionCost
6890LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
6891 VPCostContext &CostCtx) const {
6892 InstructionCost Cost;
6893 // Cost modeling for inductions is inaccurate in the legacy cost model
6894 // compared to the recipes that are generated. To match it here initially during
6895 // VPlan cost-model bring-up, directly use the induction costs from the legacy
6896 // cost model. Note that we do this as pre-processing; the VPlan may not have
6897 // any recipes associated with the original induction increment instruction
6898 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6899 // the cost of induction phis and increments (both that are represented by
6900 // recipes and those that are not), to avoid distinguishing between them here,
6901 // and skip all recipes that represent induction phis and increments (the
6902 // former case) later on, if they exist, to avoid counting them twice.
6903 // Similarly we pre-compute the cost of any optimized truncates.
6904 // TODO: Switch to more accurate costing based on VPlan.
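// For example (sketch): for an induction phi %iv with increment
// %iv.next = add %iv, 1 and a user %t = trunc %iv that is an optimizable IV
// truncate, the legacy costs of %iv, %iv.next and %t are all added here and
// the instructions are recorded in SkipCostComputation so that any recipes
// for them are not costed a second time.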
6905 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6906 Instruction *IVInc = cast<Instruction>(
6907 Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
6908 SmallVector<Instruction *> IVInsts = {IVInc};
6909 for (unsigned I = 0; I != IVInsts.size(); I++) {
6910 for (Value *Op : IVInsts[I]->operands()) {
6911 auto *OpI = dyn_cast<Instruction>(Val: Op);
6912 if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
6913 continue;
6914 IVInsts.push_back(Elt: OpI);
6915 }
6916 }
6917 IVInsts.push_back(Elt: IV);
6918 for (User *U : IV->users()) {
6919 auto *CI = cast<Instruction>(Val: U);
6920 if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
6921 continue;
6922 IVInsts.push_back(Elt: CI);
6923 }
6924
6925 // If the vector loop gets executed exactly once with the given VF, ignore
6926 // the costs of comparison and induction instructions, as they'll get
6927 // simplified away.
6928 // TODO: Remove this code after stepping away from the legacy cost model and
6929 // adding code to simplify VPlans before calculating their costs.
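// For example (illustrative): with a constant trip count of 8, VF = 8 and no
// tail folding, the vector loop body executes exactly once, so the induction
// update and exit compare simplify away and are skipped below.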
6930 auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
6931 if (TC == VF && !CM.foldTailByMasking())
6932 addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
6933 InstsToIgnore&: CostCtx.SkipCostComputation);
6934
6935 for (Instruction *IVInst : IVInsts) {
6936 if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
6937 continue;
6938 InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
6939 LLVM_DEBUG({
6940 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6941 << ": induction instruction " << *IVInst << "\n";
6942 });
6943 Cost += InductionCost;
6944 CostCtx.SkipCostComputation.insert(Ptr: IVInst);
6945 }
6946 }
6947
6948 // Compute the cost of all exiting conditions of the loop using the legacy
6949 // cost model. This is to match the legacy behavior, which adds the cost of
6950 // all exit conditions. Note that this over-estimates the cost, as there will
6951 // be a single condition to control the vector loop.
6952 SmallVector<BasicBlock *> Exiting;
6953 CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
6954 SetVector<Instruction *> ExitInstrs;
6955 // Collect all exit conditions.
6956 for (BasicBlock *EB : Exiting) {
6957 auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator());
6958 if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
6959 continue;
6960 if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
6961 ExitInstrs.insert(X: CondI);
6962 }
6963 }
6964 // Compute the cost of all instructions only feeding the exit conditions.
6965 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6966 Instruction *CondI = ExitInstrs[I];
6967 if (!OrigLoop->contains(Inst: CondI) ||
6968 !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
6969 continue;
6970 InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
6971 LLVM_DEBUG({
6972 dbgs() << "Cost of " << CondICost << " for VF " << VF
6973 << ": exit condition instruction " << *CondI << "\n";
6974 });
6975 Cost += CondICost;
6976 for (Value *Op : CondI->operands()) {
6977 auto *OpI = dyn_cast<Instruction>(Val: Op);
6978 if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
6979 any_of(Range: OpI->users(), P: [&ExitInstrs](User *U) {
6980 return !ExitInstrs.contains(key: cast<Instruction>(Val: U));
6981 }))
6982 continue;
6983 ExitInstrs.insert(X: OpI);
6984 }
6985 }
6986
6987 // Pre-compute the costs for branches except for the backedge, as the number
6988 // of replicate regions in a VPlan may not directly match the number of
6989 // branches, which would lead to different decisions.
6990 // TODO: Compute cost of branches for each replicate region in the VPlan,
6991 // which is more accurate than the legacy cost model.
6992 for (BasicBlock *BB : OrigLoop->blocks()) {
6993 if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
6994 continue;
6995 CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
6996 if (BB == OrigLoop->getLoopLatch())
6997 continue;
6998 auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
6999 Cost += BranchCost;
7000 }
7001
7002 // Pre-compute costs for instructions that are forced-scalar or profitable to
7003 // scalarize. Their costs will be computed separately in the legacy cost
7004 // model.
7005 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7006 if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
7007 continue;
7008 CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
7009 InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
7010 LLVM_DEBUG({
7011 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
7012 << ": forced scalar " << *ForcedScalar << "\n";
7013 });
7014 Cost += ForcedCost;
7015 }
7016 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
7017 if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
7018 continue;
7019 CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
7020 LLVM_DEBUG({
7021 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
7022 << ": profitable to scalarize " << *Scalarized << "\n";
7023 });
7024 Cost += ScalarCost;
7025 }
7026
7027 return Cost;
7028}
7029
7030InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7031 ElementCount VF) const {
7032 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
7033 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
7034
7035 // Now compute and add the VPlan-based cost.
7036 Cost += Plan.cost(VF, Ctx&: CostCtx);
7037#ifndef NDEBUG
7038 unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
7039 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
7040 << " (Estimated cost per lane: ");
7041 if (Cost.isValid()) {
7042 double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
7043 LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
7044 } else /* No point dividing an invalid cost - it will still be invalid */
7045 LLVM_DEBUG(dbgs() << "Invalid");
7046 LLVM_DEBUG(dbgs() << ")\n");
7047#endif
7048 return Cost;
7049}
7050
7051#ifndef NDEBUG
7052 /// Return true if the original loop \p TheLoop contains any instructions that do
7053/// not have corresponding recipes in \p Plan and are not marked to be ignored
7054 /// in \p CostCtx. This means the VPlan contains simplifications that the legacy
7055/// cost-model did not account for.
7056static bool planContainsAdditionalSimplifications(VPlan &Plan,
7057 VPCostContext &CostCtx,
7058 Loop *TheLoop,
7059 ElementCount VF) {
7060 // First collect all instructions for the recipes in Plan.
7061 auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
7062 if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
7063 return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
7064 if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
7065 return &WidenMem->getIngredient();
7066 return nullptr;
7067 };
7068
7069 // Check if a select for a safe divisor was hoisted to the pre-header. If so,
7070 // the select doesn't need to be considered for the vector loop cost; go with
7071 // the more accurate VPlan-based cost model.
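// (Such safe-divisor selects are created when widening predicated div/rem;
// see VPRecipeBuilder::tryToWiden later in this file. Whether one ends up in
// the preheader depends on its operands being loop-invariant.)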
7072 for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
7073 auto *VPI = dyn_cast<VPInstruction>(&R);
7074 if (!VPI || VPI->getOpcode() != Instruction::Select)
7075 continue;
7076
7077 if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
7078 switch (WR->getOpcode()) {
7079 case Instruction::UDiv:
7080 case Instruction::SDiv:
7081 case Instruction::URem:
7082 case Instruction::SRem:
7083 return true;
7084 default:
7085 break;
7086 }
7087 }
7088 }
7089
7090 DenseSet<Instruction *> SeenInstrs;
7091 auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
7092 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
7093 for (VPRecipeBase &R : *VPBB) {
7094 if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
7095 auto *IG = IR->getInterleaveGroup();
7096 unsigned NumMembers = IG->getNumMembers();
7097 for (unsigned I = 0; I != NumMembers; ++I) {
7098 if (Instruction *M = IG->getMember(I))
7099 SeenInstrs.insert(M);
7100 }
7101 continue;
7102 }
7103 // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
7104 // cost model won't cost them while the legacy model will.
7105 if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
7106 using namespace VPlanPatternMatch;
7107 if (none_of(FOR->users(),
7108 match_fn(m_VPInstruction<
7109 VPInstruction::FirstOrderRecurrenceSplice>())))
7110 return true;
7111 }
7112 // The VPlan-based cost model is more accurate for partial reductions and
7113 // comparing against the legacy cost isn't desirable.
7114 if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
7115 if (VPR->isPartialReduction())
7116 return true;
7117
7118 // The VPlan-based cost model can recursively analyze whether recipes are
7119 // scalar, but the legacy cost model cannot.
7120 if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
7121 auto *AddrI = dyn_cast<Instruction>(
7122 getLoadStorePointerOperand(&WidenMemR->getIngredient()));
7123 if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
7124 CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
7125 return true;
7126
7127 if (WidenMemR->isReverse()) {
7128 // If the stored value of a reverse store is invariant, LICM will
7129 // hoist the reverse operation to the preheader. In this case, the
7130 // result of the VPlan-based cost model will diverge from that of
7131 // the legacy model.
7132 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
7133 if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
7134 return true;
7135
7136 if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
7137 if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
7138 return true;
7139 }
7140 }
7141
7142 // The legacy cost model costs non-header phis with a scalar VF as a phi,
7143 // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
7144 if (isa<VPBlendRecipe>(&R) &&
7145 vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
7146 return true;
7147
7148 // If a VPlan transform folded a recipe to one producing a single-scalar,
7149 // but the original instruction wasn't uniform-after-vectorization in the
7150 // legacy cost model, the legacy cost overestimates the actual cost.
7151 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
7152 if (RepR->isSingleScalar() &&
7153 !CostCtx.isLegacyUniformAfterVectorization(
7154 RepR->getUnderlyingInstr(), VF))
7155 return true;
7156 }
7157 if (Instruction *UI = GetInstructionForCost(&R)) {
7158 // If we adjusted the predicate of the recipe, the cost in the legacy
7159 // cost model may be different.
7160 using namespace VPlanPatternMatch;
7161 CmpPredicate Pred;
7162 if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
7163 cast<VPRecipeWithIRFlags>(R).getPredicate() !=
7164 cast<CmpInst>(UI)->getPredicate())
7165 return true;
7166
7167 // Recipes with underlying instructions being moved out of the loop
7168 // region by LICM may cause discrepancies between the legacy cost model
7169 // and the VPlan-based cost model.
7170 if (!VPBB->getEnclosingLoopRegion())
7171 return true;
7172
7173 SeenInstrs.insert(UI);
7174 }
7175 }
7176 }
7177
7178 // Return true if the loop contains any instructions that are not also part of
7179 // the VPlan or are skipped for VPlan-based cost computations. This indicates
7180 // that the VPlan contains extra simplifications.
7181 return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
7182 TheLoop](BasicBlock *BB) {
7183 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7184 // Skip induction phis when checking for simplifications, as they may not
7185 // be lowered directly to a corresponding PHI recipe.
7186 if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
7187 CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
7188 return false;
7189 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7190 });
7191 });
7192}
7193#endif
7194
7195VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7196 if (VPlans.empty())
7197 return VectorizationFactor::Disabled();
7198 // If there is a single VPlan with a single VF, return it directly.
7199 VPlan &FirstPlan = *VPlans[0];
7200 if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
7201 return {*FirstPlan.vectorFactors().begin(), 0, 0};
7202
7203 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
7204 << (CM.CostKind == TTI::TCK_RecipThroughput
7205 ? "Reciprocal Throughput\n"
7206 : CM.CostKind == TTI::TCK_Latency
7207 ? "Instruction Latency\n"
7208 : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
7209 : CM.CostKind == TTI::TCK_SizeAndLatency
7210 ? "Code Size and Latency\n"
7211 : "Unknown\n"));
7212
7213 ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
7214 assert(hasPlanWithVF(ScalarVF) &&
7215 "More than a single plan/VF w/o any plan having scalar VF");
7216
7217 // TODO: Compute scalar cost using VPlan-based cost model.
7218 InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
7219 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
7220 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
7221 VectorizationFactor BestFactor = ScalarFactor;
7222
7223 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7224 if (ForceVectorization) {
7225 // Ignore scalar width, because the user explicitly wants vectorization.
7226 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7227 // evaluation.
7228 BestFactor.Cost = InstructionCost::getMax();
7229 }
7230
7231 for (auto &P : VPlans) {
7232 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
7233 P->vectorFactors().end());
7234
7235 SmallVector<VPRegisterUsage, 8> RUs;
7236 if (any_of(Range&: VFs, P: [this](ElementCount VF) {
7237 return CM.shouldConsiderRegPressureForVF(VF);
7238 }))
7239 RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);
7240
7241 for (unsigned I = 0; I < VFs.size(); I++) {
7242 ElementCount VF = VFs[I];
7243 if (VF.isScalar())
7244 continue;
7245 if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
7246 LLVM_DEBUG(
7247 dbgs()
7248 << "LV: Not considering vector loop of width " << VF
7249 << " because it will not generate any vector instructions.\n");
7250 continue;
7251 }
7252 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
7253 LLVM_DEBUG(
7254 dbgs()
7255 << "LV: Not considering vector loop of width " << VF
7256 << " because it would cause replicated blocks to be generated,"
7257 << " which isn't allowed when optimizing for size.\n");
7258 continue;
7259 }
7260
7261 InstructionCost Cost = cost(Plan&: *P, VF);
7262 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7263
7264 if (CM.shouldConsiderRegPressureForVF(VF) &&
7265 RUs[I].exceedsMaxNumRegs(TTI, OverrideMaxNumRegs: ForceTargetNumVectorRegs)) {
7266 LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
7267 << VF << " because it uses too many registers\n");
7268 continue;
7269 }
7270
7271 if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
7272 BestFactor = CurrentFactor;
7273
7274 // If profitable add it to ProfitableVF list.
7275 if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
7276 ProfitableVFs.push_back(Elt: CurrentFactor);
7277 }
7278 }
7279
7280#ifndef NDEBUG
7281 // Select the optimal vectorization factor according to the legacy cost-model.
7282 // This is now only used to verify the decisions by the new VPlan-based
7283 // cost-model and will be retired once the VPlan-based cost-model is
7284 // stabilized.
7285 VectorizationFactor LegacyVF = selectVectorizationFactor();
7286 VPlan &BestPlan = getPlanFor(BestFactor.Width);
7287
7288 // Pre-compute the cost and use it to check if BestPlan contains any
7289 // simplifications not accounted for in the legacy cost model. If that's the
7290 // case, don't trigger the assertion, as the extra simplifications may cause a
7291 // different VF to be picked by the VPlan-based cost model.
7292 VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, CM.PSE,
7293 OrigLoop);
7294 precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
7295 // Verify that the VPlan-based and legacy cost models agree, except for
7296 // * VPlans with early exits,
7297 // * VPlans with additional VPlan simplifications,
7298 // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
7299 // vp_scatter/vp_gather).
7300 // The legacy cost model doesn't properly model costs for such loops.
7301 bool UsesEVLGatherScatter =
7302 any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
7303 BestPlan.getVectorLoopRegion()->getEntry())),
7304 [](VPBasicBlock *VPBB) {
7305 return any_of(*VPBB, [](VPRecipeBase &R) {
7306 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
7307 !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
7308 });
7309 });
7310 assert(
7311 (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
7312 !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
7313 planContainsAdditionalSimplifications(
7314 getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
7315 planContainsAdditionalSimplifications(
7316 getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
7317 " VPlan cost model and legacy cost model disagreed");
7318 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
7319 "when vectorizing, the scalar cost must be computed.");
7320#endif
7321
7322 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
7323 return BestFactor;
7324}
7325
7326 // If \p EpiResumePhiR is the resume VPPhi for a reduction when vectorizing the
7327 // epilogue loop, fix the reduction's scalar PHI node by adding the incoming
7328 // value from the main vector loop.
7329static void fixReductionScalarResumeWhenVectorizingEpilog(
7330 VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
7331 using namespace VPlanPatternMatch;
7332 // Get the VPInstruction computing the reduction result in the middle block.
7333 // The first operand may not be from the middle block if it is not connected
7334 // to the scalar preheader. In that case, there's nothing to fix.
7335 VPValue *Incoming = EpiResumePhiR->getOperand(N: 0);
7336 match(V: Incoming, P: VPlanPatternMatch::m_ZExtOrSExt(
7337 Op0: VPlanPatternMatch::m_VPValue(V&: Incoming)));
7338 auto *EpiRedResult = dyn_cast<VPInstruction>(Val: Incoming);
7339 if (!EpiRedResult)
7340 return;
7341
7342 VPValue *BackedgeVal;
7343 bool IsFindIV = false;
7344 if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult ||
7345 EpiRedResult->getOpcode() == VPInstruction::ComputeReductionResult)
7346 BackedgeVal = EpiRedResult->getOperand(N: EpiRedResult->getNumOperands() - 1);
7347 else if (matchFindIVResult(VPI: EpiRedResult, ReducedIV: m_VPValue(V&: BackedgeVal), Start: m_VPValue()))
7348 IsFindIV = true;
7349 else
7350 return;
7351
7352 auto *EpiRedHeaderPhi = cast_if_present<VPReductionPHIRecipe>(
7353 Val: vputils::findRecipe(Start: BackedgeVal, Pred: IsaPred<VPReductionPHIRecipe>));
7354 if (!EpiRedHeaderPhi) {
7355 match(V: BackedgeVal,
7356 P: VPlanPatternMatch::m_Select(Op0: VPlanPatternMatch::m_VPValue(),
7357 Op1: VPlanPatternMatch::m_VPValue(V&: BackedgeVal),
7358 Op2: VPlanPatternMatch::m_VPValue()));
7359 EpiRedHeaderPhi = cast<VPReductionPHIRecipe>(
7360 Val: vputils::findRecipe(Start: BackedgeVal, Pred: IsaPred<VPReductionPHIRecipe>));
7361 }
7362
7363 Value *MainResumeValue;
7364 if (auto *VPI = dyn_cast<VPInstruction>(Val: EpiRedHeaderPhi->getStartValue())) {
7365 assert((VPI->getOpcode() == VPInstruction::Broadcast ||
7366 VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
7367 "unexpected start recipe");
7368 MainResumeValue = VPI->getOperand(N: 0)->getUnderlyingValue();
7369 } else
7370 MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7371 if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
7372 [[maybe_unused]] Value *StartV =
7373 EpiRedResult->getOperand(N: 0)->getLiveInIRValue();
7374 auto *Cmp = cast<ICmpInst>(Val: MainResumeValue);
7375 assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
7376 "AnyOf expected to start with ICMP_NE");
7377 assert(Cmp->getOperand(1) == StartV &&
7378 "AnyOf expected to start by comparing main resume value to original "
7379 "start value");
7380 MainResumeValue = Cmp->getOperand(i_nocapture: 0);
7381 } else if (IsFindIV) {
7382 MainResumeValue = cast<SelectInst>(Val: MainResumeValue)->getFalseValue();
7383 }
7384 PHINode *MainResumePhi = cast<PHINode>(Val: MainResumeValue);
7385
7386 // When fixing reductions in the epilogue loop we should already have
7387 // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
7388 // over the incoming values correctly.
7389 EpiResumePhi.setIncomingValueForBlock(
7390 BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
7391}
7392
7393DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7394 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7395 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7396 assert(BestVPlan.hasVF(BestVF) &&
7397 "Trying to execute plan with unsupported VF");
7398 assert(BestVPlan.hasUF(BestUF) &&
7399 "Trying to execute plan with unsupported UF");
7400 if (BestVPlan.hasEarlyExit())
7401 ++LoopsEarlyExitVectorized;
7402 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7403 // cost model is complete for better cost estimates.
7404 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
7405 RUN_VPLAN_PASS(VPlanTransforms::materializePacksAndUnpacks, BestVPlan);
7406 RUN_VPLAN_PASS(VPlanTransforms::materializeBroadcasts, BestVPlan);
7407 RUN_VPLAN_PASS(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
7408 bool HasBranchWeights =
7409 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
7410 if (HasBranchWeights) {
7411 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7412 RUN_VPLAN_PASS(VPlanTransforms::addBranchWeightToMiddleTerminator,
7413 BestVPlan, BestVF, VScale);
7414 }
7415
7416 // Checks are the same for all VPlans, added to BestVPlan only for
7417 // compactness.
7418 attachRuntimeChecks(Plan&: BestVPlan, RTChecks&: ILV.RTChecks, HasBranchWeights);
7419
7420 // Retrieve VectorPH now, while the VPlan still has regions and it is easier to find.
7421 VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());
7422
7423 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7424 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
7425 VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan);
7426 if (BestVPlan.getEntry()->getSingleSuccessor() ==
7427 BestVPlan.getScalarPreheader()) {
7428 // TODO: The vector loop would be dead, should not even try to vectorize.
7429 ORE->emit(RemarkBuilder: [&]() {
7430 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
7431 OrigLoop->getStartLoc(),
7432 OrigLoop->getHeader())
7433 << "Created vector loop never executes due to insufficient trip "
7434 "count.";
7435 });
7436 return DenseMap<const SCEV *, Value *>();
7437 }
7438
7439 VPlanTransforms::narrowInterleaveGroups(
7440 Plan&: BestVPlan, VF: BestVF,
7441 VectorRegWidth: TTI.getRegisterBitWidth(K: BestVF.isScalable()
7442 ? TargetTransformInfo::RGK_ScalableVector
7443 : TargetTransformInfo::RGK_FixedWidthVector));
7444 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7445
7446 VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan);
7447 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
7448 VPlanTransforms::convertEVLExitCond(Plan&: BestVPlan);
7449 // Regions are dissolved after optimizing for VF and UF, which completely
7450 // removes unneeded loop regions first.
7451 VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
7452 // Expand BranchOnTwoConds after dissolution, when the latch has direct access
7453 // to its successors.
7454 VPlanTransforms::expandBranchOnTwoConds(Plan&: BestVPlan);
7455 // Canonicalize EVL loops after regions are dissolved.
7456 VPlanTransforms::canonicalizeEVLLoops(Plan&: BestVPlan);
7457 VPlanTransforms::materializeBackedgeTakenCount(Plan&: BestVPlan, VectorPH);
7458 VPlanTransforms::materializeVectorTripCount(
7459 Plan&: BestVPlan, VectorPHVPBB: VectorPH, TailByMasking: CM.foldTailByMasking(),
7460 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: BestVF.isVector()));
7461 VPlanTransforms::materializeVFAndVFxUF(Plan&: BestVPlan, VectorPH, VF: BestVF);
7462 VPlanTransforms::cse(Plan&: BestVPlan);
7463 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
7464
7465 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7466 // making any changes to the CFG.
7467 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7468 VPlanTransforms::expandSCEVs(Plan&: BestVPlan, SE&: *PSE.getSE());
7469 if (!ILV.getTripCount()) {
7470 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7471 } else {
7472 assert(VectorizingEpilogue && "should only re-use the existing trip "
7473 "count during epilogue vectorization");
7474 }
7475
7476 // Perform the actual loop transformation.
7477 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7478 OrigLoop->getParentLoop(),
7479 Legal->getWidestInductionType());
7480
7481#ifdef EXPENSIVE_CHECKS
7482 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7483#endif
7484
7485 // 1. Set up the skeleton for vectorization, including vector pre-header and
7486 // middle block. The vector loop is created during VPlan execution.
7487 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7488 replaceVPBBWithIRVPBB(VPBB: BestVPlan.getScalarPreheader(),
7489 IRBB: State.CFG.PrevBB->getSingleSuccessor(), Plan: &BestVPlan);
7490 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7491
7492 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7493 "final VPlan is invalid");
7494
7495 // After vectorization, the exit blocks of the original loop will have
7496 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7497 // looked through single-entry phis.
7498 ScalarEvolution &SE = *PSE.getSE();
7499 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7500 if (!Exit->hasPredecessors())
7501 continue;
7502 for (VPRecipeBase &PhiR : Exit->phis())
7503 SE.forgetLcssaPhiWithNewPredecessor(L: OrigLoop,
7504 V: &cast<VPIRPhi>(Val&: PhiR).getIRPhi());
7505 }
7506 // Forget the original loop and block dispositions.
7507 SE.forgetLoop(L: OrigLoop);
7508 SE.forgetBlockAndLoopDispositions();
7509
7510 ILV.printDebugTracesAtStart();
7511
7512 //===------------------------------------------------===//
7513 //
7514 // Notice: any optimization or new instruction that goes
7515 // into the code below should also be implemented in
7516 // the cost-model.
7517 //
7518 //===------------------------------------------------===//
7519
7520 // Retrieve loop information before executing the plan, which may remove the
7521 // original loop if it becomes unreachable.
7522 MDNode *LID = OrigLoop->getLoopID();
7523 unsigned OrigLoopInvocationWeight = 0;
7524 std::optional<unsigned> OrigAverageTripCount =
7525 getLoopEstimatedTripCount(L: OrigLoop, EstimatedLoopInvocationWeight: &OrigLoopInvocationWeight);
7526
7527 BestVPlan.execute(State: &State);
7528
7529 // 2.6. Maintain Loop Hints
7530 // Keep all loop hints from the original loop on the vector loop (we'll
7531 // replace the vectorizer-specific hints below).
7532 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
7533 // Add metadata to disable runtime unrolling of the scalar loop when there
7534 // are no runtime checks about strides and memory. A scalar loop that is
7535 // rarely used is not worth unrolling.
7536 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7537 updateLoopMetadataAndProfileInfo(
7538 VectorLoop: HeaderVPBB ? LI->getLoopFor(BB: State.CFG.VPBB2IRBB.lookup(Val: HeaderVPBB))
7539 : nullptr,
7540 HeaderVPBB, Plan: BestVPlan, VectorizingEpilogue, OrigLoopID: LID, OrigAverageTripCount,
7541 OrigLoopInvocationWeight,
7542 EstimatedVFxUF: estimateElementCount(VF: BestVF * BestUF, VScale: CM.getVScaleForTuning()),
7543 DisableRuntimeUnroll);
7544
7545 // 3. Fix the vectorized code: take care of header phis, live-outs,
7546 // predication, and updating analyses.
7547 ILV.fixVectorizedLoop(State);
7548
7549 ILV.printDebugTracesAtEnd();
7550
7551 return ExpandedSCEVs;
7552}
7553
7554//===--------------------------------------------------------------------===//
7555// EpilogueVectorizerMainLoop
7556//===--------------------------------------------------------------------===//
7557
7558/// This function is partially responsible for generating the control flow
7559/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
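/// Roughly (a sketch only; see the linked documentation for the full CFG): it
/// first emits an "iter.check" block that can bypass vectorization when the
/// trip count is too small even for the epilogue VF, followed by a
/// "vector.main.loop.iter.check" block whose bypass branch initially targets
/// the scalar preheader and is retargeted later, when the epilogue loop is
/// vectorized.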
7560BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
7561 BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
7562 BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();
7563
7564 // Generate the code to check the minimum iteration count of the vector
7565 // epilogue (see below).
7566 EPI.EpilogueIterationCountCheck =
7567 emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: true);
7568 EPI.EpilogueIterationCountCheck->setName("iter.check");
7569
7570 VectorPH = cast<BranchInst>(Val: EPI.EpilogueIterationCountCheck->getTerminator())
7571 ->getSuccessor(i: 1);
7572 // Generate the iteration count check for the main loop, *after* the check
7573 // for the epilogue loop, so that the path-length is shorter for the case
7574 // that goes directly through the vector epilogue. The longer path length for
7575 // the main loop is compensated for by the gain from vectorizing the larger
7576 // trip count. Note: the branch will get updated later on when we vectorize
7577 // the epilogue.
7578 EPI.MainLoopIterationCountCheck =
7579 emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: false);
7580
7581 return cast<BranchInst>(Val: EPI.MainLoopIterationCountCheck->getTerminator())
7582 ->getSuccessor(i: 1);
7583}
7584
7585void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7586 LLVM_DEBUG({
7587 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7588 << "Main Loop VF:" << EPI.MainLoopVF
7589 << ", Main Loop UF:" << EPI.MainLoopUF
7590 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7591 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7592 });
7593}
7594
7595void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7596 DEBUG_WITH_TYPE(VerboseDebug, {
7597 dbgs() << "intermediate fn:\n"
7598 << *OrigLoop->getHeader()->getParent() << "\n";
7599 });
7600}
7601
7602BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
7603 BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
7604 assert(Bypass && "Expected valid bypass basic block.");
7605 Value *Count = getTripCount();
7606 MinProfitableTripCount = ElementCount::getFixed(MinVal: 0);
7607 Value *CheckMinIters = createIterationCountCheck(
7608 VectorPH, VF: ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
7609 UF: ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);
7610
7611 BasicBlock *const TCCheckBlock = VectorPH;
7612 if (!ForEpilogue)
7613 TCCheckBlock->setName("vector.main.loop.iter.check");
7614
7615 // Create new preheader for vector loop.
7616 VectorPH = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
7617 DT: static_cast<DominatorTree *>(nullptr), LI, MSSAU: nullptr,
7618 BBName: "vector.ph");
7619 if (ForEpilogue) {
7620 // Save the trip count so we don't have to regenerate it in the
7621 // vec.epilog.iter.check. This is safe to do because the trip count
7622 // generated here dominates the vector epilog iter check.
7623 EPI.TripCount = Count;
7624 } else {
7625 VectorPHVPBB = replaceVPBBWithIRVPBB(VPBB: VectorPHVPBB, IRBB: VectorPH);
7626 }
7627
7628 BranchInst &BI = *BranchInst::Create(IfTrue: Bypass, IfFalse: VectorPH, Cond: CheckMinIters);
7629 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
7630 setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
7631 ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
7632
7633 // When vectorizing the main loop, its trip-count check is placed in a new
7634 // block, whereas the overall trip-count check is placed in the VPlan entry
7635 // block. When vectorizing the epilogue loop, its trip-count check is placed
7636 // in the VPlan entry block.
7637 if (!ForEpilogue)
7638 introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock);
7639 return TCCheckBlock;
7640}
7641
7642//===--------------------------------------------------------------------===//
7643// EpilogueVectorizerEpilogueLoop
7644//===--------------------------------------------------------------------===//
7645
7646 /// This function creates a new scalar preheader, using the previous one as the
7647 /// entry block of the epilogue VPlan. The minimum iteration check is
7648/// represented in VPlan.
7649BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
7650 BasicBlock *NewScalarPH = createScalarPreheader(Prefix: "vec.epilog.");
7651 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
7652 OriginalScalarPH->setName("vec.epilog.iter.check");
7653 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: OriginalScalarPH);
7654 VPBasicBlock *OldEntry = Plan.getEntry();
7655 for (auto &R : make_early_inc_range(Range&: *OldEntry)) {
7656 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
7657 // definition.
7658 if (isa<VPIRInstruction>(Val: &R))
7659 continue;
7660 R.moveBefore(BB&: *NewEntry, I: NewEntry->end());
7661 }
7662
7663 VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
7664 Plan.setEntry(NewEntry);
7665 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
7666
7667 return OriginalScalarPH;
7668}
7669
7670void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7671 LLVM_DEBUG({
7672 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7673 << "Epilogue Loop VF:" << EPI.EpilogueVF
7674 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7675 });
7676}
7677
7678void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7679 DEBUG_WITH_TYPE(VerboseDebug, {
7680 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7681 });
7682}
7683
7684VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
7685 VFRange &Range) {
7686 assert((VPI->getOpcode() == Instruction::Load ||
7687 VPI->getOpcode() == Instruction::Store) &&
7688 "Must be called with either a load or store");
7689 Instruction *I = VPI->getUnderlyingInstr();
7690
7691 auto WillWiden = [&](ElementCount VF) -> bool {
7692 LoopVectorizationCostModel::InstWidening Decision =
7693 CM.getWideningDecision(I, VF);
7694 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7695 "CM decision should be taken at this point.");
7696 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7697 return true;
7698 if (CM.isScalarAfterVectorization(I, VF) ||
7699 CM.isProfitableToScalarize(I, VF))
7700 return false;
7701 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7702 };
7703
7704 if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
7705 return nullptr;
7706
7707 VPValue *Mask = nullptr;
7708 if (Legal->isMaskRequired(I))
7709 Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
7710
7711 // Determine if the pointer operand of the access is either consecutive or
7712 // reverse consecutive.
7713 LoopVectorizationCostModel::InstWidening Decision =
7714 CM.getWideningDecision(I, VF: Range.Start);
7715 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
7716 bool Consecutive =
7717 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
7718
7719 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(N: 0)
7720 : VPI->getOperand(N: 1);
7721 if (Consecutive) {
7722 auto *GEP = dyn_cast<GetElementPtrInst>(
7723 Val: Ptr->getUnderlyingValue()->stripPointerCasts());
7724 VPSingleDefRecipe *VectorPtr;
7725 if (Reverse) {
7726 // When folding the tail, we may compute an address that we would not compute
7727 // in the original scalar loop: drop the GEP no-wrap flags in this case.
7728 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
7729 // emit negative indices.
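// For example (sketch): a reverse access at VF = 4 addresses its lanes via a
// pointer computed with a negative offset, roughly
// 'getelementptr i32, ptr %p, i64 -3', so a no-unsigned-wrap guarantee proven
// for the forward address computation may no longer hold.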
7730 GEPNoWrapFlags Flags =
7731 CM.foldTailByMasking() || !GEP
7732 ? GEPNoWrapFlags::none()
7733 : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
7734 VectorPtr = new VPVectorEndPointerRecipe(
7735 Ptr, &Plan.getVF(), getLoadStoreType(I),
7736 /*Stride*/ -1, Flags, VPI->getDebugLoc());
7737 } else {
7738 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
7739 GEP ? GEP->getNoWrapFlags()
7740 : GEPNoWrapFlags::none(),
7741 VPI->getDebugLoc());
7742 }
7743 Builder.insert(R: VectorPtr);
7744 Ptr = VectorPtr;
7745 }
7746
7747 if (VPI->getOpcode() == Instruction::Load) {
7748 auto *Load = cast<LoadInst>(Val: I);
7749 auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7750 *VPI, Load->getDebugLoc());
7751 if (Reverse) {
7752 Builder.insert(R: LoadR);
7753 return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
7754 LoadR->getDebugLoc());
7755 }
7756 return LoadR;
7757 }
7758
7759 StoreInst *Store = cast<StoreInst>(Val: I);
7760 VPValue *StoredVal = VPI->getOperand(N: 0);
7761 if (Reverse)
7762 StoredVal = Builder.createNaryOp(Opcode: VPInstruction::Reverse, Operands: StoredVal,
7763 DL: Store->getDebugLoc());
7764 return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
7765 Reverse, *VPI, Store->getDebugLoc());
7766}
7767
7768VPWidenIntOrFpInductionRecipe *
7769VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
7770 VFRange &Range) {
7771 auto *I = cast<TruncInst>(Val: VPI->getUnderlyingInstr());
7772 // Optimize the special case where the source is a constant integer
7773 // induction variable. Notice that we can only optimize the 'trunc' case
7774 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7775 // (c) other casts depend on pointer size.
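// For example (sketch): given an i64 induction %iv and a user
// 'trunc i64 %iv to i32' whose truncation is optimizable, the widened
// induction is generated directly in i32 instead of widening in i64 and
// truncating every element.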
7776
7777 // Determine whether \p K is a truncation based on an induction variable that
7778 // can be optimized.
7779 auto IsOptimizableIVTruncate =
7780 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7781 return [=](ElementCount VF) -> bool {
7782 return CM.isOptimizableIVTruncate(I: K, VF);
7783 };
7784 };
7785
7786 if (!LoopVectorizationPlanner::getDecisionAndClampRange(
7787 Predicate: IsOptimizableIVTruncate(I), Range))
7788 return nullptr;
7789
7790 auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
7791 Val: VPI->getOperand(N: 0)->getDefiningRecipe());
7792 PHINode *Phi = WidenIV->getPHINode();
7793 VPIRValue *Start = WidenIV->getStartValue();
7794 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
7795
7796 // It is always safe to copy over the NoWrap and FastMath flags. In
7797 // particular, when folding the tail by masking, the masked-off lanes are
7798 // never used, so copying the flags is safe.
7799 VPIRFlags Flags = vputils::getFlagsFromIndDesc(ID: IndDesc);
7800 VPValue *Step =
7801 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());
7802 return new VPWidenIntOrFpInductionRecipe(
7803 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
7804}
7805
7806VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
7807 VFRange &Range) {
7808 CallInst *CI = cast<CallInst>(Val: VPI->getUnderlyingInstr());
7809 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
7810 Predicate: [this, CI](ElementCount VF) {
7811 return CM.isScalarWithPredication(I: CI, VF);
7812 },
7813 Range);
7814
7815 if (IsPredicated)
7816 return nullptr;
7817
7818 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7819 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
7820 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
7821 ID == Intrinsic::pseudoprobe ||
7822 ID == Intrinsic::experimental_noalias_scope_decl))
7823 return nullptr;
7824
7825 SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
7826 VPI->op_begin() + CI->arg_size());
7827
7828 // Is it beneficial to perform an intrinsic call rather than a library call?
7829 bool ShouldUseVectorIntrinsic =
7830 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
7831 Predicate: [&](ElementCount VF) -> bool {
7832 return CM.getCallWideningDecision(CI, VF).Kind ==
7833 LoopVectorizationCostModel::CM_IntrinsicCall;
7834 },
7835 Range);
7836 if (ShouldUseVectorIntrinsic)
7837 return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
7838 VPI->getDebugLoc());
7839
7840 Function *Variant = nullptr;
7841 std::optional<unsigned> MaskPos;
7842 // Is it better to call a vectorized version of the function than to scalarize
7843 // the call?
7844 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
7845 Predicate: [&](ElementCount VF) -> bool {
7846 // The following case may be scalarized depending on the VF.
7847 // The flag shows whether we can use a usual Call for the vectorized
7848 // version of the instruction.
7849
7850 // If we've found a variant at a previous VF, then stop looking. A
7851 // vectorized variant of a function expects input in a certain shape
7852 // -- basically the number of input registers, the number of lanes
7853 // per register, and whether there's a mask required.
7854 // We store a pointer to the variant in the VPWidenCallRecipe, so
7855 // once we have an appropriate variant it's only valid for that VF.
7856 // This will force a different vplan to be generated for each VF that
7857 // finds a valid variant.
7858 if (Variant)
7859 return false;
7860 LoopVectorizationCostModel::CallWideningDecision Decision =
7861 CM.getCallWideningDecision(CI, VF);
7862 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
7863 Variant = Decision.Variant;
7864 MaskPos = Decision.MaskPos;
7865 return true;
7866 }
7867
7868 return false;
7869 },
7870 Range);
7871 if (ShouldUseVectorCall) {
7872 if (MaskPos.has_value()) {
7873 // We have 2 cases that would require a mask:
7874 // 1) The block needs to be predicated, either due to a conditional
7875 // in the scalar loop or use of an active lane mask with
7876 // tail-folding, and we use the appropriate mask for the block.
7877 // 2) No mask is required for the block, but the only available
7878 // vector variant at this VF requires a mask, so we synthesize an
7879 // all-true mask.
7880 VPValue *Mask = Legal->isMaskRequired(I: CI)
7881 ? getBlockInMask(VPBB: Builder.getInsertBlock())
7882 : Plan.getTrue();
7883
7884 Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
7885 }
7886
7887 Ops.push_back(Elt: VPI->getOperand(N: VPI->getNumOperands() - 1));
7888 return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
7889 VPI->getDebugLoc());
7890 }
7891
7892 return nullptr;
7893}
7894
7895bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7896 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7897 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7898 // Instruction should be widened, unless it is scalar after vectorization,
7899 // scalarization is profitable or it is predicated.
7900 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7901 return CM.isScalarAfterVectorization(I, VF) ||
7902 CM.isProfitableToScalarize(I, VF) ||
7903 CM.isScalarWithPredication(I, VF);
7904 };
7905 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7906 Range);
7907}
7908
7909VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
7910 auto *I = VPI->getUnderlyingInstr();
7911 switch (VPI->getOpcode()) {
7912 default:
7913 return nullptr;
7914 case Instruction::SDiv:
7915 case Instruction::UDiv:
7916 case Instruction::SRem:
7917 case Instruction::URem: {
7918 // If not provably safe, use a select to form a safe divisor before widening the
7919 // div/rem operation itself. Otherwise fall through to general handling below.
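    // For example, a predicated 'udiv %x, %y' is widened as
    // 'udiv %x, (select %mask, %y, 1)', so masked-off lanes divide by 1 rather
    // than by a divisor that may be zero or otherwise trap.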
7920 if (CM.isPredicatedInst(I)) {
7921 SmallVector<VPValue *> Ops(VPI->operands());
7922 VPValue *Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
7923 VPValue *One = Plan.getConstantInt(Ty: I->getType(), Val: 1u);
7924 auto *SafeRHS =
7925 Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: VPI->getDebugLoc());
7926 Ops[1] = SafeRHS;
7927 return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
7928 }
7929 [[fallthrough]];
7930 }
7931 case Instruction::Add:
7932 case Instruction::And:
7933 case Instruction::AShr:
7934 case Instruction::FAdd:
7935 case Instruction::FCmp:
7936 case Instruction::FDiv:
7937 case Instruction::FMul:
7938 case Instruction::FNeg:
7939 case Instruction::FRem:
7940 case Instruction::FSub:
7941 case Instruction::ICmp:
7942 case Instruction::LShr:
7943 case Instruction::Mul:
7944 case Instruction::Or:
7945 case Instruction::Select:
7946 case Instruction::Shl:
7947 case Instruction::Sub:
7948 case Instruction::Xor:
7949 case Instruction::Freeze:
7950 return new VPWidenRecipe(*I, VPI->operands(), *VPI, *VPI,
7951 VPI->getDebugLoc());
7952 case Instruction::ExtractValue: {
7953 SmallVector<VPValue *> NewOps(VPI->operands());
7954 auto *EVI = cast<ExtractValueInst>(Val: I);
7955 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7956 unsigned Idx = EVI->getIndices()[0];
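    // Append the extract index as an extra i32 constant operand so the widened
    // recipe knows which member of the aggregate result to extract.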
7957 NewOps.push_back(Elt: Plan.getConstantInt(BitWidth: 32, Val: Idx));
7958 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
7959 }
7960 };
7961}
7962
7963VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7964 VPInstruction *VPI) {
7965 // FIXME: Support other operations.
7966 unsigned Opcode = HI->Update->getOpcode();
7967 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7968 "Histogram update operation must be an Add or Sub");
7969
7970 SmallVector<VPValue *, 3> HGramOps;
7971 // Bucket address.
7972 HGramOps.push_back(Elt: VPI->getOperand(N: 1));
7973 // Increment value.
7974 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
7975
7976 // In case of predicated execution (due to tail-folding, or conditional
7977 // execution, or both), pass the relevant mask.
7978 if (Legal->isMaskRequired(I: HI->Store))
7979 HGramOps.push_back(Elt: getBlockInMask(VPBB: Builder.getInsertBlock()));
7980
7981 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7982}
7983
7984VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
7985 VFRange &Range) {
7986 auto *I = VPI->getUnderlyingInstr();
7987 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
7988 Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
7989 Range);
7990
7991 bool IsPredicated = CM.isPredicatedInst(I);
7992
7993 // Even if the instruction is not marked as uniform, there are certain
7994 // intrinsic calls that can be effectively treated as such, so we check for
7995 // them here. Conservatively, we only do this for scalable vectors, since
7996 // for fixed-width VFs we can always fall back on full scalarization.
7997 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
7998 switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
7999 case Intrinsic::assume:
8000 case Intrinsic::lifetime_start:
8001 case Intrinsic::lifetime_end:
8002      // For scalable vectors, if one of the operands is variant then we still
8003      // want to mark the call as uniform, which will generate one instruction for
8004 // the first lane of the vector. We can't scalarize the call in the same
8005 // way as for fixed-width vectors because we don't know how many lanes
8006 // there are.
8007 //
8008 // The reasons for doing it this way for scalable vectors are:
8009      // 1. For the assume intrinsic, generating the instruction for the first
8010      //    lane is still better than not generating any at all. For
8011      //    example, the input may be a splat across all lanes.
8012 // 2. For the lifetime start/end intrinsics the pointer operand only
8013 // does anything useful when the input comes from a stack object,
8014 // which suggests it should always be uniform. For non-stack objects
8015 // the effect is to poison the object, which still allows us to
8016 // remove the call.
8017 IsUniform = true;
8018 break;
8019 default:
8020 break;
8021 }
8022 }
8023 VPValue *BlockInMask = nullptr;
8024 if (!IsPredicated) {
8025 // Finalize the recipe for Instr, first if it is not predicated.
8026 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8027 } else {
8028 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8029 // Instructions marked for predication are replicated and a mask operand is
8030 // added initially. Masked replicate recipes will later be placed under an
8031 // if-then construct to prevent side-effects. Generate recipes to compute
8032 // the block mask for this region.
8033 BlockInMask = getBlockInMask(VPBB: Builder.getInsertBlock());
8034 }
8035
8036 // Note that there is some custom logic to mark some intrinsics as uniform
8037 // manually above for scalable vectors, which this assert needs to account for
8038 // as well.
8039 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8040 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8041 "Should not predicate a uniform recipe");
8042 auto *Recipe =
8043 new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI,
8044 *VPI, VPI->getDebugLoc());
8045 return Recipe;
8046}
8047
8048VPRecipeBase *
8049VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
8050 VFRange &Range) {
8051 assert(!R->isPhi() && "phis must be handled earlier");
8052 // First, check for specific widening recipes that deal with optimizing
8053 // truncates, calls and memory operations.
8054
8055 VPRecipeBase *Recipe;
8056 auto *VPI = cast<VPInstruction>(Val: R);
8057 if (VPI->getOpcode() == Instruction::Trunc &&
8058 (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
8059 return Recipe;
8060
8061 // All widen recipes below deal only with VF > 1.
8062 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8063 Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
8064 return nullptr;
8065
8066 if (VPI->getOpcode() == Instruction::Call)
8067 return tryToWidenCall(VPI, Range);
8068
8069 Instruction *Instr = R->getUnderlyingInstr();
8070 if (VPI->getOpcode() == Instruction::Store)
8071 if (auto HistInfo = Legal->getHistogramInfo(I: cast<StoreInst>(Val: Instr)))
8072 return tryToWidenHistogram(HI: *HistInfo, VPI);
8073
8074 if (VPI->getOpcode() == Instruction::Load ||
8075 VPI->getOpcode() == Instruction::Store)
8076 return tryToWidenMemory(VPI, Range);
8077
8078 if (!shouldWiden(I: Instr, Range))
8079 return nullptr;
8080
8081 if (VPI->getOpcode() == Instruction::GetElementPtr)
8082 return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Val: Instr), R->operands(),
8083 *VPI, VPI->getDebugLoc());
8084
8085 if (Instruction::isCast(Opcode: VPI->getOpcode())) {
8086 auto *CI = cast<CastInst>(Val: Instr);
8087 auto *CastR = cast<VPInstructionWithType>(Val: VPI);
8088 return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(N: 0),
8089 CastR->getResultType(), CI, *VPI, *VPI,
8090 VPI->getDebugLoc());
8091 }
8092
8093 return tryToWiden(VPI);
8094}
8095
8096void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8097 ElementCount MaxVF) {
8098 if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
8099 return;
8100
8101 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8102
8103 const LoopAccessInfo *LAI = Legal->getLAI();
8104 LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
8105 OrigLoop, LI, DT, PSE.getSE());
8106 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
8107 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
8108 // Only use noalias metadata when using memory checks guaranteeing no
8109 // overlap across all iterations.
8110 LVer.prepareNoAliasMetadata();
8111 }
8112
8113  // Create the initial base VPlan0, to serve as a common starting point for
8114  // all candidates built later for specific VF ranges.
8115 auto VPlan0 = VPlanTransforms::buildVPlan0(
8116 TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
8117 IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE, LVer: &LVer);
8118
8119 // Create recipes for header phis.
8120 VPlanTransforms::createHeaderPhiRecipes(
8121 Plan&: *VPlan0, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
8122 Reductions: Legal->getReductionVars(), FixedOrderRecurrences: Legal->getFixedOrderRecurrences(),
8123 InLoopReductions: CM.getInLoopReductions(), AllowReordering: Hints.allowReordering());
8124
8125 auto MaxVFTimes2 = MaxVF * 2;
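  // Iterate over sub-ranges [VF, MaxVFTimes2); the exclusive upper bound of
  // MaxVF * 2 ensures the final sub-range still includes MaxVF itself. Each
  // call below may clamp SubRange.End, so a single VPlan can cover several VFs.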
8126 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
8127 VFRange SubRange = {VF, MaxVFTimes2};
8128 if (auto Plan = tryToBuildVPlanWithVPRecipes(
8129 InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
8130 // Now optimize the initial VPlan.
8131 VPlanTransforms::hoistPredicatedLoads(Plan&: *Plan, PSE, L: OrigLoop);
8132 VPlanTransforms::sinkPredicatedStores(Plan&: *Plan, PSE, L: OrigLoop);
8133 RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
8134 CM.getMinimalBitwidths());
8135 RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
8136 // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
8137 if (CM.foldTailWithEVL()) {
8138 RUN_VPLAN_PASS(VPlanTransforms::addExplicitVectorLength, *Plan,
8139 CM.getMaxSafeElements());
8140 RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
8141 }
8142 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8143 VPlans.push_back(Elt: std::move(Plan));
8144 }
8145 VF = SubRange.End;
8146 }
8147}
8148
8149VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8150 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8151
8152 using namespace llvm::VPlanPatternMatch;
8153 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8154
8155 // ---------------------------------------------------------------------------
8156 // Build initial VPlan: Scan the body of the loop in a topological order to
8157 // visit each basic block after having visited its predecessor basic blocks.
8158 // ---------------------------------------------------------------------------
8159
8160 bool RequiresScalarEpilogueCheck =
8161 LoopVectorizationPlanner::getDecisionAndClampRange(
8162 Predicate: [this](ElementCount VF) {
8163 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
8164 },
8165 Range);
8166 VPlanTransforms::handleEarlyExits(Plan&: *Plan, HasUncountableExit: Legal->hasUncountableEarlyExit());
8167 VPlanTransforms::addMiddleCheck(Plan&: *Plan, RequiresScalarEpilogueCheck,
8168 TailFolded: CM.foldTailByMasking());
8169
8170 VPlanTransforms::createLoopRegions(Plan&: *Plan);
8171
8172  // Don't use getDecisionAndClampRange here, because we don't know the UF
8173  // yet, so it is better to be conservative here rather than to split the
8174  // range up into different VPlans.
8175 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8176 bool IVUpdateMayOverflow = false;
8177 for (ElementCount VF : Range)
8178 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8179
8180 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8181 // Use NUW for the induction increment if we proved that it won't overflow in
8182  // the vector loop or when not folding the tail. In the latter case, we know
8183 // that the canonical induction increment will not overflow as the vector trip
8184 // count is >= increment and a multiple of the increment.
8185 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8186 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
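  // If NUW cannot be used, strip poison-generating flags from the canonical IV
  // increment so that a wrapping increment (e.g. in the final iteration when
  // folding the tail) does not produce poison.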
8187 if (!HasNUW) {
8188 auto *IVInc =
8189 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(N: 0);
8190 assert(match(IVInc,
8191 m_VPInstruction<Instruction::Add>(
8192 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8193 "Did not find the canonical IV increment");
8194 cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
8195 }
8196
8197 // ---------------------------------------------------------------------------
8198 // Pre-construction: record ingredients whose recipes we'll need to further
8199 // process after constructing the initial VPlan.
8200 // ---------------------------------------------------------------------------
8201
8202 // For each interleave group which is relevant for this (possibly trimmed)
8203 // Range, add it to the set of groups to be later applied to the VPlan and add
8204 // placeholders for its members' Recipes which we'll be replacing with a
8205 // single VPInterleaveRecipe.
8206 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8207 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8208 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8209 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8210 LoopVectorizationCostModel::CM_Interleave);
8211 // For scalable vectors, the interleave factors must be <= 8 since we
8212 // require the (de)interleaveN intrinsics instead of shufflevectors.
8213 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8214 "Unsupported interleave factor for scalable vectors");
8215 return Result;
8216 };
8217 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
8218 continue;
8219 InterleaveGroups.insert(Ptr: IG);
8220 }
8221
8222 // ---------------------------------------------------------------------------
8223 // Predicate and linearize the top-level loop region.
8224 // ---------------------------------------------------------------------------
8225 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8226 Plan&: *Plan, FoldTail: CM.foldTailByMasking());
8227
8228 // ---------------------------------------------------------------------------
8229 // Construct wide recipes and apply predication for original scalar
8230 // VPInstructions in the loop.
8231 // ---------------------------------------------------------------------------
8232 VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder, BlockMaskCache);
8233
8234 // Scan the body of the loop in a topological order to visit each basic block
8235 // after having visited its predecessor basic blocks.
8236 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8237 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8238 HeaderVPBB);
8239
8240 auto *MiddleVPBB = Plan->getMiddleBlock();
8241 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8242 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8243 // temporarily to update created block masks.
8244 DenseMap<VPValue *, VPValue *> Old2New;
8245
8246 // Collect blocks that need predication for in-loop reduction recipes.
8247 DenseSet<BasicBlock *> BlocksNeedingPredication;
8248 for (BasicBlock *BB : OrigLoop->blocks())
8249 if (CM.blockNeedsPredicationForAnyReason(BB))
8250 BlocksNeedingPredication.insert(V: BB);
8251
8252 VPlanTransforms::createInLoopReductionRecipes(
8253 Plan&: *Plan, BlockMaskCache, BlocksNeedingPredication, MinVF: Range.Start);
8254
8255 // Now process all other blocks and instructions.
8256 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
8257 // Convert input VPInstructions to widened recipes.
8258 for (VPRecipeBase &R : make_early_inc_range(
8259 Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end()))) {
8260 // Skip recipes that do not need transforming.
8261 if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(Val: &R))
8262 continue;
8263 auto *VPI = cast<VPInstruction>(Val: &R);
8264 if (!VPI->getUnderlyingValue())
8265 continue;
8266
8267 // TODO: Gradually replace uses of underlying instruction by analyses on
8268 // VPlan. Migrate code relying on the underlying instruction from VPlan0
8269 // to construct recipes below to not use the underlying instruction.
8270 Instruction *Instr = cast<Instruction>(Val: VPI->getUnderlyingValue());
8271 Builder.setInsertPoint(VPI);
8272
8273 // The stores with invariant address inside the loop will be deleted, and
8274 // in the exit block, a uniform store recipe will be created for the final
8275 // invariant store of the reduction.
8276 StoreInst *SI;
8277 if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
8278 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
8279 // Only create recipe for the final invariant store of the reduction.
8280 if (Legal->isInvariantStoreOfReduction(SI)) {
8281 auto *Recipe = new VPReplicateRecipe(
8282 SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, *VPI,
8283 *VPI, VPI->getDebugLoc());
8284 Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
8285 }
8286 R.eraseFromParent();
8287 continue;
8288 }
8289
8290 VPRecipeBase *Recipe =
8291 RecipeBuilder.tryToCreateWidenNonPhiRecipe(R: VPI, Range);
8292 if (!Recipe)
8293 Recipe =
8294 RecipeBuilder.handleReplication(VPI: cast<VPInstruction>(Val: VPI), Range);
8295
8296 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8297 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
8298 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8299 // moved to the phi section in the header.
8300 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8301 } else {
8302 Builder.insert(R: Recipe);
8303 }
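      // Hook the new recipe's result into existing users and remember the
      // old-to-new mapping so block masks that still refer to the original
      // VPInstruction can be updated afterwards. Recipes without a result
      // (e.g. stores) simply replace the original VPInstruction, which is erased.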
8304 if (Recipe->getNumDefinedValues() == 1) {
8305 VPI->replaceAllUsesWith(New: Recipe->getVPSingleValue());
8306 Old2New[VPI] = Recipe->getVPSingleValue();
8307 } else {
8308 assert(Recipe->getNumDefinedValues() == 0 &&
8309 "Unexpected multidef recipe");
8310 R.eraseFromParent();
8311 }
8312 }
8313 }
8314
8315 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8316 // TODO: Include the masks as operands in the predicated VPlan directly
8317 // to remove the need to keep a map of masks beyond the predication
8318 // transform.
8319 RecipeBuilder.updateBlockMaskCache(Old2New);
8320 for (VPValue *Old : Old2New.keys())
8321 Old->getDefiningRecipe()->eraseFromParent();
8322
8323 assert(isa<VPRegionBlock>(LoopRegion) &&
8324 !LoopRegion->getEntryBasicBlock()->empty() &&
8325 "entry block must be set to a VPRegionBlock having a non-empty entry "
8326 "VPBasicBlock");
8327
8328 // TODO: We can't call runPass on these transforms yet, due to verifier
8329 // failures.
8330 VPlanTransforms::addExitUsersForFirstOrderRecurrences(Plan&: *Plan, Range);
8331 DenseMap<VPValue *, VPValue *> IVEndValues;
8332 VPlanTransforms::updateScalarResumePhis(Plan&: *Plan, IVEndValues);
8333
8334 // ---------------------------------------------------------------------------
8335 // Transform initial VPlan: Apply previously taken decisions, in order, to
8336 // bring the VPlan to its final state.
8337 // ---------------------------------------------------------------------------
8338
8339 addReductionResultComputation(Plan, RecipeBuilder, MinVF: Range.Start);
8340
8341 // Apply mandatory transformation to handle reductions with multiple in-loop
8342 // uses if possible, bail out otherwise.
8343 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMultiUseReductions, *Plan))
8344 return nullptr;
8345 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8346 // NaNs if possible, bail out otherwise.
8347 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMaxMinNumReductions, *Plan))
8348 return nullptr;
8349
8350 // Create whole-vector selects for find-last recurrences.
8351 if (!RUN_VPLAN_PASS(VPlanTransforms::handleFindLastReductions, *Plan))
8352 return nullptr;
8353
8354 // Create partial reduction recipes for scaled reductions and transform
8355 // recipes to abstract recipes if it is legal and beneficial and clamp the
8356 // range for better cost estimation.
8357 // TODO: Enable following transform when the EVL-version of extended-reduction
8358 // and mulacc-reduction are implemented.
8359 if (!CM.foldTailWithEVL()) {
8360 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
8361 OrigLoop);
8362 RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
8363 Range);
8364 RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
8365 Range);
8366 }
8367
8368 for (ElementCount VF : Range)
8369 Plan->addVF(VF);
8370 Plan->setName("Initial VPlan");
8371
8372 // Interleave memory: for each Interleave Group we marked earlier as relevant
8373 // for this VPlan, replace the Recipes widening its memory instructions with a
8374 // single VPInterleaveRecipe at its insertion point.
8375 RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
8376 InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
8377
8378 // Replace VPValues for known constant strides.
8379 RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
8380 Legal->getLAI()->getSymbolicStrides());
8381
8382 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8383 return Legal->blockNeedsPredication(BB);
8384 };
8385 RUN_VPLAN_PASS(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
8386 BlockNeedsPredication);
8387
8388 // Sink users of fixed-order recurrence past the recipe defining the previous
8389 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8390 if (!RUN_VPLAN_PASS(VPlanTransforms::adjustFixedOrderRecurrences, *Plan,
8391 Builder))
8392 return nullptr;
8393
8394 if (useActiveLaneMask(Style)) {
8395 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8396 // TailFoldingStyle is visible there.
8397 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8398 bool WithoutRuntimeCheck =
8399 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8400 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow,
8401 DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck);
8402 }
8403 VPlanTransforms::optimizeInductionExitUsers(Plan&: *Plan, EndValues&: IVEndValues, PSE);
8404
8405 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8406 return Plan;
8407}
8408
8409VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
8410  // Outer loop handling: outer loops may require CFG and instruction-level
8411  // transformations before even evaluating whether vectorization is profitable.
8412 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8413 // the vectorization pipeline.
8414 assert(!OrigLoop->isInnermost());
8415 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8416
8417 auto Plan = VPlanTransforms::buildVPlan0(
8418 TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
8419 IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE);
8420
8421 VPlanTransforms::createHeaderPhiRecipes(
8422 Plan&: *Plan, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
8423 Reductions: MapVector<PHINode *, RecurrenceDescriptor>(),
8424 FixedOrderRecurrences: SmallPtrSet<const PHINode *, 1>(), InLoopReductions: SmallPtrSet<PHINode *, 1>(),
8425 /*AllowReordering=*/false);
8426 VPlanTransforms::handleEarlyExits(Plan&: *Plan,
8427 /*HasUncountableExit*/ false);
8428 VPlanTransforms::addMiddleCheck(Plan&: *Plan, /*RequiresScalarEpilogue*/ RequiresScalarEpilogueCheck: true,
8429 /*TailFolded*/ false);
8430
8431 VPlanTransforms::createLoopRegions(Plan&: *Plan);
8432
8433 for (ElementCount VF : Range)
8434 Plan->addVF(VF);
8435
8436 if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(Plan&: *Plan, TLI: *TLI))
8437 return nullptr;
8438
8439 // TODO: IVEndValues are not used yet in the native path, to optimize exit
8440 // values.
8441 // TODO: We can't call runPass on the transform yet, due to verifier
8442 // failures.
8443 DenseMap<VPValue *, VPValue *> IVEndValues;
8444 VPlanTransforms::updateScalarResumePhis(Plan&: *Plan, IVEndValues);
8445
8446 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8447 return Plan;
8448}
8449
8450void LoopVectorizationPlanner::addReductionResultComputation(
8451 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8452 using namespace VPlanPatternMatch;
8453 VPTypeAnalysis TypeInfo(*Plan);
8454 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8455 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8456 SmallVector<VPRecipeBase *> ToDelete;
8457 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8458 Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
8459 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8460 for (VPRecipeBase &R :
8461 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8462 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
8463 if (!PhiR)
8464 continue;
8465
8466 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
8467 PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
8468 Type *PhiTy = TypeInfo.inferScalarType(V: PhiR);
8469 // If tail is folded by masking, introduce selects between the phi
8470 // and the users outside the vector region of each reduction, at the
8471 // beginning of the dedicated latch block.
8472 auto *OrigExitingVPV = PhiR->getBackedgeValue();
8473 auto *NewExitingVPV = PhiR->getBackedgeValue();
8474    // Don't output selects for partial reductions because they have an output
8475    // with fewer lanes than the VF, so the operands of the select would have
8476    // different numbers of lanes. Partial reductions mask the input instead.
8477 auto *RR = dyn_cast<VPReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe());
8478 if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
8479 (!RR || !RR->isPartialReduction())) {
8480 VPValue *Cond = RecipeBuilder.getBlockInMask(VPBB: PhiR->getParent());
8481 std::optional<FastMathFlags> FMFs =
8482 PhiTy->isFloatingPointTy()
8483 ? std::make_optional(t: RdxDesc.getFastMathFlags())
8484 : std::nullopt;
8485 NewExitingVPV =
8486 Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", FMFs);
8487 OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
8488 using namespace VPlanPatternMatch;
8489 return match(
8490 U: &U, P: m_CombineOr(
8491 L: m_VPInstruction<VPInstruction::ComputeAnyOfResult>(),
8492 R: m_VPInstruction<VPInstruction::ComputeReductionResult>()));
8493 });
8494 if (CM.usePredicatedReductionSelect())
8495 PhiR->setOperand(I: 1, New: NewExitingVPV);
8496 }
8497
8498 // We want code in the middle block to appear to execute on the location of
8499 // the scalar loop's latch terminator because: (a) it is all compiler
8500 // generated, (b) these instructions are always executed after evaluating
8501 // the latch conditional branch, and (c) other passes may add new
8502 // predecessors which terminate on this line. This is the easiest way to
8503 // ensure we don't accidentally cause an extra step back into the loop while
8504 // debugging.
8505 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
8506
8507 // TODO: At the moment ComputeReductionResult also drives creation of the
8508 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
8509 // even for in-loop reductions, until the reduction resume value handling is
8510 // also modeled in VPlan.
8511 VPInstruction *FinalReductionResult;
8512 VPBuilder::InsertPointGuard Guard(Builder);
8513 Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
8514 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
8515 // For AnyOf reductions, find the select among PhiR's users. This is used
8516 // both to find NewVal for ComputeAnyOfResult and to adjust the reduction.
8517 VPRecipeBase *AnyOfSelect = nullptr;
8518 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
8519 AnyOfSelect = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
8520 return match(U, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()));
8521 }));
8522 }
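    // FindFirstIV/FindLastIV reductions reduce the vector of candidate indices
    // using the min/max matching their signedness, then compare the reduced
    // value against the sentinel: if no lane updated the recurrence, the result
    // still equals the sentinel and the original start value is selected instead.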
8523 if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind)) {
8524 VPValue *Start = PhiR->getStartValue();
8525 VPValue *Sentinel = Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue());
8526 RecurKind MinMaxKind;
8527 bool IsSigned =
8528 RecurrenceDescriptor::isSignedRecurrenceKind(Kind: RecurrenceKind);
8529 if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(Kind: RecurrenceKind))
8530 MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
8531 else
8532 MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
8533 VPIRFlags Flags(MinMaxKind, /*IsOrdered=*/false, /*IsInLoop=*/false,
8534 FastMathFlags());
8535 auto *ReducedIV =
8536 Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
8537 Operands: {NewExitingVPV}, Flags, DL: ExitDL);
8538 auto *Cmp =
8539 Builder.createICmp(Pred: CmpInst::ICMP_NE, A: ReducedIV, B: Sentinel, DL: ExitDL);
8540 FinalReductionResult = cast<VPInstruction>(
8541 Val: Builder.createSelect(Cond: Cmp, TrueVal: ReducedIV, FalseVal: Start, DL: ExitDL));
8542 } else if (AnyOfSelect) {
8543 VPValue *Start = PhiR->getStartValue();
8544 // NewVal is the non-phi operand of the select.
8545 VPValue *NewVal = AnyOfSelect->getOperand(N: 1) == PhiR
8546 ? AnyOfSelect->getOperand(N: 2)
8547 : AnyOfSelect->getOperand(N: 1);
8548 FinalReductionResult =
8549 Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
8550 Operands: {Start, NewVal, NewExitingVPV}, DL: ExitDL);
8551 } else {
8552 FastMathFlags FMFs =
8553 RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind: RecurrenceKind)
8554 ? RdxDesc.getFastMathFlags()
8555 : FastMathFlags();
8556 VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
8557 FMFs);
8558 FinalReductionResult =
8559 Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
8560 Operands: {NewExitingVPV}, Flags, DL: ExitDL);
8561 }
8562 // If the vector reduction can be performed in a smaller type, we truncate
8563 // then extend the loop exit value to enable InstCombine to evaluate the
8564 // entire expression in the smaller type.
8565 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
8566 !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
8567 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
8568 assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
8569 "Unexpected truncated min-max recurrence!");
8570 Type *RdxTy = RdxDesc.getRecurrenceType();
8571 VPWidenCastRecipe *Trunc;
8572 Instruction::CastOps ExtendOpc =
8573 RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
8574 VPWidenCastRecipe *Extnd;
8575 {
8576 VPBuilder::InsertPointGuard Guard(Builder);
8577 Builder.setInsertPoint(
8578 TheBB: NewExitingVPV->getDefiningRecipe()->getParent(),
8579 IP: std::next(x: NewExitingVPV->getDefiningRecipe()->getIterator()));
8580 Trunc =
8581 Builder.createWidenCast(Opcode: Instruction::Trunc, Op: NewExitingVPV, ResultTy: RdxTy);
8582 Extnd = Builder.createWidenCast(Opcode: ExtendOpc, Op: Trunc, ResultTy: PhiTy);
8583 }
8584 if (PhiR->getOperand(N: 1) == NewExitingVPV)
8585 PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());
8586
8587 // Update ComputeReductionResult with the truncated exiting value and
8588 // extend its result. Operand 0 provides the values to be reduced.
8589 FinalReductionResult->setOperand(I: 0, New: Trunc);
8590 FinalReductionResult =
8591 Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
8592 }
8593
8594 // Update all users outside the vector region. Also replace redundant
8595 // extracts.
8596 for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
8597 auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
8598 if (FinalReductionResult == U || Parent->getParent())
8599 continue;
8600 // Skip FindIV reduction chain recipes (ComputeReductionResult, icmp).
8601 if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind) &&
8602 match(U, P: m_CombineOr(
8603 L: m_VPInstruction<VPInstruction::ComputeReductionResult>(),
8604 R: m_VPInstruction<Instruction::ICmp>())))
8605 continue;
8606 U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);
8607
8608 // Look through ExtractLastPart.
8609 if (match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
8610 U = cast<VPInstruction>(Val: U)->getSingleUser();
8611
8612 if (match(U, P: m_CombineOr(L: m_ExtractLane(Op0: m_VPValue(), Op1: m_VPValue()),
8613 R: m_ExtractLastLane(Op0: m_VPValue()))))
8614 cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
8615 }
8616
8617 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8618 // with a boolean reduction phi node to check if the condition is true in
8619 // any iteration. The final value is selected by the final
8620 // ComputeReductionResult.
8621 if (AnyOfSelect) {
8622 VPValue *Cmp = AnyOfSelect->getOperand(N: 0);
8623 // If the compare is checking the reduction PHI node, adjust it to check
8624 // the start value.
8625 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
8626 CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
8627 Builder.setInsertPoint(AnyOfSelect);
8628
8629 // If the true value of the select is the reduction phi, the new value is
8630 // selected if the negated condition is true in any iteration.
8631 if (AnyOfSelect->getOperand(N: 1) == PhiR)
8632 Cmp = Builder.createNot(Operand: Cmp);
8633 VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
8634 AnyOfSelect->getVPSingleValue()->replaceAllUsesWith(New: Or);
8635 // Delete AnyOfSelect now that it has invalid types.
8636 ToDelete.push_back(Elt: AnyOfSelect);
8637
8638 // Convert the reduction phi to operate on bools.
8639 PhiR->setOperand(I: 0, New: Plan->getFalse());
8640 continue;
8641 }
8642
8643 if (RecurrenceDescriptor::isFindIVRecurrenceKind(
8644 Kind: RdxDesc.getRecurrenceKind())) {
8645 // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
8646 // the sentinel value after generating the ResumePhi recipe, which uses
8647 // the original start value.
8648 PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue()));
8649 }
8650 RecurKind RK = RdxDesc.getRecurrenceKind();
8651 if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
8652 !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
8653 !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) &&
8654 !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RK))) {
8655 VPBuilder PHBuilder(Plan->getVectorPreheader());
8656 VPValue *Iden = Plan->getOrAddLiveIn(
8657 V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: RdxDesc.getFastMathFlags()));
8658 auto *ScaleFactorVPV = Plan->getConstantInt(BitWidth: 32, Val: 1);
8659 VPValue *StartV = PHBuilder.createNaryOp(
8660 Opcode: VPInstruction::ReductionStartVector,
8661 Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV},
8662 Flags: PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
8663 : FastMathFlags());
8664 PhiR->setOperand(I: 0, New: StartV);
8665 }
8666 }
8667 for (VPRecipeBase *R : ToDelete)
8668 R->eraseFromParent();
8669
8670 RUN_VPLAN_PASS(VPlanTransforms::clearReductionWrapFlags, *Plan);
8671}
8672
8673void LoopVectorizationPlanner::attachRuntimeChecks(
8674 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8675 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8676 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(N: 0)) {
8677 assert((!CM.OptForSize ||
8678 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8679 "Cannot SCEV check stride or overflow when optimizing for size");
8680 VPlanTransforms::attachCheckBlock(Plan, Cond: SCEVCheckCond, CheckBlock: SCEVCheckBlock,
8681 AddBranchWeights: HasBranchWeights);
8682 }
8683 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8684 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(N: 0)) {
8685 // VPlan-native path does not do any analysis for runtime checks
8686 // currently.
8687 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8688 "Runtime checks are not supported for outer loops yet");
8689
8690 if (CM.OptForSize) {
8691 assert(
8692 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8693 "Cannot emit memory checks when optimizing for size, unless forced "
8694 "to vectorize.");
8695 ORE->emit(RemarkBuilder: [&]() {
8696 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8697 OrigLoop->getStartLoc(),
8698 OrigLoop->getHeader())
8699 << "Code-size may be reduced by not forcing "
8700 "vectorization, or by source-code modifications "
8701 "eliminating the need for runtime checks "
8702 "(e.g., adding 'restrict').";
8703 });
8704 }
8705 VPlanTransforms::attachCheckBlock(Plan, Cond: MemCheckCond, CheckBlock: MemCheckBlock,
8706 AddBranchWeights: HasBranchWeights);
8707 }
8708}
8709
8710void LoopVectorizationPlanner::addMinimumIterationCheck(
8711 VPlan &Plan, ElementCount VF, unsigned UF,
8712 ElementCount MinProfitableTripCount) const {
8713 // vscale is not necessarily a power-of-2, which means we cannot guarantee
8714 // an overflow to zero when updating induction variables and so an
8715 // additional overflow check is required before entering the vector loop.
8716 bool IsIndvarOverflowCheckNeededForVF =
8717 VF.isScalable() && !TTI.isVScaleKnownToBeAPowerOfTwo() &&
8718 !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF, UF) &&
8719 CM.getTailFoldingStyle() !=
8720 TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8721  const uint32_t *BranchWeights =
8722 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())
8723 ? &MinItersBypassWeights[0]
8724 : nullptr;
8725 VPlanTransforms::addMinimumIterationCheck(
8726 Plan, VF, UF, MinProfitableTripCount,
8727 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()), TailFolded: CM.foldTailByMasking(),
8728      CheckNeededWithTailFolding: IsIndvarOverflowCheckNeededForVF, OrigLoop, MinItersBypassWeights: BranchWeights,
8729 DL: OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
8730}
8731
8732// Determine how to lower the scalar epilogue, which depends on 1) optimising
8733// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8734// predication, and 4) a TTI hook that analyses whether the loop is suitable
8735// for predication.
8736static ScalarEpilogueLowering getScalarEpilogueLowering(
8737 Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
8738 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8739 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
8740 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8741 // don't look at hints or options, and don't request a scalar epilogue.
8742 if (F->hasOptSize() ||
8743 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8744 return CM_ScalarEpilogueNotAllowedOptSize;
8745
8746 // 2) If set, obey the directives
8747 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8748 switch (PreferPredicateOverEpilogue) {
8749 case PreferPredicateTy::ScalarEpilogue:
8750 return CM_ScalarEpilogueAllowed;
8751 case PreferPredicateTy::PredicateElseScalarEpilogue:
8752 return CM_ScalarEpilogueNotNeededUsePredicate;
8753 case PreferPredicateTy::PredicateOrDontVectorize:
8754 return CM_ScalarEpilogueNotAllowedUsePredicate;
8755 };
8756 }
8757
8758 // 3) If set, obey the hints
8759 switch (Hints.getPredicate()) {
8760 case LoopVectorizeHints::FK_Enabled:
8761 return CM_ScalarEpilogueNotNeededUsePredicate;
8762 case LoopVectorizeHints::FK_Disabled:
8763 return CM_ScalarEpilogueAllowed;
8764 };
8765
8766 // 4) if the TTI hook indicates this is profitable, request predication.
8767 TailFoldingInfo TFI(TLI, &LVL, IAI);
8768 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
8769 return CM_ScalarEpilogueNotNeededUsePredicate;
8770
8771 return CM_ScalarEpilogueAllowed;
8772}
8773
8774// Process the loop in the VPlan-native vectorization path. This path builds
8775// VPlan upfront in the vectorization pipeline, which allows applying
8776// VPlan-to-VPlan transformations from the very beginning without modifying the
8777// input LLVM IR.
8778static bool processLoopInVPlanNativePath(
8779 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
8780 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
8781 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
8782 OptimizationRemarkEmitter *ORE,
8783 std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
8784 LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {
8785
8786 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
8787 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
8788 return false;
8789 }
8790 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
8791 Function *F = L->getHeader()->getParent();
8792 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
8793
8794 ScalarEpilogueLowering SEL =
8795 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL&: *LVL, IAI: &IAI);
8796
8797 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
8798 GetBFI, F, &Hints, IAI, OptForSize);
8799 // Use the planner for outer loop vectorization.
8800 // TODO: CM is not used at this point inside the planner. Turn CM into an
8801 // optional argument if we don't need it in the future.
8802 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
8803 ORE);
8804
8805 // Get user vectorization factor.
8806 ElementCount UserVF = Hints.getWidth();
8807
8808 CM.collectElementTypesForWidening();
8809
8810 // Plan how to best vectorize, return the best VF and its cost.
8811 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
8812
8813 // If we are stress testing VPlan builds, do not attempt to generate vector
8814 // code. Masked vector code generation support will follow soon.
8815 // Also, do not attempt to vectorize if no vector code will be produced.
8816 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
8817 return false;
8818
8819 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
8820
8821 {
8822 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
8823 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
8824 Checks, BestPlan);
8825 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
8826 << L->getHeader()->getParent()->getName() << "\"\n");
8827 LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, /*UF=*/1,
8828 MinProfitableTripCount: VF.MinProfitableTripCount);
8829
8830 LVP.executePlan(BestVF: VF.Width, /*UF=*/BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
8831 }
8832
8833 reportVectorization(ORE, TheLoop: L, VF, IC: 1);
8834
8835 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
8836 return true;
8837}
8838
8839// Emit a remark if there are stores to floats that required a floating point
8840// extension. If the vectorized loop performs its computation in a wider
8841// floating-point type there will be a performance penalty from the conversion
8842// overhead and the change in the vector width.
8843static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
8844 SmallVector<Instruction *, 4> Worklist;
8845 for (BasicBlock *BB : L->getBlocks()) {
8846 for (Instruction &Inst : *BB) {
8847 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
8848 if (S->getValueOperand()->getType()->isFloatTy())
8849 Worklist.push_back(Elt: S);
8850 }
8851 }
8852 }
8853
8854  // Traverse the floating point stores upwards, searching for floating point
8855  // conversions.
8856 SmallPtrSet<const Instruction *, 4> Visited;
8857 SmallPtrSet<const Instruction *, 4> EmittedRemark;
8858 while (!Worklist.empty()) {
8859 auto *I = Worklist.pop_back_val();
8860 if (!L->contains(Inst: I))
8861 continue;
8862 if (!Visited.insert(Ptr: I).second)
8863 continue;
8864
8865 // Emit a remark if the floating point store required a floating
8866 // point conversion.
8867 // TODO: More work could be done to identify the root cause such as a
8868 // constant or a function return type and point the user to it.
8869 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
8870 ORE->emit(RemarkBuilder: [&]() {
8871 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
8872 I->getDebugLoc(), L->getHeader())
8873 << "floating point conversion changes vector width. "
8874 << "Mixed floating point precision requires an up/down "
8875 << "cast that will negatively impact performance.";
8876 });
8877
8878 for (Use &Op : I->operands())
8879 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
8880 Worklist.push_back(Elt: OpI);
8881 }
8882}
8883
8884/// For loops with uncountable early exits, find the cost of doing work when
8885/// exiting the loop early, such as calculating the final exit values of
8886/// variables used outside the loop.
8887/// TODO: This is currently overly pessimistic because the loop may not take
8888/// the early exit, but better to keep this conservative for now. In future,
8889/// it might be possible to relax this by using branch probabilities.
8890static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
8891 VPlan &Plan, ElementCount VF) {
8892 InstructionCost Cost = 0;
8893 for (auto *ExitVPBB : Plan.getExitBlocks()) {
8894 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
8895 // If the predecessor is not the middle.block, then it must be the
8896 // vector.early.exit block, which may contain work to calculate the exit
8897 // values of variables used outside the loop.
8898 if (PredVPBB != Plan.getMiddleBlock()) {
8899 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
8900 << PredVPBB->getName() << ":\n");
8901 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
8902 }
8903 }
8904 }
8905 return Cost;
8906}
8907
8908/// This function determines whether or not it's still profitable to vectorize
8909/// the loop given the extra work we have to do outside of the loop:
8910/// 1. Perform the runtime checks before entering the loop to ensure it's safe
8911/// to vectorize.
8912/// 2. In the case of loops with uncountable early exits, we may have to do
8913/// extra work when exiting the loop early, such as calculating the final
8914/// exit values of variables used outside the loop.
8915/// 3. The middle block.
8916static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
8917 VectorizationFactor &VF, Loop *L,
8918 PredicatedScalarEvolution &PSE,
8919 VPCostContext &CostCtx, VPlan &Plan,
8920 ScalarEpilogueLowering SEL,
8921 std::optional<unsigned> VScale) {
8922 InstructionCost RtC = Checks.getCost();
8923 if (!RtC.isValid())
8924 return false;
8925
8926  // When interleaving only, the scalar and vector costs will be equal, which in
8927  // turn would lead to a divide by 0. Fall back to a hard threshold instead.
8928 if (VF.Width.isScalar()) {
8929 // TODO: Should we rename VectorizeMemoryCheckThreshold?
8930 if (RtC > VectorizeMemoryCheckThreshold) {
8931 LLVM_DEBUG(
8932 dbgs()
8933 << "LV: Interleaving only is not profitable due to runtime checks\n");
8934 return false;
8935 }
8936 return true;
8937 }
8938
8939 // The scalar cost should only be 0 when vectorizing with a user specified
8940 // VF/IC. In those cases, runtime checks should always be generated.
8941 uint64_t ScalarC = VF.ScalarCost.getValue();
8942 if (ScalarC == 0)
8943 return true;
8944
8945 InstructionCost TotalCost = RtC;
8946 // Add on the cost of any work required in the vector early exit block, if
8947 // one exists.
8948 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
8949 TotalCost += Plan.getMiddleBlock()->cost(VF: VF.Width, Ctx&: CostCtx);
8950
8951 // First, compute the minimum iteration count required so that the vector
8952 // loop outperforms the scalar loop.
8953 // The total cost of the scalar loop is
8954 // ScalarC * TC
8955 // where
8956 // * TC is the actual trip count of the loop.
8957 // * ScalarC is the cost of a single scalar iteration.
8958 //
8959 // The total cost of the vector loop is
8960 // TotalCost + VecC * (TC / VF) + EpiC
8961 // where
8962  //  * TotalCost is the sum of the costs of:
8963 // - the generated runtime checks, i.e. RtC
8964 // - performing any additional work in the vector.early.exit block for
8965 // loops with uncountable early exits.
8966 // - the middle block, if ExpectedTC <= VF.Width.
8967 // * VecC is the cost of a single vector iteration.
8968 // * TC is the actual trip count of the loop
8969 // * VF is the vectorization factor
8970  //  * EpiC is the cost of the generated epilogue, including the cost
8971 // of the remaining scalar operations.
8972 //
8973 // Vectorization is profitable once the total vector cost is less than the
8974 // total scalar cost:
8975 // TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
8976 //
8977 // Now we can compute the minimum required trip count TC as
8978 // VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
8979 //
8980  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
8981  // the divisions below are rounded up (divideCeil), hence we get an upper
8982  // estimate of the minimum TC.
8983 unsigned IntVF = estimateElementCount(VF: VF.Width, VScale);
8984 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
8985 uint64_t MinTC1 =
8986 Div == 0 ? 0 : divideCeil(Numerator: TotalCost.getValue() * IntVF, Denominator: Div);
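  // Illustrative example (arbitrary costs): with ScalarC = 4, IntVF = 4, a
  // vector iteration cost of 10 and TotalCost = 20, Div = 4 * 4 - 10 = 6 and
  // MinTC1 = ceil(20 * 4 / 6) = 14 iterations.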
8987
8988 // Second, compute a minimum iteration count so that the cost of the
8989 // runtime checks is only a fraction of the total scalar loop cost. This
8990 // adds a loop-dependent bound on the overhead incurred if the runtime
8991 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
8992 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
8993 // cost, compute
8994 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
8995 uint64_t MinTC2 = divideCeil(Numerator: RtC.getValue() * 10, Denominator: ScalarC);
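  // The factor of 10 corresponds to X = 10 above, i.e. the runtime checks may
  // cost at most roughly a tenth of the scalar loop. For example, with
  // RtC = 30 and ScalarC = 4, MinTC2 = ceil(30 * 10 / 4) = 75 iterations.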
8996
8997 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
8998 // epilogue is allowed, choose the next closest multiple of VF. This should
8999 // partly compensate for ignoring the epilogue cost.
9000 uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
9001 if (SEL == CM_ScalarEpilogueAllowed)
9002 MinTC = alignTo(Value: MinTC, Align: IntVF);
9003 VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);
9004
9005 LLVM_DEBUG(
9006 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9007 << VF.MinProfitableTripCount << "\n");
9008
9009 // Skip vectorization if the expected trip count is less than the minimum
9010 // required trip count.
9011 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9012 if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
9013 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9014 "trip count < minimum profitable VF ("
9015 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9016 << ")\n");
9017
9018 return false;
9019 }
9020 }
9021 return true;
9022}
9023
9024LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9025 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9026 !EnableLoopInterleaving),
9027 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9028 !EnableLoopVectorization) {}
9029
9030/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9031/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9032/// don't have a corresponding wide induction in \p EpiPlan.
9033static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9034 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9035 // will need their resume-values computed in the main vector loop. Others
9036 // can be removed from the main VPlan.
9037 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9038 for (VPRecipeBase &R :
9039 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9040 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
9041 continue;
9042 EpiWidenedPhis.insert(
9043 Ptr: cast<PHINode>(Val: R.getVPSingleValue()->getUnderlyingValue()));
9044 }
9045 for (VPRecipeBase &R :
9046 make_early_inc_range(Range: MainPlan.getScalarHeader()->phis())) {
9047 auto *VPIRInst = cast<VPIRPhi>(Val: &R);
9048 if (EpiWidenedPhis.contains(Ptr: &VPIRInst->getIRPhi()))
9049 continue;
9050 // There is no corresponding wide induction in the epilogue plan that would
9051 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9052 // together with the corresponding ResumePhi. The resume values for the
9053 // scalar loop will be created during execution of EpiPlan.
9054 VPRecipeBase *ResumePhi = VPIRInst->getOperand(N: 0)->getDefiningRecipe();
9055 VPIRInst->eraseFromParent();
9056 ResumePhi->eraseFromParent();
9057 }
9058 RUN_VPLAN_PASS(VPlanTransforms::removeDeadRecipes, MainPlan);
9059
9060 using namespace VPlanPatternMatch;
9061 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9062 // introduce multiple uses of undef/poison. If the reduction start value may
9063 // be undef or poison it needs to be frozen and the frozen start has to be
9064 // used when computing the reduction result. We also need to use the frozen
9065 // value in the resume phi generated by the main vector loop, as this is also
9066 // used to compute the reduction result after the epilogue vector loop.
9067 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9068 bool UpdateResumePhis) {
9069 VPBuilder Builder(Plan.getEntry());
9070 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9071 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9072 if (!VPI)
9073 continue;
9074 VPValue *OrigStart;
9075 if (!matchFindIVResult(VPI, ReducedIV: m_VPValue(), Start: m_VPValue(V&: OrigStart)))
9076 continue;
9077 if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
9078 continue;
9079 VPInstruction *Freeze =
9080 Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, DL: {}, Name: "fr");
9081 VPI->setOperand(I: 2, New: Freeze);
9082 if (UpdateResumePhis)
9083 OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
9084 return Freeze != &U && isa<VPPhi>(Val: &U);
9085 });
9086 }
9087 };
9088 AddFreezeForFindLastIVReductions(MainPlan, true);
9089 AddFreezeForFindLastIVReductions(EpiPlan, false);
9090
9091 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9092 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9093 // If there is a suitable resume value for the canonical induction in the
9094 // scalar (which will become vector) epilogue loop, use it and move it to the
9095 // beginning of the scalar preheader. Otherwise create it below.
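  // The candidate matched below is a scalar resume phi whose incoming values
  // are the vector trip count and zero, i.e. (sketch, names illustrative):
  //   %vec.epilog.resume.val = phi [ %vector.trip.count, ... ], [ 0, ... ]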
9096 auto ResumePhiIter =
9097 find_if(Range: MainScalarPH->phis(), P: [VectorTC](VPRecipeBase &R) {
9098 return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Ops: m_Specific(VPV: VectorTC),
9099 Ops: m_ZeroInt()));
9100 });
9101 VPPhi *ResumePhi = nullptr;
9102 if (ResumePhiIter == MainScalarPH->phis().end()) {
9103 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9104 ResumePhi = ScalarPHBuilder.createScalarPhi(
9105 IncomingValues: {VectorTC,
9106 MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue()},
9107 DL: {}, Name: "vec.epilog.resume.val");
9108 } else {
9109 ResumePhi = cast<VPPhi>(Val: &*ResumePhiIter);
9110 if (MainScalarPH->begin() == MainScalarPH->end())
9111 ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->end());
9112 else if (&*MainScalarPH->begin() != ResumePhi)
9113 ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->begin());
9114 }
  // Add a user to make sure the resume phi won't get removed.
9116 VPBuilder(MainScalarPH)
9117 .createNaryOp(Opcode: VPInstruction::ResumeForEpilogue, Operands: ResumePhi);
9118}
9119
9120/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9121/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
9122/// reductions require creating new instructions to compute the resume values.
/// They are collected in a vector and returned. They must be moved to the
/// preheader of the vector epilogue loop after it has been created by
/// executing \p Plan.
9126static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
9127 VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
9128 EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
9129 ScalarEvolution &SE) {
9130 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9131 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9132 Header->setName("vec.epilog.vector.body");
9133
9134 VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
9135 // When vectorizing the epilogue loop, the canonical induction needs to be
9136 // adjusted by the value after the main vector loop. Find the resume value
9137 // created during execution of the main VPlan. It must be the first phi in the
9138 // loop preheader. Use the value to increment the canonical IV, and update all
9139 // users in the loop region to use the adjusted value.
9140 // FIXME: Improve modeling for canonical IV start values in the epilogue
9141 // loop.
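  // E.g. if the main vector loop already processed 96 iterations, the resume
  // value is 96 and every user of the canonical IV inside the epilogue loop
  // region effectively sees (canonical-IV + 96).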
9142 using namespace llvm::PatternMatch;
9143 PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
9144 for (Value *Inc : EPResumeVal->incoming_values()) {
9145 if (match(V: Inc, P: m_SpecificInt(V: 0)))
9146 continue;
9147 assert(!EPI.VectorTripCount &&
9148 "Must only have a single non-zero incoming value");
9149 EPI.VectorTripCount = Inc;
9150 }
  // If we didn't find a non-zero vector trip count, all incoming values
  // must be zero, which also means the vector trip count is zero. Pick the
  // first zero as the vector trip count. This can happen when the trip count
  // is known to be smaller than the main loop's VF * UF, so the main vector
  // loop is known to be dead and every resume value is zero.
  // TODO: Avoid choosing a VF * UF for which the main vector loop is known
  // to be dead.
9156 if (!EPI.VectorTripCount) {
9157 assert(EPResumeVal->getNumIncomingValues() > 0 &&
9158 all_of(EPResumeVal->incoming_values(),
9159 [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
9160 "all incoming values must be 0");
9161 EPI.VectorTripCount = EPResumeVal->getOperand(i_nocapture: 0);
9162 }
9163 VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
9164 assert(all_of(IV->users(),
9165 [](const VPUser *U) {
9166 return isa<VPScalarIVStepsRecipe>(U) ||
9167 isa<VPDerivedIVRecipe>(U) ||
9168 cast<VPRecipeBase>(U)->isScalarCast() ||
9169 cast<VPInstruction>(U)->getOpcode() ==
9170 Instruction::Add;
9171 }) &&
9172 "the canonical IV should only be used by its increment or "
9173 "ScalarIVSteps when resetting the start value");
9174 VPBuilder Builder(Header, Header->getFirstNonPhi());
9175 VPInstruction *Add = Builder.createAdd(LHS: IV, RHS: VPV);
9176 IV->replaceAllUsesWith(New: Add);
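  // The replaceAllUsesWith above also rewrote the IV operand of Add itself;
  // restore it so the resume value is added to the canonical IV.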
9177 Add->setOperand(I: 0, New: IV);
9178
9179 DenseMap<Value *, Value *> ToFrozen;
9180 SmallVector<Instruction *> InstsToMove;
9181 // Ensure that the start values for all header phi recipes are updated before
9182 // vectorizing the epilogue loop. Skip the canonical IV, which has been
9183 // handled above.
9184 for (VPRecipeBase &R : drop_begin(RangeOrContainer: Header->phis())) {
9185 Value *ResumeV = nullptr;
9186 // TODO: Move setting of resume values to prepareToExecute.
9187 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
9188 // Find the reduction result by searching users of the phi or its backedge
9189 // value.
9190 auto IsReductionResult = [](VPRecipeBase *R) {
9191 auto *VPI = dyn_cast<VPInstruction>(Val: R);
9192 if (!VPI)
9193 return false;
9194 return VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9195 VPI->getOpcode() == VPInstruction::ComputeReductionResult;
9196 };
9197 auto *RdxResult = cast<VPInstruction>(
9198 Val: vputils::findRecipe(Start: ReductionPhi->getBackedgeValue(), Pred: IsReductionResult));
9199 assert(RdxResult && "expected to find reduction result");
9200
9201 ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
9202 ->getIncomingValueForBlock(BB: L->getLoopPreheader());
9203
9204 // Check for FindIV pattern by looking for icmp user of RdxResult.
9205 // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
9206 using namespace VPlanPatternMatch;
9207 VPValue *SentinelVPV = nullptr;
9208 bool IsFindIV = any_of(Range: RdxResult->users(), P: [&](VPUser *U) {
9209 return match(U, P: VPlanPatternMatch::m_SpecificICmp(
9210 MatchPred: ICmpInst::ICMP_NE, Op0: m_Specific(VPV: RdxResult),
9211 Op1: m_VPValue(V&: SentinelVPV)));
9212 });
9213
9214 if (RdxResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
9215 Value *StartV = RdxResult->getOperand(N: 0)->getLiveInIRValue();
9216 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9217 // start value; compare the final value from the main vector loop
9218 // to the start value.
9219 BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
9220 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9221 ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
9222 if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
9223 InstsToMove.push_back(Elt: I);
9224 } else if (IsFindIV) {
9225 assert(SentinelVPV && "expected to find icmp using RdxResult");
9226
9227 // Get the frozen start value from the main loop.
9228 Value *FrozenStartV = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
9229 BB: EPI.MainLoopIterationCountCheck);
9230 if (auto *FreezeI = dyn_cast<FreezeInst>(Val: FrozenStartV))
9231 ToFrozen[FreezeI->getOperand(i_nocapture: 0)] = FrozenStartV;
9232
9233 // Adjust resume: select(icmp eq ResumeV, FrozenStartV), Sentinel,
9234 // ResumeV
9235 BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
9236 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9237 Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: FrozenStartV);
9238 if (auto *I = dyn_cast<Instruction>(Val: Cmp))
9239 InstsToMove.push_back(Elt: I);
9240 ResumeV =
9241 Builder.CreateSelect(C: Cmp, True: SentinelVPV->getLiveInIRValue(), False: ResumeV);
9242 if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
9243 InstsToMove.push_back(Elt: I);
9244 } else {
9245 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9246 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
9247 if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
9248 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9249 "unexpected start value");
9250 VPI->setOperand(I: 0, New: StartVal);
9251 continue;
9252 }
9253 }
9254 } else {
9255 // Retrieve the induction resume values for wide inductions from
9256 // their original phi nodes in the scalar loop.
9257 PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
      // Hook up to the PHINode generated by a ResumePhi recipe of the main
      // loop VPlan, which feeds the scalar loop.
9260 ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
9261 }
9262 assert(ResumeV && "Must have a resume value");
9263 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9264 cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
9265 }
9266
9267 // For some VPValues in the epilogue plan we must re-use the generated IR
9268 // values from the main plan. Replace them with live-in VPValues.
9269 // TODO: This is a workaround needed for epilogue vectorization and it
9270 // should be removed once induction resume value creation is done
9271 // directly in VPlan.
9272 for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
9273 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9274 // epilogue plan. This ensures all users use the same frozen value.
9275 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9276 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9277 VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
9278 V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
9279 continue;
9280 }
9281
    // Re-use the trip count and steps expanded for the main loop, as
    // skeleton creation needs them as values that dominate both the scalar
    // and vector epilogue loops.
9285 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
9286 if (!ExpandR)
9287 continue;
9288 VPValue *ExpandedVal =
9289 Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
9290 ExpandR->replaceAllUsesWith(New: ExpandedVal);
9291 if (Plan.getTripCount() == ExpandR)
9292 Plan.resetTripCount(NewTripCount: ExpandedVal);
9293 ExpandR->eraseFromParent();
9294 }
9295
9296 auto VScale = CM.getVScaleForTuning();
9297 unsigned MainLoopStep =
9298 estimateElementCount(VF: EPI.MainLoopVF * EPI.MainLoopUF, VScale);
9299 unsigned EpilogueLoopStep =
9300 estimateElementCount(VF: EPI.EpilogueVF * EPI.EpilogueUF, VScale);
9301 VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
9302 Plan, TripCount: EPI.TripCount, VectorTripCount: EPI.VectorTripCount,
9303 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()), EpilogueVF: EPI.EpilogueVF,
9304 EpilogueUF: EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
9305
9306 return InstsToMove;
9307}
9308
// Generate the bypass value for induction \p OrigPhi from the additional
// bypass block. Note that when the vectorized epilogue is skipped due to its
// iteration count check, the resume value for the induction variable comes
// from the trip count of the main vector loop, passed in as
// \p MainVectorTripCount.
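// For an integer induction with start value S and step C this computes,
// roughly, ind.end = S + MainVectorTripCount * C (with the corresponding GEP
// or floating-point operations for pointer and FP inductions).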
9313static Value *createInductionAdditionalBypassValues(
9314 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9315 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9316 Instruction *OldInduction) {
9317 Value *Step = getExpandedStep(ID: II, ExpandedSCEVs);
9318 // For the primary induction the additional bypass end value is known.
9319 // Otherwise it is computed.
9320 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9321 if (OrigPhi != OldInduction) {
9322 auto *BinOp = II.getInductionBinOp();
9323 // Fast-math-flags propagate from the original induction instruction.
9324 if (isa_and_nonnull<FPMathOperator>(Val: BinOp))
9325 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9326
9327 // Compute the end value for the additional bypass.
9328 EndValueFromAdditionalBypass =
9329 emitTransformedIndex(B&: BypassBuilder, Index: MainVectorTripCount,
9330 StartValue: II.getStartValue(), Step, InductionKind: II.getKind(), InductionBinOp: BinOp);
9331 EndValueFromAdditionalBypass->setName("ind.end");
9332 }
9333 return EndValueFromAdditionalBypass;
9334}
9335
9336static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
9337 VPlan &BestEpiPlan,
9338 LoopVectorizationLegality &LVL,
9339 const SCEV2ValueTy &ExpandedSCEVs,
9340 Value *MainVectorTripCount) {
9341 // Fix reduction resume values from the additional bypass block.
9342 BasicBlock *PH = L->getLoopPreheader();
9343 for (auto *Pred : predecessors(BB: PH)) {
9344 for (PHINode &Phi : PH->phis()) {
9345 if (Phi.getBasicBlockIndex(BB: Pred) != -1)
9346 continue;
9347 Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
9348 }
9349 }
9350 auto *ScalarPH = cast<VPIRBasicBlock>(Val: BestEpiPlan.getScalarPreheader());
9351 if (ScalarPH->hasPredecessors()) {
9352 // If ScalarPH has predecessors, we may need to update its reduction
9353 // resume values.
9354 for (const auto &[R, IRPhi] :
9355 zip(t: ScalarPH->phis(), u: ScalarPH->getIRBasicBlock()->phis())) {
9356 fixReductionScalarResumeWhenVectorizingEpilog(EpiResumePhiR: cast<VPPhi>(Val: &R), EpiResumePhi&: IRPhi,
9357 BypassBlock);
9358 }
9359 }
9360
9361 // Fix induction resume values from the additional bypass block.
9362 IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
9363 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
9364 auto *Inc = cast<PHINode>(Val: IVPhi->getIncomingValueForBlock(BB: PH));
9365 Value *V = createInductionAdditionalBypassValues(
9366 OrigPhi: IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
9367 OldInduction: LVL.getPrimaryInduction());
9368 // TODO: Directly add as extra operand to the VPResumePHI recipe.
9369 Inc->setIncomingValueForBlock(BB: BypassBlock, V);
9370 }
9371}
9372
/// Connect the epilogue vector loop generated for \p EpiPlan to the main
/// vector loop, after both plans have executed, updating branches from the
/// iteration and runtime checks of the main loop, as well as updating various
/// phis. \p InstsToMove contains instructions that need to be moved to the
/// preheader of the epilogue vector loop.
9378static void connectEpilogueVectorLoop(
9379 VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
9380 DominatorTree *DT, LoopVectorizationLegality &LVL,
9381 DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
9382 ArrayRef<Instruction *> InstsToMove) {
9383 BasicBlock *VecEpilogueIterationCountCheck =
9384 cast<VPIRBasicBlock>(Val: EpiPlan.getEntry())->getIRBasicBlock();
9385
9386 BasicBlock *VecEpiloguePreHeader =
9387 cast<BranchInst>(Val: VecEpilogueIterationCountCheck->getTerminator())
9388 ->getSuccessor(i: 1);
9389 // Adjust the control flow taking the state info from the main loop
9390 // vectorization into account.
9391 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
9392 "expected this to be saved from the previous pass.");
9393 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
9394 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
9395 From: VecEpilogueIterationCountCheck, To: VecEpiloguePreHeader);
9396
9397 DTU.applyUpdates(Updates: {{DominatorTree::Delete, EPI.MainLoopIterationCountCheck,
9398 VecEpilogueIterationCountCheck},
9399 {DominatorTree::Insert, EPI.MainLoopIterationCountCheck,
9400 VecEpiloguePreHeader}});
9401
9402 BasicBlock *ScalarPH =
9403 cast<VPIRBasicBlock>(Val: EpiPlan.getScalarPreheader())->getIRBasicBlock();
9404 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
9405 From: VecEpilogueIterationCountCheck, To: ScalarPH);
9406 DTU.applyUpdates(
9407 Updates: {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck,
9408 VecEpilogueIterationCountCheck},
9409 {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}});
9410
9411 // Adjust the terminators of runtime check blocks and phis using them.
9412 BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
9413 BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
9414 if (SCEVCheckBlock) {
9415 SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
9416 From: VecEpilogueIterationCountCheck, To: ScalarPH);
9417 DTU.applyUpdates(Updates: {{DominatorTree::Delete, SCEVCheckBlock,
9418 VecEpilogueIterationCountCheck},
9419 {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
9420 }
9421 if (MemCheckBlock) {
9422 MemCheckBlock->getTerminator()->replaceUsesOfWith(
9423 From: VecEpilogueIterationCountCheck, To: ScalarPH);
9424 DTU.applyUpdates(
9425 Updates: {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
9426 {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
9427 }
9428
9429 // The vec.epilog.iter.check block may contain Phi nodes from inductions
9430 // or reductions which merge control-flow from the latch block and the
9431 // middle block. Update the incoming values here and move the Phi into the
9432 // preheader.
9433 SmallVector<PHINode *, 4> PhisInBlock(
9434 llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));
9435
9436 for (PHINode *Phi : PhisInBlock) {
9437 Phi->moveBefore(InsertPos: VecEpiloguePreHeader->getFirstNonPHIIt());
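    // The phi used to merge values arriving via the count check's single
    // predecessor; now that it lives in the preheader, that edge arrives via
    // VecEpilogueIterationCountCheck instead.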
9438 Phi->replaceIncomingBlockWith(
9439 Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
9440 New: VecEpilogueIterationCountCheck);
9441
9442 // If the phi doesn't have an incoming value from the
9443 // EpilogueIterationCountCheck, we are done. Otherwise remove the
9444 // incoming value and also those from other check blocks. This is needed
9445 // for reduction phis only.
9446 if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
9447 return EPI.EpilogueIterationCountCheck == IncB;
9448 }))
9449 continue;
9450 Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
9451 if (SCEVCheckBlock)
9452 Phi->removeIncomingValue(BB: SCEVCheckBlock);
9453 if (MemCheckBlock)
9454 Phi->removeIncomingValue(BB: MemCheckBlock);
9455 }
9456
9457 auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
9458 for (auto *I : InstsToMove)
9459 I->moveBefore(InsertPos: IP);
9460
9461 // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
9462 // after executing the main loop. We need to update the resume values of
9463 // inductions and reductions during epilogue vectorization.
9464 fixScalarResumeValuesFromBypass(BypassBlock: VecEpilogueIterationCountCheck, L, BestEpiPlan&: EpiPlan,
9465 LVL, ExpandedSCEVs, MainVectorTripCount: EPI.VectorTripCount);
9466}
9467
9468bool LoopVectorizePass::processLoop(Loop *L) {
9469 assert((EnableVPlanNativePath || L->isInnermost()) &&
9470 "VPlan-native path is not enabled. Only process inner loops.");
9471
9472 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9473 << L->getHeader()->getParent()->getName() << "' from "
9474 << L->getLocStr() << "\n");
9475
9476 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9477
9478 LLVM_DEBUG(
9479 dbgs() << "LV: Loop hints:"
9480 << " force="
9481 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9482 ? "disabled"
9483 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9484 ? "enabled"
9485 : "?"))
9486 << " width=" << Hints.getWidth()
9487 << " interleave=" << Hints.getInterleave() << "\n");
9488
9489 // Function containing loop
9490 Function *F = L->getHeader()->getParent();
9491
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
9499
9500 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9501 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9502 return false;
9503 }
9504
9505 PredicatedScalarEvolution PSE(*SE, *L);
9506
9507 // Query this against the original loop and save it here because the profile
9508 // of the original loop header may change as the transformation happens.
9509 bool OptForSize = llvm::shouldOptimizeForSize(
9510 BB: L->getHeader(), PSI,
9511 BFI: PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
9512 QueryType: PGSOQueryType::IRPass);
9513
9514 // Check if it is legal to vectorize the loop.
9515 LoopVectorizationRequirements Requirements;
9516 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9517 &Requirements, &Hints, DB, AC,
9518 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
9519 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9520 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9521 Hints.emitRemarkWithHints();
9522 return false;
9523 }
9524
9525 if (LVL.hasUncountableEarlyExit()) {
9526 if (!EnableEarlyExitVectorization) {
9527 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
9528 "early exit is not enabled",
9529 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
9530 return false;
9531 }
9532 SmallVector<BasicBlock *, 8> ExitingBlocks;
9533 L->getExitingBlocks(ExitingBlocks);
9534 // TODO: Support multiple uncountable early exits.
9535 if (ExitingBlocks.size() - LVL.getCountableExitingBlocks().size() > 1) {
9536 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with multiple "
9537 "uncountable early exits is not yet supported",
9538 ORETag: "MultipleUncountableEarlyExits", ORE, TheLoop: L);
9539 return false;
9540 }
9541 }
9542
9543 if (!LVL.getPotentiallyFaultingLoads().empty()) {
9544 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with potentially "
9545 "faulting load is not supported",
9546 ORETag: "PotentiallyFaultingLoadsNotSupported", ORE, TheLoop: L);
9547 return false;
9548 }
9549
9550 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9551 // here. They may require CFG and instruction level transformations before
9552 // even evaluating whether vectorization is profitable. Since we cannot modify
9553 // the incoming IR, we need to build VPlan upfront in the vectorization
9554 // pipeline.
9555 if (!L->isInnermost())
9556 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9557 ORE, GetBFI, OptForSize, Hints,
9558 Requirements);
9559
9560 assert(L->isInnermost() && "Inner loop expected.");
9561
9562 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9563 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9564
9565 // If an override option has been passed in for interleaved accesses, use it.
9566 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9567 UseInterleaved = EnableInterleavedMemAccesses;
9568
9569 // Analyze interleaved memory accesses.
9570 if (UseInterleaved)
9571 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9572
9573 if (LVL.hasUncountableEarlyExit()) {
9574 BasicBlock *LoopLatch = L->getLoopLatch();
9575 if (IAI.requiresScalarEpilogue() ||
9576 any_of(Range: LVL.getCountableExitingBlocks(),
9577 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9578 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
9579 "requiring a scalar epilogue is unsupported",
9580 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
9581 return false;
9582 }
9583 }
9584
9585 // Check the function attributes and profiles to find out if this function
9586 // should be optimized for size.
9587 ScalarEpilogueLowering SEL =
9588 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, IAI: &IAI);
9589
9590 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9591 // count by optimizing for size, to minimize overheads.
9592 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9593 if (ExpectedTC && ExpectedTC->isFixed() &&
9594 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9595 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9596 << "This loop is worth vectorizing only if no scalar "
9597 << "iteration overheads are incurred.");
9598 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9599 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9600 else {
9601 LLVM_DEBUG(dbgs() << "\n");
      // Predicated (tail-folded) loops are efficient even when the loop
9603 // iteration count is low. However, setting the epilogue policy to
9604 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9605 // with runtime checks. It's more effective to let
9606 // `isOutsideLoopWorkProfitable` determine if vectorization is
9607 // beneficial for the loop.
9608 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9609 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9610 }
9611 }
9612
9613 // Check the function attributes to see if implicit floats or vectors are
9614 // allowed.
9615 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
9616 reportVectorizationFailure(
9617 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
9618 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
9619 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
9620 Hints.emitRemarkWithHints();
9621 return false;
9622 }
9623
9624 // Check if the target supports potentially unsafe FP vectorization.
9625 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9626 // for the target we're vectorizing for, to make sure none of the
9627 // additional fp-math flags can help.
9628 if (Hints.isPotentiallyUnsafe() &&
9629 TTI->isFPVectorizationPotentiallyUnsafe()) {
9630 reportVectorizationFailure(
9631 DebugMsg: "Potentially unsafe FP op prevents vectorization",
9632 OREMsg: "loop not vectorized due to unsafe FP support.",
9633 ORETag: "UnsafeFP", ORE, TheLoop: L);
9634 Hints.emitRemarkWithHints();
9635 return false;
9636 }
9637
9638 bool AllowOrderedReductions;
9639 // If the flag is set, use that instead and override the TTI behaviour.
9640 if (ForceOrderedReductions.getNumOccurrences() > 0)
9641 AllowOrderedReductions = ForceOrderedReductions;
9642 else
9643 AllowOrderedReductions = TTI->enableOrderedReductions();
9644 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
9645 ORE->emit(RemarkBuilder: [&]() {
9646 auto *ExactFPMathInst = Requirements.getExactFPInst();
9647 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9648 ExactFPMathInst->getDebugLoc(),
9649 ExactFPMathInst->getParent())
9650 << "loop not vectorized: cannot prove it is safe to reorder "
9651 "floating-point operations";
9652 });
9653 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9654 "reorder floating-point operations\n");
9655 Hints.emitRemarkWithHints();
9656 return false;
9657 }
9658
9659 // Use the cost model.
9660 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9661 GetBFI, F, &Hints, IAI, OptForSize);
9662 // Use the planner for vectorization.
9663 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9664 ORE);
9665
9666 // Get user vectorization factor and interleave count.
9667 ElementCount UserVF = Hints.getWidth();
9668 unsigned UserIC = Hints.getInterleave();
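  // A user-specified interleave count above 1 is only honoured when the
  // loop's dependences are safe for any vector width; otherwise it is
  // dropped here and the reason is reported further below.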
9669 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9670 UserIC = 1;
9671
9672 // Plan how to best vectorize.
9673 LVP.plan(UserVF, UserIC);
9674 VectorizationFactor VF = LVP.computeBestVF();
9675 unsigned IC = 1;
9676
9677 if (ORE->allowExtraAnalysis(LV_NAME))
9678 LVP.emitInvalidCostRemarks(ORE);
9679
9680 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
9681 if (LVP.hasPlanWithVF(VF: VF.Width)) {
9682 // Select the interleave count.
9683 IC = LVP.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
9684
9685 unsigned SelectedIC = std::max(a: IC, b: UserIC);
9686 // Optimistically generate runtime checks if they are needed. Drop them if
9687 // they turn out to not be profitable.
9688 if (VF.Width.isVector() || SelectedIC > 1) {
9689 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC,
9690 ORE&: *ORE);
9691
9692 // Bail out early if either the SCEV or memory runtime checks are known to
9693 // fail. In that case, the vector loop would never execute.
9694 using namespace llvm::PatternMatch;
9695 if (Checks.getSCEVChecks().first &&
9696 match(V: Checks.getSCEVChecks().first, P: m_One()))
9697 return false;
9698 if (Checks.getMemRuntimeChecks().first &&
9699 match(V: Checks.getMemRuntimeChecks().first, P: m_One()))
9700 return false;
9701 }
9702
9703 // Check if it is profitable to vectorize with runtime checks.
9704 bool ForceVectorization =
9705 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9706 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF: VF.Width), CM,
9707 CM.CostKind, CM.PSE, L);
9708 if (!ForceVectorization &&
9709 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9710 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
9711 VScale: CM.getVScaleForTuning())) {
9712 ORE->emit(RemarkBuilder: [&]() {
9713 return OptimizationRemarkAnalysisAliasing(
9714 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9715 L->getHeader())
9716 << "loop not vectorized: cannot prove it is safe to reorder "
9717 "memory operations";
9718 });
9719 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9720 Hints.emitRemarkWithHints();
9721 return false;
9722 }
9723 }
9724
9725 // Identify the diagnostic messages that should be produced.
9726 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9727 bool VectorizeLoop = true, InterleaveLoop = true;
9728 if (VF.Width.isScalar()) {
9729 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9730 VecDiagMsg = {
9731 "VectorizationNotBeneficial",
9732 "the cost-model indicates that vectorization is not beneficial"};
9733 VectorizeLoop = false;
9734 }
9735
9736 if (UserIC == 1 && Hints.getInterleave() > 1) {
9737 assert(!LVL.isSafeForAnyVectorWidth() &&
9738 "UserIC should only be ignored due to unsafe dependencies");
9739 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9740 IntDiagMsg = {"InterleavingUnsafe",
9741 "Ignoring user-specified interleave count due to possibly "
9742 "unsafe dependencies in the loop."};
9743 InterleaveLoop = false;
9744 } else if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
9745 // Tell the user interleaving was avoided up-front, despite being explicitly
9746 // requested.
9747 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9748 "interleaving should be avoided up front\n");
9749 IntDiagMsg = {"InterleavingAvoided",
9750 "Ignoring UserIC, because interleaving was avoided up front"};
9751 InterleaveLoop = false;
9752 } else if (IC == 1 && UserIC <= 1) {
9753 // Tell the user interleaving is not beneficial.
9754 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9755 IntDiagMsg = {
9756 "InterleavingNotBeneficial",
9757 "the cost-model indicates that interleaving is not beneficial"};
9758 InterleaveLoop = false;
9759 if (UserIC == 1) {
9760 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9761 IntDiagMsg.second +=
9762 " and is explicitly disabled or interleave count is set to 1";
9763 }
9764 } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9766 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
9767 "disabled.\n");
9768 IntDiagMsg = {"InterleavingBeneficialButDisabled",
9769 "the cost-model indicates that interleaving is beneficial "
9770 "but is explicitly disabled or interleave count is set to 1"};
9771 InterleaveLoop = false;
9772 }
9773
9774 // If there is a histogram in the loop, do not just interleave without
9775 // vectorizing. The order of operations will be incorrect without the
9776 // histogram intrinsics, which are only used for recipes with VF > 1.
9777 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
9778 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9779 << "to histogram operations.\n");
9780 IntDiagMsg = {
9781 "HistogramPreventsScalarInterleaving",
9782 "Unable to interleave without vectorization due to constraints on "
9783 "the order of histogram operations"};
9784 InterleaveLoop = false;
9785 }
9786
9787 // Override IC if user provided an interleave count.
9788 IC = UserIC > 0 ? UserIC : IC;
9789
9790 // FIXME: Enable interleaving for FindLast reductions.
9791 if (InterleaveLoop && hasFindLastReductionPhi(Plan&: LVP.getPlanFor(VF: VF.Width))) {
9792 LLVM_DEBUG(dbgs() << "LV: Not interleaving due to FindLast reduction.\n");
9793 IntDiagMsg = {"FindLastPreventsScalarInterleaving",
9794 "Unable to interleave due to FindLast reduction."};
9795 InterleaveLoop = false;
9796 IC = 1;
9797 }
9798
9799 // Emit diagnostic messages, if any.
9800 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9801 if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9803 ORE->emit(RemarkBuilder: [&]() {
9804 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9805 L->getStartLoc(), L->getHeader())
9806 << VecDiagMsg.second;
9807 });
9808 ORE->emit(RemarkBuilder: [&]() {
9809 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9810 L->getStartLoc(), L->getHeader())
9811 << IntDiagMsg.second;
9812 });
9813 return false;
9814 }
9815
9816 if (!VectorizeLoop && InterleaveLoop) {
9817 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9818 ORE->emit(RemarkBuilder: [&]() {
9819 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9820 L->getStartLoc(), L->getHeader())
9821 << VecDiagMsg.second;
9822 });
9823 } else if (VectorizeLoop && !InterleaveLoop) {
9824 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9825 << ") in " << L->getLocStr() << '\n');
9826 ORE->emit(RemarkBuilder: [&]() {
9827 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9828 L->getStartLoc(), L->getHeader())
9829 << IntDiagMsg.second;
9830 });
9831 } else if (VectorizeLoop && InterleaveLoop) {
9832 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9833 << ") in " << L->getLocStr() << '\n');
9834 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9835 }
9836
  // Report the final decision to the user.
9838 if (VF.Width.isScalar()) {
9839 using namespace ore;
    assert(IC > 1 && "expected interleave count > 1 for scalar VF");
9841 ORE->emit(RemarkBuilder: [&]() {
9842 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9843 L->getHeader())
9844 << "interleaved loop (interleaved count: "
9845 << NV("InterleaveCount", IC) << ")";
9846 });
9847 } else {
    // Report the selected vectorization factor and interleave count.
9849 reportVectorization(ORE, TheLoop: L, VF, IC);
9850 }
9851 if (ORE->allowExtraAnalysis(LV_NAME))
9852 checkMixedPrecision(L, ORE);
9853
  // If we decided that it is both legal and profitable to interleave or
  // vectorize the loop, then do it.
9856
9857 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9858 // Consider vectorizing the epilogue too if it's profitable.
9859 VectorizationFactor EpilogueVF =
9860 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
9861 if (EpilogueVF.Width.isVector()) {
9862 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
9863
9864 // The first pass vectorizes the main loop and creates a scalar epilogue
9865 // to be vectorized by executing the plan (potentially with a different
9866 // factor) again shortly afterwards.
9867 VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width);
9868 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
9869 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
9870 preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan);
9871 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
9872 BestEpiPlan);
9873 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9874 Checks, *BestMainPlan);
9875 auto ExpandedSCEVs = LVP.executePlan(BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF,
9876 BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, VectorizingEpilogue: false);
9877 ++LoopsVectorized;
9878
9879 // Second pass vectorizes the epilogue and adjusts the control flow
9880 // edges from the first pass.
9881 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9882 Checks, BestEpiPlan);
9883 SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
9884 Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI, CM, SE&: *PSE.getSE());
9885 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, DT,
9886 VectorizingEpilogue: true);
9887 connectEpilogueVectorLoop(EpiPlan&: BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
9888 Checks, InstsToMove);
9889 ++LoopsEpilogueVectorized;
9890 } else {
9891 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
9892 BestPlan);
9893 // TODO: Move to general VPlan pipeline once epilogue loops are also
9894 // supported.
9895 RUN_VPLAN_PASS(VPlanTransforms::materializeConstantVectorTripCount,
9896 BestPlan, VF.Width, IC, PSE);
9897 LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, UF: IC,
9898 MinProfitableTripCount: VF.MinProfitableTripCount);
9899
9900 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
9901 ++LoopsVectorized;
9902 }
9903
9904 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
9905 "DT not preserved correctly");
9906 assert(!verifyFunction(*F, &dbgs()));
9907
9908 return true;
9909}
9910
9911LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
9912
9913 // Don't attempt if
9914 // 1. the target claims to have no vector registers, and
9915 // 2. interleaving won't help ILP.
9916 //
9917 // The second condition is necessary because, even if the target has no
9918 // vector registers, loop vectorization may still enable scalar
9919 // interleaving.
9920 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
9921 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
9922 return LoopVectorizeResult(false, false);
9923
9924 bool Changed = false, CFGChanged = false;
9925
9926 // The vectorizer requires loops to be in simplified form.
9927 // Since simplification may add new inner loops, it has to run before the
9928 // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
  // vectorized.
9931 for (const auto &L : *LI)
9932 Changed |= CFGChanged |=
9933 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
9934
9935 // Build up a worklist of inner-loops to vectorize. This is necessary as
9936 // the act of vectorizing or partially unrolling a loop creates new loops
9937 // and can invalidate iterators across the loops.
9938 SmallVector<Loop *, 8> Worklist;
9939
9940 for (Loop *L : *LI)
9941 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
9942
9943 LoopsAnalyzed += Worklist.size();
9944
9945 // Now walk the identified inner loops.
9946 while (!Worklist.empty()) {
9947 Loop *L = Worklist.pop_back_val();
9948
9949 // For the inner loops we actually process, form LCSSA to simplify the
9950 // transform.
9951 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
9952
9953 Changed |= CFGChanged |= processLoop(L);
9954
9955 if (Changed) {
9956 LAIs->clear();
9957
9958#ifndef NDEBUG
9959 if (VerifySCEV)
9960 SE->verify();
9961#endif
9962 }
9963 }
9964
  // All loops have been processed; report whether any changes were made.
9966 return LoopVectorizeResult(Changed, CFGChanged);
9967}
9968
9969PreservedAnalyses LoopVectorizePass::run(Function &F,
9970 FunctionAnalysisManager &AM) {
9971 LI = &AM.getResult<LoopAnalysis>(IR&: F);
  // If there are no loops in the function, return before computing other
  // expensive analyses.
9974 if (LI->empty())
9975 return PreservedAnalyses::all();
9976 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
9977 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
9978 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
9979 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
9980 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
9981 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
9982 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
9983 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
9984 AA = &AM.getResult<AAManager>(IR&: F);
9985
9986 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
9987 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
9988 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
9989 return AM.getResult<BlockFrequencyAnalysis>(IR&: F);
9990 };
9991 LoopVectorizeResult Result = runImpl(F);
9992 if (!Result.MadeAnyChange)
9993 return PreservedAnalyses::all();
9994 PreservedAnalyses PA;
9995
9996 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
9997 for (auto &BB : F)
9998 RemoveRedundantDbgInstrs(BB: &BB);
9999 }
10000
10001 PA.preserve<LoopAnalysis>();
10002 PA.preserve<DominatorTreeAnalysis>();
10003 PA.preserve<ScalarEvolutionAnalysis>();
10004 PA.preserve<LoopAccessAnalysis>();
10005
10006 if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
10011 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
10012 PA.preserve<ShouldRunExtraVectorPasses>();
10013 } else {
10014 PA.preserveSet<CFGAnalyses>();
10015 }
10016 return PA;
10017}
10018
10019void LoopVectorizePass::printPipeline(
10020 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10021 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10022 OS, MapClassName2PassName);
10023
10024 OS << '<';
10025 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10026 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10027 OS << '>';
10028}
10029