1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
97#include "llvm/Analysis/TargetLibraryInfo.h"
98#include "llvm/Analysis/TargetTransformInfo.h"
99#include "llvm/Analysis/ValueTracking.h"
100#include "llvm/Analysis/VectorUtils.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/DiagnosticInfo.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
117#include "llvm/IR/IntrinsicInst.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/ProfDataUtils.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
131#include "llvm/Support/CommandLine.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/NativeFormatting.h"
137#include "llvm/Support/raw_ostream.h"
138#include "llvm/Transforms/Utils/BasicBlockUtils.h"
139#include "llvm/Transforms/Utils/InjectTLIMappings.h"
140#include "llvm/Transforms/Utils/Local.h"
141#include "llvm/Transforms/Utils/LoopSimplify.h"
142#include "llvm/Transforms/Utils/LoopUtils.h"
143#include "llvm/Transforms/Utils/LoopVersioning.h"
144#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
145#include "llvm/Transforms/Utils/SizeOpts.h"
146#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
using namespace llvm;
using namespace SCEVPatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

// Statistics reported via -stats.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");

/// Controls generation of a vectorized epilogue loop to handle the iterations
/// left over after the main vector loop.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// Testing knob: any value greater than 1 forces that VF for all applicable
/// epilogue loops.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Minimum main-loop VF for which an epilogue is considered. Note: no explicit
/// cl::init, so the default is value-initialized (0).
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// Cap on how many runtime memory checks may be emitted for a single loop.
static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

/// Note: This currently only applies to `llvm.masked.load` and
/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
    "force-target-supports-masked-memory-ops", cl::init(Val: false), cl::Hidden,
    cl::desc("Assume the target supports masked memory operations (used for "
             "testing)."));
207
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(Val: PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

/// Testing/tuning knob that overrides the tail-folding style; the default,
/// None, leaves the choice to the cost model.
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(Val: TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fallback to data-without-lane-mask.")));

/// Defined with external linkage in namespace llvm (declared in a header) so
/// other components can query it.
cl::opt<bool> llvm::EnableWideActiveLaneMask(
    "enable-wide-lane-mask", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable use of wide lane masks when used for control flow in "
             "tail-folded loops"));
261
/// When true, consider wider VFs bounded by the smallest element type in the
/// loop rather than stopping at the target's preferred width.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

// The following "force-target-*" options override values normally queried from
// TargetTransformInfo; they exist for testing.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> llvm::ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Threshold below which a loop body's cost counts as "small" for the
/// interleaving heuristics.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(Val: 20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
357
/// Master switch for the VPlan-native vectorization path, which supports outer
/// loop vectorization (see the file header comment).
cl::opt<bool> llvm::EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

/// Run the VPlan verifier after each VPlan transform; enabled by default only
/// in EXPENSIVE_CHECKS builds.
cl::opt<bool>
    llvm::VerifyEachVPlan("vplan-verify-each",
#ifdef EXPENSIVE_CHECKS
                          cl::init(true),
#else
                          cl::init(Val: false),
#endif
                          cl::Hidden,
                          cl::desc("Verify VPlans after VPlan transforms."));

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// VPlan debug-printing knobs; only available in builds with dump support.
cl::opt<bool> llvm::VPlanPrintAfterAll(
    "vplan-print-after-all", cl::init(false), cl::Hidden,
    cl::desc("Print VPlans after all VPlan transformations."));

cl::list<std::string> llvm::VPlanPrintAfterPasses(
    "vplan-print-after", cl::Hidden,
    cl::desc("Print VPlans after specified VPlan transformations (regexp)."));

cl::opt<bool> llvm::VPlanPrintVectorRegionScope(
    "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
    cl::desc("Limit VPlan printing to vector loop region in "
             "`-vplan-print-after*` if the plan has one."));
#endif

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(Val: true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// Tri-state (true/false/unset) override for widening div/rem via a safe
/// divisor; when unset, the cost-based decision is kept.
static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

static cl::opt<bool> ConsiderRegPressure(
    "vectorizer-consider-reg-pressure", cl::init(Val: false), cl::Hidden,
    cl::desc("Discard VFs if their register pressure is too high."));

// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
428
429/// A helper function that returns true if the given type is irregular. The
430/// type is irregular if its allocated size doesn't equal the store size of an
431/// element of the corresponding vector type.
432static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
433 // Determine if an array of N elements of type Ty is "bitcast compatible"
434 // with a <N x Ty> vector.
435 // This is only true if there is no padding between the array elements.
436 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
437}
438
/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
/// ElementCount to include loops whose trip count is a function of vscale.
/// Returns ElementCount::getFixed(0) when no small constant trip count (fixed
/// or vscale-based) can be determined.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
                                              const Loop *L) {
  // Fixed constant trip count known to SCEV.
  if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
    return ElementCount::getFixed(MinVal: ExpectedTC);

  // No fixed constant; check whether the trip count is a small multiple of
  // vscale instead.
  const SCEV *BTC = SE->getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(Val: BTC))
    return ElementCount::getFixed(MinVal: 0);

  const SCEV *ExitCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
  // Trip count is exactly vscale.
  if (isa<SCEVVScale>(Val: ExitCount))
    return ElementCount::getScalable(MinVal: 1);

  // Trip count is (Scale * vscale). Require no unsigned wrap and a scale that
  // fits in 32 bits so the scalable ElementCount's minimum value is exact.
  const APInt *Scale;
  if (match(S: ExitCount, P: m_scev_Mul(Op0: m_scev_APInt(C&: Scale), Op1: m_SCEVVScale())))
    if (cast<SCEVMulExpr>(Val: ExitCount)->hasNoUnsignedWrap())
      if (Scale->getActiveBits() <= 32)
        return ElementCount::getScalable(MinVal: Scale->getZExtValue());

  return ElementCount::getFixed(MinVal: 0);
}
462
463/// Returns "best known" trip count, which is either a valid positive trip count
464/// or std::nullopt when an estimate cannot be made (including when the trip
465/// count would overflow), for the specified loop \p L as defined by the
466/// following procedure:
467/// 1) Returns exact trip count if it is known.
468/// 2) Returns expected trip count according to profile data if any.
469/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
470/// 4) Returns std::nullopt if all of the above failed.
471static std::optional<ElementCount>
472getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
473 bool CanUseConstantMax = true) {
474 // Check if exact trip count is known.
475 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
476 return ExpectedTC;
477
478 // Check if there is an expected trip count available from profile data.
479 if (LoopVectorizeWithBlockFrequency)
480 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
481 return ElementCount::getFixed(MinVal: *EstimatedTC);
482
483 if (!CanUseConstantMax)
484 return std::nullopt;
485
486 // Check if upper bound estimate is known.
487 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
488 return ElementCount::getFixed(MinVal: ExpectedTC);
489
490 return std::nullopt;
491}
492
namespace {
// Forward declare GeneratedRTChecks (referenced by InnerLoopVectorizer below;
// the definition appears later in this file).
class GeneratedRTChecks;

// Convenience alias: map from SCEV expressions to IR values.
using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace
499
namespace llvm {

/// Analysis key uniquely identifying the ShouldRunExtraVectorPasses marker.
AnalysisKey ShouldRunExtraVectorPasses::Key;
503
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      ElementCount VecWidth, unsigned UnrollFactor,
                      LoopVectorizationCostModel *CM,
                      GeneratedRTChecks &RTChecks, VPlan &Plan)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
        Cost(CM), RTChecks(RTChecks), Plan(Plan),
        VectorPHVPBB(cast<VPBasicBlock>(
            Val: Plan.getVectorLoopRegion()->getSinglePredecessor())) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Creates a basic block for the scalar preheader. Both
  /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
  /// the method to create additional blocks and checks needed for epilogue
  /// vectorization.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Fix the vectorized code, taking care of header phi's, and more.
  void fixVectorizedLoop(VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// Create and return a new IR basic block for the scalar preheader whose name
  /// is prefixed with \p Prefix.
  BasicBlock *createScalarPreheader(StringRef Prefix);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// IR builder used during code generation, seeded with the context of the
  /// loop's ScalarEvolution.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// The VPlan for the loop being vectorized.
  VPlan &Plan;

  /// The vector preheader block of \p Plan, used as target for check blocks
  /// introduced during skeleton creation.
  VPBasicBlock *VectorPHVPBB;
};
615
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  // VF and UF selected for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
  unsigned MainLoopUF = 0;
  // VF and UF selected for the epilogue vector loop.
  ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
  unsigned EpilogueUF = 0;
  // Iteration-count check blocks shared between the two stages (see
  // EpilogueVectorizerMainLoop::emitIterationCountCheck).
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  // Original-loop and vector trip counts, carried from the first stage to the
  // second.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;
  // The VPlan used for the epilogue loop.
  VPlan &EpiloguePlan;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF,
                                VPlan &EpiloguePlan)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
        EpiloguePlan(EpiloguePlan) {
    // Interleaving the short residual loop is not expected to pay off.
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
639
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
      EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
      GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
      ElementCount MinProfitableTripCount, unsigned UnrollFactor)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
                            UnrollFactor, CM, Checks, Plan),
        EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;

protected:
  /// Minimum trip count for which executing the vector loop is considered
  /// profitable; presumably consumed when emitting the iteration-count checks
  /// in the derived classes — confirm there.
  ElementCount MinProfitableTripCount;
};
670
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                             LoopInfo *LI, DominatorTree *DT,
                             const TargetTransformInfo *TTI,
                             AssumptionCache *AC,
                             EpilogueLoopVectorizationInfo &EPI,
                             LoopVectorizationCostModel *CM,
                             GeneratedRTChecks &Check, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                       Check, Plan, EPI.MainLoopVF,
                                       EPI.MainLoopVF, EPI.MainLoopUF) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of VPlan execution).
  BasicBlock *createVectorizedLoopSkeleton() final;

protected:
  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
  /// vector preheader and its predecessor, also connecting the new block to the
  /// scalar preheader.
  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);

  /// Create a check to see if the main vector loop should be executed, given
  /// the vectorization factor \p VF and unroll factor \p UF.
  Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF,
                                   unsigned UF) const;

  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass,
                                      bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
708
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationCostModel *CM,
                                 GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                       Checks, Plan, EPI.EpilogueVF,
                                       EPI.EpilogueVF, EPI.EpilogueUF) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
  BasicBlock *createVectorizedLoopSkeleton() final;

protected:
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
732} // end namespace llvm
733
734/// Look for a meaningful debug location on the instruction or its operands.
735static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
736 if (!I)
737 return DebugLoc::getUnknown();
738
739 DebugLoc Empty;
740 if (I->getDebugLoc() != Empty)
741 return I->getDebugLoc();
742
743 for (Use &Op : I->operands()) {
744 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
745 if (OpInst->getDebugLoc() != Empty)
746 return OpInst->getDebugLoc();
747 }
748
749 return I->getDebugLoc();
750}
751
752/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
753/// is passed, the message relates to that particular instruction.
754#ifndef NDEBUG
755static void debugVectorizationMessage(const StringRef Prefix,
756 const StringRef DebugMsg,
757 Instruction *I) {
758 dbgs() << "LV: " << Prefix << DebugMsg;
759 if (I != nullptr)
760 dbgs() << " " << *I;
761 else
762 dbgs() << '.';
763 dbgs() << '\n';
764}
765#endif
766
767/// Create an analysis remark that explains why vectorization failed
768///
769/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
770/// RemarkName is the identifier for the remark. If \p I is passed it is an
771/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
772/// the location of the remark. If \p DL is passed, use it as debug location for
773/// the remark. \return the remark object that can be streamed to.
774static OptimizationRemarkAnalysis
775createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
776 Instruction *I, DebugLoc DL = {}) {
777 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
778 // If debug location is attached to the instruction, use it. Otherwise if DL
779 // was not provided, use the loop's.
780 if (I && I->getDebugLoc())
781 DL = I->getDebugLoc();
782 else if (!DL)
783 DL = TheLoop->getStartLoc();
784
785 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
786}
787
788namespace llvm {
789
/// Return a value for Step multiplied by VF. \p Ty must be an integer type
/// wide enough to hold the result.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  ElementCount VFxStep = VF.multiplyCoefficientBy(RHS: Step);
  assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
  if (VF.isScalable() && isPowerOf2_64(Value: Step)) {
    // VF and Step are both powers of two here, so VF * Step can be computed
    // as vscale << log2(MinVF * Step). The shift is marked no-unsigned-wrap.
    return B.CreateShl(
        LHS: B.CreateVScale(Ty),
        RHS: ConstantInt::get(Ty, V: Log2_64(Value: VFxStep.getKnownMinValue())), Name: "", HasNUW: true);
  }
  // Fixed VF (or a non-power-of-two step): emit the element count directly.
  return B.CreateElementCount(Ty, EC: VFxStep);
}
803
/// Return the runtime value for VF (a constant for fixed VFs, a
/// vscale-based value for scalable VFs).
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, EC: VF);
}

/// Report a vectorization failure: print \p DebugMsg to the debug stream and
/// emit an analysis remark with tag \p ORETag carrying \p OREMsg, located at
/// \p I if non-null, otherwise at \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  // Hints are only consulted for the remark's pass name here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}
819
/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  // Hints are only consulted for the remark's pass name here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
                             I, DL)
            << Msg);
}
834
835/// Report successful vectorization of the loop. In case an outer loop is
836/// vectorized, prepend "outer" to the vectorization remark.
837static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
838 VectorizationFactor VF, unsigned IC) {
839 LLVM_DEBUG(debugVectorizationMessage(
840 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
841 nullptr));
842 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
843 ORE->emit(RemarkBuilder: [&]() {
844 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
845 TheLoop->getHeader())
846 << "vectorized " << LoopType << "loop (vectorization width: "
847 << ore::NV("VectorizationFactor", VF.Width)
848 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
849 });
850}
851
852} // end namespace llvm
853
854namespace llvm {
855
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired; tail folding is
  // preferred but not mandatory.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize at all.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
878
879/// LoopVectorizationCostModel - estimates the expected speedups due to
880/// vectorization.
881/// In many cases vectorization is not profitable. This can happen because of
882/// a number of reasons. In this class we mainly attempt to predict the
883/// expected speedup/slowdowns due to the supported instruction set. We use the
884/// TargetTransformInfo to query the different backends for the cost of
885/// different operations.
886class LoopVectorizationCostModel {
887 friend class LoopVectorizationPlanner;
888
889public:
  /// Construct the cost model; all analysis results are borrowed, not owned.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE,
                             std::function<BlockFrequencyInfo &()> GetBFI,
                             const Function *F, const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI, bool OptForSize)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
        TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
        OptForSize(OptForSize) {
    // Only bother computing vscale-for-tuning when scalable vectors are
    // possible (by target support or by the force flag).
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    // minsize functions are costed by code size; everything else by
    // reciprocal throughput.
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
  }
908
  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    // Populate the per-VF decision maps before costing.
    collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
    return expectedCost(VF: UserVF).isValid();
  }

  /// \return True if maximizing vector bandwidth is enabled by the target or
  /// user options, for the given register kind.
  bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

  /// \return True if register pressure should be considered for the given VF.
  bool shouldConsiderRegPressureForVF(ElementCount VF);

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
977
  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // collectInstsToScalarize(VF) must have run for this VF already.
    auto Scalars = InstsToScalarize.find(Key: VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(Key: I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(Val: I))
      return false;

    // At VF == 1 every value is trivially uniform.
    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(Val: VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(Ptr: I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // At VF == 1 every value stays scalar by definition.
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(Val: VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(Ptr: I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    // Truncs must truncate at most to their destination type.
    if (isa_and_nonnull<TruncInst>(Val: I) && MinBWs.contains(Key: I) &&
        I->getType()->getScalarSizeInBits() < MinBWs.lookup(Key: I))
      return false;
    // Only worthwhile for vector VFs on instructions that will actually be
    // widened (not scalarized).
    return VF.isVector() && MinBWs.contains(Key: I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1038
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision recorded yet.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Member of an interleave group.
    CM_GatherScatter, // Lowered as a masked gather/scatter.
    CM_Scalarize,     // Replicated per vector lane.
    CM_VectorCall,    // Call a vector function variant (see
                      // CallWideningDecision::Variant).
    CM_IntrinsicCall  // Call a vector intrinsic (see
                      // CallWideningDecision::IID).
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[{I, VF}] = {W, Cost};
  }
1058
1059 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1060 /// interleaving group \p Grp and vector width \p VF.
1061 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1062 ElementCount VF, InstWidening W,
1063 InstructionCost Cost) {
1064 assert(VF.isVector() && "Expected VF >=2");
1065 /// Broadcast this decicion to all instructions inside the group.
1066 /// When interleaving, the cost will only be assigned one instruction, the
1067 /// insert position. For other cases, add the appropriate fraction of the
1068 /// total cost to each instruction. This ensures accurate costs are used,
1069 /// even if the insert position instruction is not used.
1070 InstructionCost InsertPosCost = Cost;
1071 InstructionCost OtherMemberCost = 0;
1072 if (W != CM_Interleave)
1073 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1074 ;
1075 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1076 if (auto *I = Grp->getMember(Index: Idx)) {
1077 if (Grp->getInsertPos() == I)
1078 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1079 else
1080 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1081 }
1082 }
1083 }
1084
1085 /// Return the cost model decision for the given instruction \p I and vector
1086 /// width \p VF. Return CM_Unknown if this instruction did not pass
1087 /// through the cost modeling.
1088 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1089 assert(VF.isVector() && "Expected VF to be a vector VF");
1090 assert(
1091 TheLoop->isInnermost() &&
1092 "cost-model should not be used for outer loops (in VPlan-native path)");
1093
1094 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1095 auto Itr = WideningDecisions.find(Val: InstOnVF);
1096 if (Itr == WideningDecisions.end())
1097 return CM_Unknown;
1098 return Itr->second.first;
1099 }
1100
  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF. The decision must already have been recorded via
  /// setWideningDecision.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// How a call should be lowered at a given VF, together with its cost.
  struct CallWideningDecision {
    // Chosen lowering kind (e.g. CM_VectorCall, CM_IntrinsicCall).
    InstWidening Kind;
    // Vectorized function variant to call, if any.
    Function *Variant;
    // Vector intrinsic to use, if any.
    Intrinsic::ID IID;
    // Mask argument position, or std::nullopt when unmasked.
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  /// Record the call-widening decision for \p CI at vector width \p VF.
  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost};
  }
1126
1127 CallWideningDecision getCallWideningDecision(CallInst *CI,
1128 ElementCount VF) const {
1129 assert(!VF.isScalar() && "Expected vector VF");
1130 auto I = CallWideningDecisions.find(Val: {CI, VF});
1131 if (I == CallWideningDecisions.end())
1132 return {.Kind: CM_Unknown, .Variant: nullptr, .IID: Intrinsic::not_intrinsic, .MaskPos: std::nullopt, .Cost: 0};
1133 return I->second;
1134 }
1135
  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(Val: I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
    Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(i_nocapture: 0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(V: Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect values that will not be widened, including Uniforms, Scalars, and
  /// Instructions to Scalarize for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
    // Do the analysis once. Uniforms is used as the "already analyzed"
    // marker for this VF.
    if (VF.isScalar() || Uniforms.contains(Val: VF))
      return;
    // NOTE: the order matters — later collection steps consume the decisions
    // recorded by the earlier ones.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
    collectInstsToScalarize(VF);
  }
1183
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr. Requires the
  /// access to be consecutive; the target query can be overridden by the
  /// ForceTargetSupportsMaskedMemoryOps flag.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
                          unsigned AddressSpace) const {
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           (ForceTargetSupportsMaskedMemoryOps ||
            TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace));
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr. Requires the
  /// access to be consecutive; the target query can be overridden by the
  /// ForceTargetSupportsMaskedMemoryOps flag.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
                         unsigned AddressSpace) const {
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           (ForceTargetSupportsMaskedMemoryOps ||
            TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace));
  }
1201
1202 /// Returns true if the target machine can represent \p V as a masked gather
1203 /// or scatter operation.
1204 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1205 bool LI = isa<LoadInst>(Val: V);
1206 bool SI = isa<StoreInst>(Val: V);
1207 if (!LI && !SI)
1208 return false;
1209 auto *Ty = getLoadStoreType(I: V);
1210 Align Align = getLoadStoreAlignment(I: V);
1211 if (VF.isVector())
1212 Ty = VectorType::get(ElementType: Ty, EC: VF);
1213 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1214 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1215 }
1216
1217 /// Returns true if the target machine supports all of the reduction
1218 /// variables found for the given VF.
1219 bool canVectorizeReductions(ElementCount VF) const {
1220 return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
1221 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1222 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1223 }));
1224 }
1225
1226 /// Given costs for both strategies, return true if the scalar predication
1227 /// lowering should be used for div/rem. This incorporates an override
1228 /// option so it is not simply a cost comparison.
1229 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1230 InstructionCost SafeDivisorCost) const {
1231 switch (ForceSafeDivisor) {
1232 case cl::BOU_UNSET:
1233 return ScalarCost < SafeDivisorCost;
1234 case cl::BOU_TRUE:
1235 return false;
1236 case cl::BOU_FALSE:
1237 return true;
1238 }
1239 llvm_unreachable("impossible case value");
1240 }
1241
  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF);

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime. The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// A helper function that returns how much we should divide the cost of a
  /// predicated block by. Typically this is the reciprocal of the block
  /// probability, i.e. if we return X we are assuming the predicated block will
  /// execute once for every X iterations of the loop header so the block should
  /// only contribute 1/X of its cost to the total cost calculation, but when
  /// optimizing for code size it will just be 1 as code size costs don't depend
  /// on execution probabilities.
  ///
  /// Note that if a block wasn't originally predicated but was predicated due
  /// to tail folding, the divisor will still be 1 because it will execute for
  /// every iteration of the loop header.
  inline uint64_t
  getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
                          const BasicBlock *BB);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
  std::pair<InstructionCost, InstructionCost>
  getDivRemSpeculationCost(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) const {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to, or null if
  /// it is not part of one.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) const {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1298
  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    // If scalar epilogues are disallowed outright, there is nothing to
    // require.
    if (!isScalarEpilogueAllowed()) {
      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
      return false;
    }
    // If we might exit from anywhere but the latch and early exit vectorization
    // is disabled, we must run the exiting iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
                           "from latch block\n");
      return true;
    }
    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                           "interleaved group requires scalar epilogue\n");
      return true;
    }
    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
    return false;
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if tail-folding is preferred over a scalar epilogue.
  bool preferPredicatedLoop() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
           ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
  }

  /// Returns the TailFoldingStyle that is best for the current loop.
  /// Valid only after setTailFoldingStyle has run.
  TailFoldingStyle getTailFoldingStyle() const {
    return ChosenTailFoldingStyle;
  }
1339
  /// Selects and saves TailFoldingStyle. Must be called exactly once, before
  /// getTailFoldingStyle is consulted.
  /// \param IsScalableVF true if scalable vector factors enabled.
  /// \param UserIC User specific interleave count.
  void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC) {
    assert(ChosenTailFoldingStyle == TailFoldingStyle::None &&
           "Tail folding must not be selected yet.");
    // None doubles as the "tail folding impossible" sentinel.
    if (!Legal->canFoldTailByMasking()) {
      ChosenTailFoldingStyle = TailFoldingStyle::None;
      return;
    }

    // Default to TTI preference, but allow command line override.
    ChosenTailFoldingStyle = TTI.getPreferredTailFoldingStyle();
    if (ForceTailFoldingStyle.getNumOccurrences())
      ChosenTailFoldingStyle = ForceTailFoldingStyle.getValue();

    if (ChosenTailFoldingStyle != TailFoldingStyle::DataWithEVL)
      return;
    // Override EVL styles if needed.
    // FIXME: Investigate opportunity for fixed vector factor.
    bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
    if (EVLIsLegal)
      return;
    // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
    // if it's allowed, or DataWithoutLaneMask otherwise.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
      ChosenTailFoldingStyle = TailFoldingStyle::None;
    else
      ChosenTailFoldingStyle = TailFoldingStyle::DataWithoutLaneMask;

    LLVM_DEBUG(
        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
                  "not try to generate VP Intrinsics "
               << (UserIC > 1
                       ? "since interleave count specified is greater than 1.\n"
                       : "due to non-interleaving reasons.\n"));
  }
1379
  /// Returns true if all loop blocks should be masked to fold tail loop.
  /// Any style other than None implies tail folding.
  bool foldTailByMasking() const {
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }
1384
1385 /// Returns true if the use of wide lane masks is requested and the loop is
1386 /// using tail-folding with a lane mask for control flow.
1387 bool useWideActiveLaneMask() const {
1388 if (!EnableWideActiveLaneMask)
1389 return false;
1390
1391 return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow;
1392 }
1393
  /// Return maximum safe number of elements to be processed per vector
  /// iteration, which do not prevent store-load forwarding and are safe with
  /// regard to the memory dependencies. Required for EVL-based VPlans to
  /// correctly calculate AVL (application vector length) as min(remaining AVL,
  /// MaxSafeElements).
  /// TODO: need to consider adjusting cost model to use this value as a
  /// vectorization factor for EVL-based vectorization.
  std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }

  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Returns true if VP intrinsics with explicit vector length support should
  /// be generated in the tail folded loop.
  bool foldTailWithEVL() const {
    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
  }

  /// Returns true if the Phi is part of an inloop reduction (collected by
  /// collectInLoopReductions).
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Ptr: Phi);
  }

  /// Returns the set of in-loop reduction PHIs.
  const SmallPtrSetImpl<PHINode *> &getInLoopReductions() const {
    return InLoopReductions;
  }
1425
  /// Returns true if the predicated reduction select should be used to set the
  /// incoming value for the reduction phi.
  bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const {
    // Force to use predicated reduction select since the EVL of the
    // second-to-last iteration might not be VF*UF.
    if (foldTailWithEVL())
      return true;

    // Note: For FindLast recurrences we prefer a predicated select to simplify
    // matching in handleFindLastReductions(), rather than handle multiple
    // cases.
    if (RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RecurrenceKind))
      return true;

    // Otherwise defer to the command-line flag or the target preference.
    return PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect();
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;

  /// Invalidates decisions already taken by the cost model. Clears all
  /// per-VF decision maps so the analysis can be redone from scratch.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1461
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  InstructionCost expectedCost(ElementCount VF);

  /// Returns true if the cost model has recorded any predicated stores.
  bool hasPredStores() const { return NumPredStores > 0; }

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is the interleave count chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF,
                                         const unsigned IC) const;

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
                                                         ElementCount VF,
                                                         Type *VectorTy) const;

  /// Returns true if \p Op should be considered invariant and if it is
  /// trivially hoistable.
  bool shouldConsiderInvariant(Value *Op);

  /// Return the value of vscale used for tuning the cost model, if one was
  /// computed (see initializeVScaleForTuning).
  std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1494
private:
  /// Number of predicated stores found in the loop; queried via
  /// hasPredStores().
  unsigned NumPredStores = 0;

  /// Used to store the value of vscale used for tuning the cost model. It is
  /// initialized during object construction.
  std::optional<unsigned> VScaleForTuning;
1502 /// Initializes the value of vscale used for tuning the cost model. If
1503 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1504 /// return the value returned by the corresponding TTI method.
1505 void initializeVScaleForTuning() {
1506 const Function *Fn = TheLoop->getHeader()->getParent();
1507 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1508 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1509 auto Min = Attr.getVScaleRangeMin();
1510 auto Max = Attr.getVScaleRangeMax();
1511 if (Max && Min == Max) {
1512 VScaleForTuning = Max;
1513 return;
1514 }
1515 }
1516
1517 VScaleForTuning = TTI.getVScaleForTuning();
1518 }
1519
  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF, unsigned UserIC,
                                           bool FoldTailByMasking);

  /// If \p VF * \p UserIC > MaxTripCount, clamps VF to the next lower VF that
  /// results in VF * UserIC <= MaxTripCount.
  ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
                                     unsigned UserIC,
                                     bool FoldTailByMasking) const;

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF, unsigned UserIC,
                                       bool FoldTailByMasking);

  /// Checks if scalable vectorization is supported and enabled. Caches the
  /// result to avoid repeated debug dumps for repeated queries.
  bool isScalableVectorizationAllowed();

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction \p I.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for the interleaving group of memory instructions
  /// containing \p I.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction \p I.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;
1583
  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block, keyed by vectorization factor.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// Control finally chosen tail folding style.
  TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;

  /// true if scalable vectorization is supported and enabled. Unset until
  /// computed (and cached) by isScalableVectorizationAllowed().
  std::optional<bool> IsScalableVectorizationAllowed;

  /// Maximum safe number of elements to be processed per vector iteration,
  /// which do not prevent store-load forwarding and are safe with regard to the
  /// memory dependencies. Required for EVL-based vectorization, where this
  /// value is used as the upper bound of the safe AVL.
  std::optional<unsigned> MaxSafeElements;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1640
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// Cached widening decisions for (memory instruction, VF) pairs.
  DecisionList WideningDecisions;

  /// Keeps the widening decision and associated costs for call instructions,
  /// per (call, VF) pair.
  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  CallDecisionList CallWideningDecisions;
1681
  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    // No extract is needed for scalar VFs, non-instruction values, values
    // defined outside the loop or loop-invariant, or anything that will
    // itself be scalarized (including calls decided to be scalarized).
    if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
        TheLoop->isLoopInvariant(V: I) ||
        getWideningDecision(I, VF) == CM_Scalarize ||
        (isa<CallInst>(Val: I) &&
         getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
  };
1701
1702 /// Returns a range containing only operands needing to be extracted.
1703 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1704 ElementCount VF) const {
1705
1706 SmallPtrSet<const Value *, 4> UniqueOperands;
1707 SmallVector<Value *, 4> Res;
1708 for (Value *Op : Ops) {
1709 if (isa<Constant>(Val: Op) || !UniqueOperands.insert(Ptr: Op).second ||
1710 !needsExtract(V: Op, VF))
1711 continue;
1712 Res.push_back(Elt: Op);
1713 }
1714 return Res;
1715 }
1716
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
  /// unless necessary, e.g. when the loop isn't legal to vectorize or when
  /// there is no predication.
  std::function<BlockFrequencyInfo &()> GetBFI;
  /// The BlockFrequencyInfo returned from GetBFI.
  BlockFrequencyInfo *BFI = nullptr;
  /// Returns the BlockFrequencyInfo for the function if cached, otherwise
  /// fetches it via GetBFI. Avoids an indirect call to the std::function.
  BlockFrequencyInfo &getBFI() {
    if (!BFI)
      BFI = &GetBFI();
    return *BFI;
  }

  /// The function under consideration (presumably the parent of TheLoop; not
  /// set in this view).
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// The kind of cost that we are calculating
  TTI::TargetCostKind CostKind;

  /// Whether this loop should be optimized for size based on function attribute
  /// or profile information.
  bool OptForSize;

  /// The highest VF possible for this loop, without using MaxBandwidth.
  FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
1786};
1787} // end namespace llvm
1788
1789namespace {
1790/// Helper struct to manage generating runtime checks for vectorization.
1791///
1792/// The runtime checks are created up-front in temporary blocks to allow better
1793/// estimating the cost and un-linked from the existing IR. After deciding to
1794/// vectorize, the checks are moved back. If deciding not to vectorize, the
1795/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr no SCEV checks have been generated.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr no memory runtime checks have been generated.
  Value *MemRuntimeCheckCond = nullptr;

  /// Analyses kept up to date while the temporary check blocks are created
  /// and unlinked.
  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  /// Separate expanders for the SCEV predicate and the memory runtime checks,
  /// so each set of expanded instructions can be cleaned up independently.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  /// Set when the number of runtime pointer checks exceeds
  /// VectorizeMemoryCheckThreshold; no check blocks are created in that case
  /// and getCost() reports an invalid cost.
  bool CostTooHigh = false;

  /// Parent loop of the vectorized loop, if any. Used by getCost() to
  /// estimate whether hoisted memory checks amortize over outer iterations.
  Loop *OuterLoop = nullptr;

  PredicatedScalarEvolution &PSE;

  /// The kind of cost that we are calculating
  TTI::TargetCostKind CostKind;

public:
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    LoopInfo *LI, TargetTransformInfo *TTI,
                    TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI),
        SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        PSE(PSE), CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
              OptimizationRemarkEmitter &ORE) {

    // Hard cutoff to limit compile-time increase in case a very large number of
    // runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh) {
      // Mark runtime checks as never succeeding when they exceed the threshold.
      MemRuntimeCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      SCEVCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      ORE.emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: too many memory checks needed";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      return;
    }

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
                                  MSSAU: nullptr, BBName: "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
      if (isa<Constant>(Val: SCEVCheckCond)) {
        // Clean up directly after expanding the predicate to a constant, to
        // avoid further expansions re-using anything left over from SCEVExp.
        SCEVExpanderCleaner SCEVCleaner(SCEVExp);
        SCEVCleaner.cleanup();
      }
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Place the memory checks after the SCEV checks (if any).
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
                                 BBName: "vector.memcheck");

      // Prefer the cheaper diff-based checks when available; the runtime VF is
      // computed lazily and reused across checks.
      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
            GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
            Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    SCEVExp.eraseDeadInstructions(Root: SCEVCheckCond);

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(V: Preheader);

    // Move each check block's terminator back into the preheader and terminate
    // the now-unlinked check block with an unreachable placeholder.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(BB: MemCheckBlock);
      LI->removeBlock(BB: MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(BB: SCEVCheckBlock);
      LI->removeBlock(BB: SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  /// Compute the cost of the runtime checks generated by create(). Returns an
  /// invalid cost when the number of checks exceeded the threshold.
  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        // The terminator is not part of the check's cost.
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
        if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
            if (EstimatedTC->isFixed())
              BestTripCount = EstimatedTC->getFixedValue();

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
                                     b: (InstructionCost::CostType)1);

          if (BestTripCount > 1)
            LLVM_DEBUG(dbgs()
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A check block with predecessors has been linked back into the IR and is
    // in use; otherwise it is dead and must be cleaned up.
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(I: &I))
          continue;
        SE.forgetValue(V: &I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }

  /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
  /// outside VPlan. Returns {nullptr, nullptr} when no checks were generated
  /// or the condition folded to a known-false constant.
  std::pair<Value *, BasicBlock *> getSCEVChecks() const {
    using namespace llvm::PatternMatch;
    if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};

    return {SCEVCheckCond, SCEVCheckBlock};
  }

  /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
  /// outside VPlan. Returns {nullptr, nullptr} when the condition folded to a
  /// known-false constant.
  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
    using namespace llvm::PatternMatch;
    if (MemRuntimeCheckCond && match(V: MemRuntimeCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};
    return {MemRuntimeCheckCond, MemCheckBlock};
  }

  /// Return true if any runtime checks have been added
  bool hasChecks() const {
    return getSCEVChecks().first || getMemRuntimeChecks().first;
  }
};
2089} // namespace
2090
2091static bool useActiveLaneMask(TailFoldingStyle Style) {
2092 return Style == TailFoldingStyle::Data ||
2093 Style == TailFoldingStyle::DataAndControlFlow;
2094}
2095
// Returns true if \p Style uses the active lane mask for loop control flow
// in addition to masking the data operations.
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::DataAndControlFlow;
}
2099
2100// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2101// vectorization. The loop needs to be annotated with #pragma omp simd
2102// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2103// vector length information is not provided, vectorization is not considered
2104// explicit. Interleave hints are not allowed either. These limitations will be
2105// relaxed in the future.
2106// Please, note that we are currently forced to abuse the pragma 'clang
2107// vectorize' semantics. This pragma provides *auto-vectorization hints*
2108// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2109// provides *explicit vectorization hints* (LV can bypass legal checks and
2110// assume that vectorization is legal). However, both hints are implemented
2111// using the same metadata (llvm.loop.vectorize, processed by
2112// LoopVectorizeHints). This will be fixed in the future when the native IR
2113// representation for pragma 'omp simd' is introduced.
2114static bool isExplicitVecOuterLoop(Loop *OuterLp,
2115 OptimizationRemarkEmitter *ORE) {
2116 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2117 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2118
2119 // Only outer loops with an explicit vectorization hint are supported.
2120 // Unannotated outer loops are ignored.
2121 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2122 return false;
2123
2124 Function *Fn = OuterLp->getHeader()->getParent();
2125 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2126 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2127 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2128 return false;
2129 }
2130
2131 if (Hints.getInterleave() > 1) {
2132 // TODO: Interleave support is future work.
2133 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2134 "outer loops.\n");
2135 Hints.emitRemarkWithHints();
2136 return false;
2137 }
2138
2139 return true;
2140}
2141
2142static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2143 OptimizationRemarkEmitter *ORE,
2144 SmallVectorImpl<Loop *> &V) {
2145 // Collect inner loops and outer loops without irreducible control flow. For
2146 // now, only collect outer loops that have explicit vectorization hints. If we
2147 // are stress testing the VPlan H-CFG construction, we collect the outermost
2148 // loop of every loop nest.
2149 if (L.isInnermost() || VPlanBuildStressTest ||
2150 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2151 LoopBlocksRPO RPOT(&L);
2152 RPOT.perform(LI);
2153 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2154 V.push_back(Elt: &L);
2155 // TODO: Collect inner loops inside marked outer loops in case
2156 // vectorization fails for the outer loop. Do not invoke
2157 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2158 // already known to be reducible. We can use an inherited attribute for
2159 // that.
2160 return;
2161 }
2162 }
2163 for (Loop *InnerL : L)
2164 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2165}
2166
2167//===----------------------------------------------------------------------===//
2168// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2169// LoopVectorizationCostModel and LoopVectorizationPlanner.
2170//===----------------------------------------------------------------------===//
2171
/// Emit IR that computes the value of the given induction at \p Index:
///   - integer inductions: StartValue + Index * Step,
///   - pointer inductions: StartValue ptradd (Index * Step),
///   - FP inductions:      StartValue FAdd/FSub (Step * Index), using the
///     opcode of \p InductionBinOp.
/// Returns nullptr for IK_NoInduction.
///
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
Value *
llvm::emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                           Value *Step,
                           InductionDescriptor::InductionKind InductionKind,
                           const BinaryOperator *InductionBinOp) {
  using namespace llvm::PatternMatch;
  // Bring the index to the step's type (sext/trunc for integers, SIToFP for
  // FP inductions).
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (match(V: X, P: m_ZeroInt()))
      return Y;
    if (match(V: Y, P: m_ZeroInt()))
      return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (match(V: X, P: m_One()))
      return Y;
    if (match(V: Y, P: m_One()))
      return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step of -1 folds to a plain subtraction.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2250
2251static std::optional<unsigned> getMaxVScale(const Function &F,
2252 const TargetTransformInfo &TTI) {
2253 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2254 return MaxVScale;
2255
2256 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2257 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2258
2259 return std::nullopt;
2260}
2261
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  // The widest induction type determines the largest trip count representable
  // by the vector loop's induction variable.
  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = IdxTy->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // Scalable VFs must be bounded with the maximum value of vscale; if
      // that is unknown, conservatively report that overflow is possible.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // No overflow iff the headroom above TC strictly exceeds the largest
    // possible step VF * UF.
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2293
2294// Return whether we allow using masked interleave-groups (for dealing with
2295// strided loads/stores that reside in predicated blocks, or for dealing
2296// with gaps).
2297static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2298 // If an override option has been passed in for interleaved accesses, use it.
2299 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2300 return EnableMaskedInterleavedMemAccesses;
2301
2302 return TTI.enableMaskedInterleavedAccessVectorization();
2303}
2304
/// Wrap the IR block \p CheckIRBB (containing an already-generated runtime
/// check) in a VPIRBasicBlock and splice it into the VPlan between the vector
/// preheader's predecessor and the vector preheader, with the scalar
/// preheader as its failure successor.
void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan(
    BasicBlock *CheckIRBB) {
  // Note: The block with the minimum trip-count check is already connected
  // during earlier VPlan construction.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
  assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
  assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
  // Insert the check block on the edge into the vector preheader, then give
  // it the scalar preheader as second successor and swap so the scalar
  // preheader comes first, matching the invariant asserted above.
  VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB);
  VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPBB, BlockPtr: CheckVPIRBB);
  PreVectorPH = CheckVPIRBB;
  VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH);
  PreVectorPH->swapSuccessors();

  // We just connected a new block to the scalar preheader. Update all
  // VPPhis by adding an incoming value for it, replicating the last value.
  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
  for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) {
    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
           "must have incoming values for all operands");
    R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2));
  }
}
2329
/// Emit the minimum-iteration-count check for the main vector loop into
/// \p VectorPH and return its boolean result: true when the scalar loop must
/// be taken because fewer than max(VF * UF, MinProfitableTripCount)
/// iterations remain (or when tail-folding makes the check unnecessary, a
/// constant false).
Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
    BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
                                                    : ICmpInst::ICMP_ULT;

  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = VectorPH;
  // Use an InstSimplifyFolder so trivially-foldable comparisons are
  // simplified as they are built.
  IRBuilder<InstSimplifyFolder> Builder(
      TCCheckBlock->getContext(),
      InstSimplifyFolder(TCCheckBlock->getDataLayout()));
  Builder.SetInsertPoint(TCCheckBlock->getTerminator());

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *Count = getTripCount();
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);

    Value *MinProfTC =
        Builder.CreateElementCount(Ty: CountTy, EC: MinProfitableTripCount);
    if (!VF.isScalable())
      return MinProfTC;
    // For scalable VFs the runtime step may exceed the static minimum
    // profitable trip count, so take the maximum of the two.
    return Builder.CreateBinaryIntrinsic(
        ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
    } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P),
                                    LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  }

  return CheckMinIters;
}
2387
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
/// predecessors and successors of VPBB, if any, are rewired to the new
/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
                                             BasicBlock *IRBB,
                                             VPlan *Plan = nullptr) {
  if (!Plan)
    Plan = VPBB->getPlan();
  VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
  // Move the phis first so they stay at the beginning of the new block,
  // ahead of any recipes the VPIRBasicBlock already contains.
  auto IP = IRVPBB->begin();
  for (auto &R : make_early_inc_range(Range: VPBB->phis()))
    R.moveBefore(BB&: *IRVPBB, I: IP);

  // Then append all remaining (non-phi) recipes, preserving their order.
  for (auto &R :
       make_early_inc_range(Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end())))
    R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());

  VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  return IRVPBB;
}
2410
/// Split the original loop preheader at its terminator to create a new,
/// empty scalar preheader (named "<Prefix>scalar.ph"); the upper half of the
/// original preheader remains the vector preheader. Returns the new scalar
/// preheader block.
BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  assert(VectorPH && "Invalid loop structure");
  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
         "loops not exiting via the latch without required epilogue?");

  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
  // wrapping the newly created scalar preheader here at the moment, because the
  // Plan's scalar preheader may be unreachable at this point. Instead it is
  // replaced in executePlan.
  return SplitBlock(Old: VectorPH, SplitPt: VectorPH->getTerminator(), DT, LI, MSSAU: nullptr,
                    BBName: Twine(Prefix) + "scalar.ph");
}
2425
2426/// Knowing that loop \p L executes a single vector iteration, add instructions
2427/// that will get simplified and thus should not have any cost to \p
2428/// InstsToIgnore.
2429static void addFullyUnrolledInstructionsToIgnore(
2430 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2431 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2432 auto *Cmp = L->getLatchCmpInst();
2433 if (Cmp)
2434 InstsToIgnore.insert(Ptr: Cmp);
2435 for (const auto &KV : IL) {
2436 // Extract the key by hand so that it can be used in the lambda below. Note
2437 // that captured structured bindings are a C++20 extension.
2438 const PHINode *IV = KV.first;
2439
2440 // Get next iteration value of the induction variable.
2441 Instruction *IVInst =
2442 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2443 if (all_of(Range: IVInst->users(),
2444 P: [&](const User *U) { return U == IV || U == Cmp; }))
2445 InstsToIgnore.insert(Ptr: IVInst);
2446 }
2447}
2448
2449BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2450 // Create a new IR basic block for the scalar preheader.
2451 BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
2452 return ScalarPH->getSinglePredecessor();
2453}
2454
namespace {

/// DenseMapInfo that keys instructions by structural identity rather than by
/// pointer: instructions hash on opcode and operands and compare with
/// isIdenticalTo(), enabling the simple CSE in legacyCSE() below.
struct CSEDenseMapInfo {
  /// Only these instruction kinds participate in the CSE.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
           isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(args: I->getOpcode(),
                        args: hash_combine_range(R: I->operand_values()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys may not be dereferenced; they only compare equal by
    // pointer identity.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(I: RHS);
  }
};

} // end anonymous namespace
2486
2487/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2488/// removal, in favor of the VPlan-based one.
2489static void legacyCSE(BasicBlock *BB) {
2490 // Perform simple cse.
2491 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2492 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2493 if (!CSEDenseMapInfo::canHandle(I: &In))
2494 continue;
2495
2496 // Check if we can replace this instruction with any of the
2497 // visited instructions.
2498 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2499 In.replaceAllUsesWith(V);
2500 In.eraseFromParent();
2501 continue;
2502 }
2503
2504 CSEMap[&In] = &In;
2505 }
2506}
2507
2508/// This function attempts to return a value that represents the ElementCount
2509/// at runtime. For fixed-width VFs we know this precisely at compile
2510/// time, but for scalable VFs we calculate it based on an estimate of the
2511/// vscale value.
2512static unsigned estimateElementCount(ElementCount VF,
2513 std::optional<unsigned> VScale) {
2514 unsigned EstimatedVF = VF.getKnownMinValue();
2515 if (VF.isScalable())
2516 if (VScale)
2517 EstimatedVF *= *VScale;
2518 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2519 return EstimatedVF;
2520}
2521
/// Return the cost of the call \p CI at vectorization factor \p VF. For
/// vector VFs this is the pre-computed widening-decision cost; for scalar VF
/// it is the cheaper of the scalar call and, when available, the equivalent
/// intrinsic (or a matched reduction-pattern cost for fmuladd).
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return getCallWideningDecision(CI, VF).Cost;

  Type *RetTy = CI->getType();
  // An fmuladd matching a reduction pattern may have a cheaper pre-computed
  // reduction cost; prefer that if present.
  if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
    if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
      return *RedCost;

  // Collect argument types to cost the plain scalar call.
  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(Elt: ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(a: ScalarCallCost, b: IntrinsicCost);
  }
  return ScalarCallCost;
}
2549
2550static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2551 if (VF.isScalar() || !canVectorizeTy(Ty))
2552 return Ty;
2553 return toVectorizedTy(Ty, EC: VF);
2554}
2555
2556InstructionCost
2557LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2558 ElementCount VF) const {
2559 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2560 assert(ID && "Expected intrinsic call!");
2561 Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
2562 FastMathFlags FMF;
2563 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
2564 FMF = FPMO->getFastMathFlags();
2565
2566 SmallVector<const Value *> Arguments(CI->args());
2567 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2568 SmallVector<Type *> ParamTys;
2569 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
2570 result: std::back_inserter(x&: ParamTys),
2571 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2572
2573 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2574 dyn_cast<IntrinsicInst>(Val: CI),
2575 InstructionCost::getInvalid());
2576 return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
2577}
2578
/// Post-process the vectorized loop: hook up widened non-induction PHI
/// operands, then run the legacy CSE over the vector loop header (when a
/// vector loop remains) to remove redundant induction instructions.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  fixNonInductionPHIs(State);

  // Don't apply optimizations below when no (vector) loop remains, as they all
  // require one at the moment.
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
  if (!HeaderVPBB)
    return;

  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];

  // Remove redundant induction instructions.
  legacyCSE(BB: HeaderBB);
}
2595
/// Populate the incoming values/blocks of every widened (VPWidenPHIRecipe)
/// PHI in the plan. This is deferred until after code generation because the
/// incoming values may be defined after the PHI itself.
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      // Only widened non-induction PHIs need their operands wired up here.
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
        NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
    }
  }
}
2611
/// Collect, for vectorization factor \p VF, the set of instructions that will
/// remain scalar after vectorization: uniform instructions, pointers feeding
/// scalar memory accesses, forced scalars, and induction variables whose
/// users all remain scalar. Results are stored in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert_range(R&: Uniforms[VF]);
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a getelementptr
  // instruction contained in the loop.
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert_range(R&: Uniforms[VF]);

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar unless the operation is a gather or scatter.
  // The value operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // A pointer is scalar only if no use of it was possibly non-scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    // Src is scalar if every user is outside the loop, already scalar, or a
    // load/store using Src as a scalar pointer.
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  IsScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert_range(R&: Worklist);
}
2800
/// Return true if \p I must be scalarized with predication at vectorization
/// factor \p VF, i.e. it needs predication and no non-scalar (masked or
/// safe-divisor) lowering is available.
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                         ElementCount VF) {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    // A call can avoid scalarization if it has a vector widening decision.
    if (VF.isScalar())
      return true;
    return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    // Memory accesses can avoid scalarization via masked load/store or
    // masked gather/scatter, when legal for the target.
    auto *Ptr = getLoadStorePointerOperand(V: I);
    auto *Ty = getLoadStoreType(I);
    unsigned AS = getLoadStoreAddressSpace(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(ElementType: Ty, EC: VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                                TTI.isLegalMaskedGather(DataType: VTy, Alignment))
                           : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                               TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
2841
// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
/// Return true if \p I must execute under a mask in the vectorized loop,
/// either because it was conditionally executed in the original loop or
/// because tail-folding introduces a mask whose inactive lanes would make
/// executing \p I unmasked unsafe.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // TODO: We can use the loop-preheader as context point here and get
  // context sensitive reasoning for isSafeToSpeculativelyExecute.
  // Speculatable instructions, unmasked memory/call instructions, and
  // control-flow/phi/alloca instructions never need predication.
  if (isSafeToSpeculativelyExecute(I) ||
      (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) ||
      isa<UncondBrInst, CondBrInst, SwitchInst, PHINode, AllocaInst>(Val: I))
    return false;

  // If the instruction was executed conditionally in the original scalar loop,
  // predication is needed with a mask whose lanes are all possibly inactive.
  if (Legal->blockNeedsPredication(BB: I->getParent()))
    return true;

  // If we're not folding the tail by masking, predication is unnecessary.
  if (!foldTailByMasking())
    return false;

  // All that remain are instructions with side-effects originally executed in
  // the loop unconditionally, but now execute under a tail-fold mask (only)
  // having at least one active lane (the first). If the side-effects of the
  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
  // - it will cause the same side-effects as when masked.
  switch(I->getOpcode()) {
  default:
    llvm_unreachable(
        "instruction should have been considered by earlier checks");
  case Instruction::Call:
    // Side-effects of a Call are assumed to be non-invariant, needing a
    // (fold-tail) mask.
    assert(Legal->isMaskRequired(I) &&
           "should have returned earlier for calls not needing a mask");
    return true;
  case Instruction::Load:
    // If the address is loop invariant no predication is needed.
    return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
  case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads), but also must prove the value being stored
    // is correct. The easiest form of the later is to require that all values
    // stored are the same.
    return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
             TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
  }
  case Instruction::UDiv:
  case Instruction::URem:
    // If the divisor is loop-invariant no predication is needed.
    return !Legal->isInvariant(V: I->getOperand(i: 1));
  case Instruction::SDiv:
  case Instruction::SRem:
    // Conservative for now, since masked-off lanes may be poison and could
    // trigger signed overflow.
    return true;
  }
}
2897
2898uint64_t LoopVectorizationCostModel::getPredBlockCostDivisor(
2899 TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
2900 if (CostKind == TTI::TCK_CodeSize)
2901 return 1;
2902 // If the block wasn't originally predicated then return early to avoid
2903 // computing BlockFrequencyInfo unnecessarily.
2904 if (!Legal->blockNeedsPredication(BB))
2905 return 1;
2906
2907 uint64_t HeaderFreq =
2908 getBFI().getBlockFreq(BB: TheLoop->getHeader()).getFrequency();
2909 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2910 assert(HeaderFreq >= BBFreq &&
2911 "Header has smaller block freq than dominated BB?");
2912 return std::round(x: (double)HeaderFreq / BBFreq);
2913}
2914
/// Return the pair (scalarization cost, safe-divisor cost) for lowering the
/// predicated division/remainder \p I at vectorization factor \p VF. The
/// scalarization cost is invalid for scalable VFs, where scalarization is
/// not legal.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost +=
        VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost +=
        VF.getFixedValue() *
        TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost =
        ScalarizationCost / getPredBlockCostDivisor(CostKind, BB: I->getParent());
  }

  InstructionCost SafeDivisorCost = 0;
  auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);
  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost +=
      TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
                             CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
                             VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // The unpredicated vector div/rem itself, fed by the guarded divisor.
  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      Opcode: I->getOpcode(), Ty: VecTy, CostKind,
      Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Args: Operands, CxtI: I);
  return {ScalarizationCost, SafeDivisorCost};
}
2972
// Returns true if the interleave group containing \p I can be executed as a
// set of wide (possibly masked) loads/stores for the given VF, i.e. without
// scalarizing the group's members. The checks are ordered: irregular types
// and oversized scalable factors bail out first, then non-integral pointer
// mixing, and finally masking legality when predication or gaps require it.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) const {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Must have a group.");
  unsigned InterleaveFactor = Group->getFactor();

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(Ty: ScalarTy, DL))
    return false;

  // For scalable vectors, the interleave factors must be <= 8 since we require
  // the (de)interleaveN intrinsics instead of shufflevectors.
  if (VF.isScalable() && InterleaveFactor > 8)
    return false;

  // If the group involves a non-integral pointer, we may not be able to
  // losslessly cast all values to a common type. Compare every member of the
  // group against the insertion point's type.
  bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
  for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
    // Groups may have gaps; a missing member at this index is fine here.
    Instruction *Member = Group->getMember(Index: Idx);
    if (!Member)
      continue;
    auto *MemberTy = getLoadStoreType(I: Member);
    bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
    // Don't coerce non-integral pointers to integers or vice versa.
    if (MemberNI != ScalarNI)
      // TODO: Consider adding special nullptr value case here
      return false;
    // Two non-integral pointers must also live in the same address space.
    if (MemberNI && ScalarNI &&
        ScalarTy->getPointerAddressSpace() !=
            MemberTy->getPointerAddressSpace())
      return false;
  }

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(Val: I) && !Group->isFull();
  // If no masking is needed at all, the group can be widened unconditionally.
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  // Reversed masked interleave groups are not supported.
  if (Group->isReverse())
    return false;

  // TODO: Support interleaved access that requires a gap mask for scalable VFs.
  bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
                          StoreAccessWithGapsRequiresMasking;
  if (VF.isScalable() && NeedsMaskForGaps)
    return false;

  // Finally, masking is needed and legal only if the target supports masked
  // loads/stores of this type, alignment, and address space.
  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
                       : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
}
3052
3053bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3054 Instruction *I, ElementCount VF) {
3055 // Get and ensure we have a valid memory instruction.
3056 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3057
3058 auto *Ptr = getLoadStorePointerOperand(V: I);
3059 auto *ScalarTy = getLoadStoreType(I);
3060
3061 // In order to be widened, the pointer should be consecutive, first of all.
3062 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3063 return false;
3064
3065 // If the instruction is a store located in a predicated block, it will be
3066 // scalarized.
3067 if (isScalarWithPredication(I, VF))
3068 return false;
3069
3070 // If the instruction's allocated size doesn't equal it's type size, it
3071 // requires padding and will be scalarized.
3072 auto &DL = I->getDataLayout();
3073 if (hasIrregularType(Ty: ScalarTy, DL))
3074 return false;
3075
3076 return true;
3077}
3078
// Collect the instructions that remain uniform-after-vectorization for the
// given vector VF and cache them in Uniforms[VF]. "Uniform" here means only
// lane 0 of the unrolled iterations is demanded; it does NOT mean all lanes
// produce the same value. The algorithm seeds a worklist with known-uniform
// roots (exit compares, uniform memory ops), then expands it backwards in
// topological order, and finally handles induction phi cycles separately.
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again; Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  SmallVector<BasicBlock *> Exiting;
  TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
    if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  // Uniformity is monotone in VF: use the cached result for the next-smaller
  // VF to prune the check below.
  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  // Return true if the memory instruction \p I will be executed as a single
  // (widened, interleaved, or uniform) operation rather than scalarized.
  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        // Side-effect-only intrinsics with invariant operands need just one
        // instance per vector iteration.
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // If the pointer can be proven to be uniform, always add it to the
      // worklist.
      if (isa<Instruction>(Val: Ptr) && Legal->isUniform(V: Ptr, VF))
        AddToWorklistIfAllowed(cast<Instruction>(Val: Ptr));

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
      auto *UI = cast<Instruction>(Val: U);
      return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert_range(R&: Worklist);
}
3318
3319bool LoopVectorizationCostModel::runtimeChecksRequired() {
3320 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3321
3322 if (Legal->getRuntimePointerChecking()->Need) {
3323 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3324 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3325 "loop with '#pragma clang loop vectorize(enable)' when "
3326 "compiling with -Os/-Oz",
3327 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3328 return true;
3329 }
3330
3331 if (!PSE.getPredicate().isAlwaysTrue()) {
3332 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3333 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3334 "loop with '#pragma clang loop vectorize(enable)' when "
3335 "compiling with -Os/-Oz",
3336 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3337 return true;
3338 }
3339
3340 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3341 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3342 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3343 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3344 "this loop without such check by compiling with -Os/-Oz",
3345 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3346 return true;
3347 }
3348
3349 return false;
3350}
3351
// Determine whether scalable vectorization may be attempted for this loop,
// caching the answer in IsScalableVectorizationAllowed. Checks target
// support, user hints, reduction/element-type legality at the maximal
// scalable VF, and the availability of a max vscale for safe-distance
// analysis. Emits an informational remark for each reason it is disallowed.
bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
  // Return the cached answer if we have been here before.
  if (IsScalableVectorizationAllowed)
    return *IsScalableVectorizationAllowed;

  // Pessimistically cache "false"; flipped to true only if all checks pass.
  IsScalableVectorizationAllowed = false;
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return false;

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
                            ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
    return false;
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(VF: MaxScalableVF)) {
    reportVectorizationInfo(
        Msg: "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  // Without a known max vscale, a safe dependence distance cannot be bounded
  // for scalable VFs; getMaxLegalScalableVF relies on this rejection.
  if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
    reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
                            "for safe distance analysis.",
                            ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  IsScalableVectorizationAllowed = true;
  return true;
}
3408
3409ElementCount
3410LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3411 if (!isScalableVectorizationAllowed())
3412 return ElementCount::getScalable(MinVal: 0);
3413
3414 auto MaxScalableVF = ElementCount::getScalable(
3415 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3416 if (Legal->isSafeForAnyVectorWidth())
3417 return MaxScalableVF;
3418
3419 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3420 // Limit MaxScalableVF by the maximum safe dependence distance.
3421 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
3422
3423 if (!MaxScalableVF)
3424 reportVectorizationInfo(
3425 Msg: "Max legal vector width too small, scalable vectorization "
3426 "unfeasible.",
3427 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3428
3429 return MaxScalableVF;
3430}
3431
// Computes the maximum feasible fixed and scalable VFs, bounded by the
// dependence-distance limits from LAA, store-load forwarding distance, any
// user-specified VF (which is honored when safe, clamped or ignored with a
// remark when not), and the target's register/trip-count constraints.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
    bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElementsPowerOf2 =
      bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
  // Additionally clamp by the store-load forwarding distance, if restricted.
  if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
    MaxSafeElementsPowerOf2 =
        std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
  }
  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);

  // Record the element limit only when some vector width is actually unsafe.
  if (!Legal->isSafeForAnyVectorWidth())
    this->MaxSafeElements = MaxSafeElementsPowerOf2;

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Scalable UserVF that is unsafe or unsupported: emit the appropriate
    // remark and fall through to the target-based computation below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Default result: scalar fixed VF, no scalable VF; each side is upgraded
  // below if the target can maximize it.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, UserIC, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, UserIC, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
3544
3545FixedScalableVFPair
3546LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3547 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3548 // TODO: It may be useful to do since it's still likely to be dynamically
3549 // uniform if the target can skip.
3550 reportVectorizationFailure(
3551 DebugMsg: "Not inserting runtime ptr check for divergent target",
3552 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3553 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3554 return FixedScalableVFPair::getNone();
3555 }
3556
3557 ScalarEvolution *SE = PSE.getSE();
3558 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3559 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3560 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3561 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3562 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3563 if (TC.isScalar()) {
3564 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3565 OREMsg: "loop trip count is one, irrelevant for vectorization",
3566 ORETag: "SingleIterationLoop", ORE, TheLoop);
3567 return FixedScalableVFPair::getNone();
3568 }
3569
3570 // If BTC matches the widest induction type and is -1 then the trip count
3571 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3572 // to vectorize.
3573 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3574 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3575 BTC->getType()->getScalarSizeInBits() >=
3576 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3577 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3578 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3579 reportVectorizationFailure(
3580 DebugMsg: "Trip count computation wrapped",
3581 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3582 ORETag: "TripCountWrapped", ORE, TheLoop);
3583 return FixedScalableVFPair::getNone();
3584 }
3585
3586 switch (ScalarEpilogueStatus) {
3587 case CM_ScalarEpilogueAllowed:
3588 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: false);
3589 case CM_ScalarEpilogueNotAllowedUsePredicate:
3590 [[fallthrough]];
3591 case CM_ScalarEpilogueNotNeededUsePredicate:
3592 LLVM_DEBUG(
3593 dbgs() << "LV: vector predicate hint/switch found.\n"
3594 << "LV: Not allowing scalar epilogue, creating predicated "
3595 << "vector loop.\n");
3596 break;
3597 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3598 // fallthrough as a special case of OptForSize
3599 case CM_ScalarEpilogueNotAllowedOptSize:
3600 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3601 LLVM_DEBUG(
3602 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3603 else
3604 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3605 << "count.\n");
3606
3607 // Bail if runtime checks are required, which are not good when optimising
3608 // for size.
3609 if (runtimeChecksRequired())
3610 return FixedScalableVFPair::getNone();
3611
3612 break;
3613 }
3614
3615 // Now try the tail folding
3616
3617 // Invalidate interleave groups that require an epilogue if we can't mask
3618 // the interleave-group.
3619 if (!useMaskedInterleavedAccesses(TTI)) {
3620 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3621 "No decisions should have been taken at this point");
3622 // Note: There is no need to invalidate any cost modeling decisions here, as
3623 // none were taken so far.
3624 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3625 }
3626
3627 FixedScalableVFPair MaxFactors =
3628 computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: true);
3629
3630 // Avoid tail folding if the trip count is known to be a multiple of any VF
3631 // we choose.
3632 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3633 MaxFactors.FixedVF.getFixedValue();
3634 if (MaxFactors.ScalableVF) {
3635 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3636 if (MaxVScale) {
3637 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3638 a: *MaxPowerOf2RuntimeVF,
3639 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3640 } else
3641 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3642 }
3643
3644 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3645 // Return false if the loop is neither a single-latch-exit loop nor an
3646 // early-exit loop as tail-folding is not supported in that case.
3647 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3648 !Legal->hasUncountableEarlyExit())
3649 return false;
3650 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3651 ScalarEvolution *SE = PSE.getSE();
3652 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3653 // with uncountable exits. For countable loops, the symbolic maximum must
3654 // remain identical to the known back-edge taken count.
3655 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3656 assert((Legal->hasUncountableEarlyExit() ||
3657 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3658 "Invalid loop count");
3659 const SCEV *ExitCount = SE->getAddExpr(
3660 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3661 const SCEV *Rem = SE->getURemExpr(
3662 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3663 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3664 return Rem->isZero();
3665 };
3666
3667 if (MaxPowerOf2RuntimeVF > 0u) {
3668 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3669 "MaxFixedVF must be a power of 2");
3670 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3671 // Accept MaxFixedVF if we do not have a tail.
3672 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3673 return MaxFactors;
3674 }
3675 }
3676
3677 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3678 if (ExpectedTC && ExpectedTC->isFixed() &&
3679 ExpectedTC->getFixedValue() <=
3680 TTI.getMinTripCountTailFoldingThreshold()) {
3681 if (MaxPowerOf2RuntimeVF > 0u) {
3682 // If we have a low-trip-count, and the fixed-width VF is known to divide
3683 // the trip count but the scalable factor does not, use the fixed-width
3684 // factor in preference to allow the generation of a non-predicated loop.
3685 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3686 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3687 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3688 "remain for any chosen VF.\n");
3689 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3690 return MaxFactors;
3691 }
3692 }
3693
3694 reportVectorizationFailure(
3695 DebugMsg: "The trip count is below the minial threshold value.",
3696 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3697 ORE, TheLoop);
3698 return FixedScalableVFPair::getNone();
3699 }
3700
3701 // If we don't know the precise trip count, or if the trip count that we
3702 // found modulo the vectorization factor is not zero, try to fold the tail
3703 // by masking.
3704 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3705 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3706 setTailFoldingStyle(IsScalableVF: ContainsScalableVF, UserIC);
3707 if (foldTailByMasking()) {
3708 if (foldTailWithEVL()) {
3709 LLVM_DEBUG(
3710 dbgs()
3711 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3712 "try to generate VP Intrinsics with scalable vector "
3713 "factors only.\n");
3714 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3715 // for now.
3716 // TODO: extend it for fixed vectors, if required.
3717 assert(ContainsScalableVF && "Expected scalable vector factor.");
3718
3719 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3720 }
3721 return MaxFactors;
3722 }
3723
3724 // If there was a tail-folding hint/switch, but we can't fold the tail by
3725 // masking, fallback to a vectorization with a scalar epilogue.
3726 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3727 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3728 "scalar epilogue instead.\n");
3729 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3730 return MaxFactors;
3731 }
3732
3733 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3734 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3735 return FixedScalableVFPair::getNone();
3736 }
3737
3738 if (TC.isZero()) {
3739 reportVectorizationFailure(
3740 DebugMsg: "unable to calculate the loop count due to complex control flow",
3741 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3742 return FixedScalableVFPair::getNone();
3743 }
3744
3745 reportVectorizationFailure(
3746 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3747 OREMsg: "cannot optimize for size and vectorize at the same time. "
3748 "Enable vectorization of this loop with '#pragma clang loop "
3749 "vectorize(enable)' when compiling with -Os/-Oz",
3750 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3751 return FixedScalableVFPair::getNone();
3752}
3753
3754bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3755 ElementCount VF) {
3756 if (ConsiderRegPressure.getNumOccurrences())
3757 return ConsiderRegPressure;
3758
3759 // TODO: We should eventually consider register pressure for all targets. The
3760 // TTI hook is temporary whilst target-specific issues are being fixed.
3761 if (TTI.shouldConsiderVectorizationRegPressure())
3762 return true;
3763
3764 if (!useMaxBandwidth(RegKind: VF.isScalable()
3765 ? TargetTransformInfo::RGK_ScalableVector
3766 : TargetTransformInfo::RGK_FixedWidthVector))
3767 return false;
3768 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3769 return ElementCount::isKnownGT(
3770 LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3771 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3772}
3773
3774bool LoopVectorizationCostModel::useMaxBandwidth(
3775 TargetTransformInfo::RegisterKind RegKind) {
3776 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3777 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3778 (UseWiderVFIfCallVariantsPresent &&
3779 Legal->hasVectorCallVariants())));
3780}
3781
3782ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3783 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
3784 bool FoldTailByMasking) const {
3785 unsigned EstimatedVF = VF.getKnownMinValue();
3786 if (VF.isScalable() && TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
3787 auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
3788 auto Min = Attr.getVScaleRangeMin();
3789 EstimatedVF *= Min;
3790 }
3791
3792 // When a scalar epilogue is required, at least one iteration of the scalar
3793 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3794 // max VF that results in a dead vector loop.
3795 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
3796 MaxTripCount -= 1;
3797
3798 // When the user specifies an interleave count, we need to ensure that
3799 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
3800 unsigned IC = UserIC > 0 ? UserIC : 1;
3801 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
3802
3803 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
3804 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
3805 // If upper bound loop trip count (TC) is known at compile time there is no
3806 // point in choosing VF greater than TC / IC (as done in the loop below).
3807 // Select maximum power of two which doesn't exceed TC / IC. If VF is
3808 // scalable, we only fall back on a fixed VF when the TC is less than or
3809 // equal to the known number of lanes.
3810 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC);
3811 if (ClampedUpperTripCount == 0)
3812 ClampedUpperTripCount = 1;
3813 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3814 "exceeding the constant trip count"
3815 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
3816 << ClampedUpperTripCount << "\n");
3817 return ElementCount::get(MinVal: ClampedUpperTripCount,
3818 Scalable: FoldTailByMasking ? VF.isScalable() : false);
3819 }
3820 return VF;
3821}
3822
3823ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3824 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3825 ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
3826 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3827 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3828 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3829 : TargetTransformInfo::RGK_FixedWidthVector);
3830
3831 // Convenience function to return the minimum of two ElementCounts.
3832 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3833 assert((LHS.isScalable() == RHS.isScalable()) &&
3834 "Scalable flags must match");
3835 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3836 };
3837
3838 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3839 // Note that both WidestRegister and WidestType may not be a powers of 2.
3840 auto MaxVectorElementCount = ElementCount::get(
3841 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
3842 Scalable: ComputeScalableMaxVF);
3843 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3844 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3845 << (MaxVectorElementCount * WidestType) << " bits.\n");
3846
3847 if (!MaxVectorElementCount) {
3848 LLVM_DEBUG(dbgs() << "LV: The target has no "
3849 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3850 << " vector registers.\n");
3851 return ElementCount::getFixed(MinVal: 1);
3852 }
3853
3854 ElementCount MaxVF = clampVFByMaxTripCount(
3855 VF: MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking);
3856 // If the MaxVF was already clamped, there's no point in trying to pick a
3857 // larger one.
3858 if (MaxVF != MaxVectorElementCount)
3859 return MaxVF;
3860
3861 TargetTransformInfo::RegisterKind RegKind =
3862 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3863 : TargetTransformInfo::RGK_FixedWidthVector;
3864
3865 if (MaxVF.isScalable())
3866 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3867 else
3868 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3869
3870 if (useMaxBandwidth(RegKind)) {
3871 auto MaxVectorElementCountMaxBW = ElementCount::get(
3872 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
3873 Scalable: ComputeScalableMaxVF);
3874 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3875
3876 if (ElementCount MinVF =
3877 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
3878 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
3879 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3880 << ") with target's minimum: " << MinVF << '\n');
3881 MaxVF = MinVF;
3882 }
3883 }
3884
3885 MaxVF =
3886 clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC, FoldTailByMasking);
3887
3888 if (MaxVectorElementCount != MaxVF) {
3889 // Invalidate any widening decisions we might have made, in case the loop
3890 // requires prediction (decided later), but we have already made some
3891 // load/store widening decisions.
3892 invalidateCostModelingDecisions();
3893 }
3894 }
3895 return MaxVF;
3896}
3897
3898bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3899 const VectorizationFactor &B,
3900 const unsigned MaxTripCount,
3901 bool HasTail,
3902 bool IsEpilogue) const {
3903 InstructionCost CostA = A.Cost;
3904 InstructionCost CostB = B.Cost;
3905
3906 // Improve estimate for the vector width if it is scalable.
3907 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3908 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3909 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3910 if (A.Width.isScalable())
3911 EstimatedWidthA *= *VScale;
3912 if (B.Width.isScalable())
3913 EstimatedWidthB *= *VScale;
3914 }
3915
3916 // When optimizing for size choose whichever is smallest, which will be the
3917 // one with the smallest cost for the whole loop. On a tie pick the larger
3918 // vector width, on the assumption that throughput will be greater.
3919 if (CM.CostKind == TTI::TCK_CodeSize)
3920 return CostA < CostB ||
3921 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3922
3923 // Assume vscale may be larger than 1 (or the value being tuned for),
3924 // so that scalable vectorization is slightly favorable over fixed-width
3925 // vectorization.
3926 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3927 A.Width.isScalable() && !B.Width.isScalable();
3928
3929 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3930 const InstructionCost &RHS) {
3931 return PreferScalable ? LHS <= RHS : LHS < RHS;
3932 };
3933
3934 // To avoid the need for FP division:
3935 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3936 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3937 if (!MaxTripCount)
3938 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3939
3940 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3941 InstructionCost VectorCost,
3942 InstructionCost ScalarCost) {
3943 // If the trip count is a known (possibly small) constant, the trip count
3944 // will be rounded up to an integer number of iterations under
3945 // FoldTailByMasking. The total cost in that case will be
3946 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3947 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3948 // some extra overheads, but for the purpose of comparing the costs of
3949 // different VFs we can use this to compare the total loop-body cost
3950 // expected after vectorization.
3951 if (HasTail)
3952 return VectorCost * (MaxTripCount / VF) +
3953 ScalarCost * (MaxTripCount % VF);
3954 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
3955 };
3956
3957 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3958 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3959 return CmpFn(RTCostA, RTCostB);
3960}
3961
3962bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3963 const VectorizationFactor &B,
3964 bool HasTail,
3965 bool IsEpilogue) const {
3966 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3967 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3968 IsEpilogue);
3969}
3970
/// Emit one optimization remark per recipe whose cost is invalid at some VF,
/// listing all offending VFs for that recipe together.
void LoopVectorizationPlanner::emitInvalidCostRemarks(
    OptimizationRemarkEmitter *ORE) {
  using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
  // Collect every (recipe, VF) pair whose cost query is invalid, across all
  // candidate plans.
  SmallVector<RecipeVFPair> InvalidCosts;
  for (const auto &Plan : VPlans) {
    for (ElementCount VF : Plan->vectorFactors()) {
      // The VPlan-based cost model is designed for computing vector cost.
      // Querying VPlan-based cost model with a scalar VF will cause some
      // errors because we expect the VF is vector for most of the widen
      // recipes.
      if (VF.isScalar())
        continue;

      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      precomputeCosts(Plan&: *Plan, VF, CostCtx);
      auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
        for (auto &R : *VPBB) {
          if (!R.cost(VF, Ctx&: CostCtx).isValid())
            InvalidCosts.emplace_back(Args: &R, Args&: VF);
        }
      }
    }
  }
  if (InvalidCosts.empty())
    return;

  // Emit a report of VFs with invalid costs in the loop.

  // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
  // Each distinct recipe gets a sequence number in order of first appearance.
  DenseMap<VPRecipeBase *, unsigned> Numbering;
  unsigned I = 0;
  for (auto &Pair : InvalidCosts)
    if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
      ++I;

  // Sort the list, first on recipe(number) then on VF.
  sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
    unsigned NA = Numbering[A.first];
    unsigned NB = Numbering[B.first];
    if (NA != NB)
      return NA < NB;
    return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
  });

  // For a list of ordered recipe-VF pairs:
  //   [(load, VF1), (load, VF2), (store, VF1)]
  // group the recipes together to emit separate remarks for:
  //   load  (VF1, VF2)
  //   store (VF1)
  // Subset is a growing window over the front of Tail covering pairs that all
  // refer to the same recipe.
  auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
  auto Subset = ArrayRef<RecipeVFPair>();
  do {
    if (Subset.empty())
      Subset = Tail.take_front(N: 1);

    VPRecipeBase *R = Subset.front().first;

    // Map the recipe to the IR opcode it stands for, for remark text.
    unsigned Opcode =
        TypeSwitch<const VPRecipeBase *, unsigned>(R)
            .Case(caseFn: [](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
            .Case(
                caseFn: [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
            .Case(caseFn: [](const VPWidenLoadRecipe *R) { return Instruction::Load; })
            .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
                caseFn: [](const auto *R) { return Instruction::Call; })
            .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                  VPWidenCastRecipe>(
                caseFn: [](const auto *R) { return R->getOpcode(); })
            .Case(caseFn: [](const VPInterleaveRecipe *R) {
              return R->getStoredValues().empty() ? Instruction::Load
                                                  : Instruction::Store;
            })
            .Case(caseFn: [](const VPReductionRecipe *R) {
              return RecurrenceDescriptor::getOpcode(Kind: R->getRecurrenceKind());
            });

    // If the next recipe is different, or if there are no other pairs,
    // emit a remark for the collated subset. e.g.
    //   [(load, VF1), (load, VF2))]
    // to emit:
    //   remark: invalid costs for 'load' at VF=(VF1, VF2)
    if (Subset == Tail || Tail[Subset.size()].first != R) {
      std::string OutString;
      raw_string_ostream OS(OutString);
      assert(!Subset.empty() && "Unexpected empty range");
      OS << "Recipe with invalid costs prevented vectorization at VF=(";
      for (const auto &Pair : Subset)
        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
      OS << "):";
      if (Opcode == Instruction::Call) {
        // Resolve a human-readable callee name for call-like recipes.
        StringRef Name = "";
        if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
          Name = Int->getIntrinsicName();
        } else {
          auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
          Function *CalledFn =
              WidenCall ? WidenCall->getCalledScalarFunction()
                        : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
                                             ->getLiveInIRValue());
          Name = CalledFn->getName();
        }
        OS << " call to " << Name;
      } else
        OS << " " << Instruction::getOpcodeName(Opcode);
      reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
                              DL: R->getDebugLoc());
      // Drop the emitted group and start collecting the next one.
      Tail = Tail.drop_front(N: Subset.size());
      Subset = {};
    } else
      // Grow the subset by one element
      Subset = Tail.take_front(N: Subset.size() + 1);
  } while (!Tail.empty());
}
4086
/// Check if any recipe of \p Plan will generate a vector value, which will be
/// assigned a vector register. Used to reject VFs whose plans would only ever
/// produce scalar (or scalarized) values.
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
                                const TargetTransformInfo &TTI) {
  assert(VF.isVector() && "Checking a scalar VF?");
  VPTypeAnalysis TypeInfo(Plan);
  DenseSet<VPRecipeBase *> EphemeralRecipes;
  collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
  // Set of already visited types; each scalar result type only needs to be
  // checked against TTI once.
  DenseSet<Type *> Visited;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Ephemeral recipes (e.g. those only feeding assumes) don't count.
      if (EphemeralRecipes.contains(V: &R))
        continue;
      // Continue early if the recipe is considered to not produce a vector
      // result. Note that this includes VPInstruction where some opcodes may
      // produce a vector, to preserve existing behavior as VPInstructions model
      // aspects not directly mapped to existing IR instructions.
      switch (R.getVPRecipeID()) {
      case VPRecipeBase::VPDerivedIVSC:
      case VPRecipeBase::VPScalarIVStepsSC:
      case VPRecipeBase::VPReplicateSC:
      case VPRecipeBase::VPInstructionSC:
      case VPRecipeBase::VPCanonicalIVPHISC:
      case VPRecipeBase::VPCurrentIterationPHISC:
      case VPRecipeBase::VPVectorPointerSC:
      case VPRecipeBase::VPVectorEndPointerSC:
      case VPRecipeBase::VPExpandSCEVSC:
      case VPRecipeBase::VPPredInstPHISC:
      case VPRecipeBase::VPBranchOnMaskSC:
        continue;
      // The recipes below may produce a vector value; their result (or stored
      // value) type is inspected further down.
      case VPRecipeBase::VPReductionSC:
      case VPRecipeBase::VPActiveLaneMaskPHISC:
      case VPRecipeBase::VPWidenCallSC:
      case VPRecipeBase::VPWidenCanonicalIVSC:
      case VPRecipeBase::VPWidenCastSC:
      case VPRecipeBase::VPWidenGEPSC:
      case VPRecipeBase::VPWidenIntrinsicSC:
      case VPRecipeBase::VPWidenSC:
      case VPRecipeBase::VPBlendSC:
      case VPRecipeBase::VPFirstOrderRecurrencePHISC:
      case VPRecipeBase::VPHistogramSC:
      case VPRecipeBase::VPWidenPHISC:
      case VPRecipeBase::VPWidenIntOrFpInductionSC:
      case VPRecipeBase::VPWidenPointerInductionSC:
      case VPRecipeBase::VPReductionPHISC:
      case VPRecipeBase::VPInterleaveEVLSC:
      case VPRecipeBase::VPInterleaveSC:
      case VPRecipeBase::VPWidenLoadEVLSC:
      case VPRecipeBase::VPWidenLoadSC:
      case VPRecipeBase::VPWidenStoreEVLSC:
      case VPRecipeBase::VPWidenStoreSC:
        break;
      default:
        llvm_unreachable("unhandled recipe");
      }

      // Returns true if the widened form of VectorTy will occupy an actual
      // vector register on the target.
      auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
        unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
        if (!NumLegalParts)
          return false;
        if (VF.isScalable()) {
          // <vscale x 1 x iN> is assumed to be profitable over iN because
          // scalable registers are a distinct register class from scalar
          // ones. If we ever find a target which wants to lower scalable
          // vectors back to scalars, we'll need to update this code to
          // explicitly ask TTI about the register class uses for each part.
          return NumLegalParts <= VF.getKnownMinValue();
        }
        // Two or more elements that share a register - are vectorized.
        return NumLegalParts < VF.getFixedValue();
      };

      // If no def nor is a store, e.g., branches, continue - no value to check.
      if (R.getNumDefinedValues() == 0 &&
          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(Val: &R))
        continue;
      // For multi-def recipes, currently only interleaved loads, suffice to
      // check first def only.
      // For stores check their stored value; for interleaved stores suffice
      // the check first stored value only. In all cases this is the second
      // operand.
      VPValue *ToCheck =
          R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
      Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
      if (!Visited.insert(V: {ScalarTy}).second)
        continue;
      Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
      if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
        return true;
    }
  }

  return false;
}
4183
4184static bool hasReplicatorRegion(VPlan &Plan) {
4185 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4186 G: Plan.getVectorLoopRegion()->getEntry())),
4187 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4188}
4189
#ifndef NDEBUG
/// Select the most profitable vectorization factor among the candidate VFs of
/// all built VPlans, comparing each candidate's cost against the scalar loop
/// and against the best candidate found so far.
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
  // Cost of the loop at VF=1, the baseline every vector VF competes with.
  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(
      any_of(VPlans,
             [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
      "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization &&
      (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is only computed when at least one of the plan's VFs
    // requires the register-pressure check.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (any_of(VFs, [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The cost for scalar VF=1 is already calculated, so ignore it.
      if (VF.isScalar())
        continue;

      // If the register pressure needs to be considered for VF,
      // don't consider the VF as valid if it exceeds the number
      // of registers for the target.
      if (CM.shouldConsiderRegPressureForVF(VF) &&
          RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
        continue;

      InstructionCost C = CM.expectedCost(VF);

      // Add on other costs that are modelled in VPlan, but not in the legacy
      // cost model.
      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
      assert(VectorRegion && "Expected to have a vector region!");
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
               vp_depth_first_shallow(VectorRegion->getEntry()))) {
        for (VPRecipeBase &R : *VPBB) {
          auto *VPI = dyn_cast<VPInstruction>(&R);
          if (!VPI)
            continue;
          switch (VPI->getOpcode()) {
          // Selects are only modelled in the legacy cost model for safe
          // divisors.
          case Instruction::Select: {
            if (auto *WR =
                    dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
              switch (WR->getOpcode()) {
              case Instruction::UDiv:
              case Instruction::SDiv:
              case Instruction::URem:
              case Instruction::SRem:
                // Already accounted for by the legacy model; skip.
                continue;
              default:
                break;
              }
            }
            C += VPI->cost(VF, CostCtx);
            break;
          }
          case VPInstruction::ActiveLaneMask: {
            // The mask may cover Multiplier x VF lanes; cost it at that width.
            unsigned Multiplier =
                cast<VPConstantInt>(VPI->getOperand(2))->getZExtValue();
            C += VPI->cost(VF * Multiplier, CostCtx);
            break;
          }
          case VPInstruction::ExplicitVectorLength:
          case VPInstruction::AnyOf:
            C += VPI->cost(VF, CostCtx);
            break;
          default:
            break;
          }
        }
      }

      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
      unsigned Width =
          estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                        << " costs: " << (Candidate.Cost / Width));
      if (VF.isScalable())
        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                          << CM.getVScaleForTuning().value_or(1) << ")");
      LLVM_DEBUG(dbgs() << ".\n");

      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }

      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
        ChosenFactor = Candidate;
    }
  }

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost,
                                   !CM.foldTailByMasking())) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  return ChosenFactor;
}
#endif
4333
4334/// Returns true if the VPlan contains a VPReductionPHIRecipe with
4335/// FindLast recurrence kind.
4336static bool hasFindLastReductionPhi(VPlan &Plan) {
4337 return any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4338 P: [](VPRecipeBase &R) {
4339 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4340 return RedPhi &&
4341 RecurrenceDescriptor::isFindLastRecurrenceKind(
4342 Kind: RedPhi->getRecurrenceKind());
4343 });
4344}
4345
4346/// Returns true if the VPlan contains header phi recipes that are not currently
4347/// supported for epilogue vectorization.
4348static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan) {
4349 return any_of(
4350 Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4351 P: [](VPRecipeBase &R) {
4352 if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R))
4353 return !WidenInd->getPHINode();
4354 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4355 return RedPhi && (RecurrenceDescriptor::isFindLastRecurrenceKind(
4356 Kind: RedPhi->getRecurrenceKind()) ||
4357 !RedPhi->getUnderlyingValue());
4358 });
4359}
4360
4361bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4362 ElementCount VF) const {
4363 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4364 // reductions need special handling and are currently unsupported.
4365 if (any_of(Range: OrigLoop->getHeader()->phis(), P: [&](PHINode &Phi) {
4366 if (!Legal->isReductionVariable(PN: &Phi))
4367 return Legal->isFixedOrderRecurrence(Phi: &Phi);
4368 RecurKind Kind =
4369 Legal->getRecurrenceDescriptor(PN: &Phi).getRecurrenceKind();
4370 return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
4371 }))
4372 return false;
4373
4374 // FindLast reductions and inductions without underlying PHI require special
4375 // handling and are currently not supported for epilogue vectorization.
4376 if (hasUnsupportedHeaderPhiRecipe(Plan&: getPlanFor(VF)))
4377 return false;
4378
4379 // Phis with uses outside of the loop require special handling and are
4380 // currently unsupported.
4381 for (const auto &Entry : Legal->getInductionVars()) {
4382 // Look for uses of the value of the induction at the last iteration.
4383 Value *PostInc =
4384 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
4385 for (User *U : PostInc->users())
4386 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4387 return false;
4388 // Look for uses of penultimate value of the induction.
4389 for (User *U : Entry.first->users())
4390 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4391 return false;
4392 }
4393
4394 // Epilogue vectorization code has not been auditted to ensure it handles
4395 // non-latch exits properly. It may be fine, but it needs auditted and
4396 // tested.
4397 // TODO: Add support for loops with an early exit.
4398 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4399 return false;
4400
4401 return true;
4402}
4403
4404bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4405 const ElementCount VF, const unsigned IC) const {
4406 // FIXME: We need a much better cost-model to take different parameters such
4407 // as register pressure, code size increase and cost of extra branches into
4408 // account. For now we apply a very crude heuristic and only consider loops
4409 // with vectorization factors larger than a certain value.
4410
4411 // Allow the target to opt out.
4412 if (!TTI.preferEpilogueVectorization(Iters: VF * IC))
4413 return false;
4414
4415 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4416 ? EpilogueVectorizationMinVF
4417 : TTI.getEpilogueVectorizationMinVF();
4418 return estimateElementCount(VF: VF * IC, VScale: VScaleForTuning) >= MinVFThreshold;
4419}
4420
4421VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4422 const ElementCount MainLoopVF, unsigned IC) {
4423 VectorizationFactor Result = VectorizationFactor::Disabled();
4424 if (!EnableEpilogueVectorization) {
4425 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4426 return Result;
4427 }
4428
4429 if (!CM.isScalarEpilogueAllowed()) {
4430 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4431 "epilogue is allowed.\n");
4432 return Result;
4433 }
4434
4435 // Not really a cost consideration, but check for unsupported cases here to
4436 // simplify the logic.
4437 if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
4438 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4439 "is not a supported candidate.\n");
4440 return Result;
4441 }
4442
4443 if (EpilogueVectorizationForceVF > 1) {
4444 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4445 ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
4446 if (hasPlanWithVF(VF: ForcedEC))
4447 return {ForcedEC, 0, 0};
4448
4449 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4450 "viable.\n");
4451 return Result;
4452 }
4453
4454 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4455 LLVM_DEBUG(
4456 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4457 return Result;
4458 }
4459
4460 if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
4461 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4462 "this loop\n");
4463 return Result;
4464 }
4465
4466 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4467 // the main loop handles 8 lanes per iteration. We could still benefit from
4468 // vectorizing the epilogue loop with VF=4.
4469 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4470 MinVal: estimateElementCount(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));
4471
4472 Type *TCType = Legal->getWidestInductionType();
4473 const SCEV *RemainingIterations = nullptr;
4474 unsigned MaxTripCount = 0;
4475 const SCEV *TC = vputils::getSCEVExprForVPValue(
4476 V: getPlanFor(VF: MainLoopVF).getTripCount(), PSE);
4477 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4478 const SCEV *KnownMinTC;
4479 bool ScalableTC = match(S: TC, P: m_scev_c_Mul(Op0: m_SCEV(V&: KnownMinTC), Op1: m_SCEVVScale()));
4480 bool ScalableRemIter = false;
4481 ScalarEvolution &SE = *PSE.getSE();
4482 // Use versions of TC and VF in which both are either scalable or fixed.
4483 if (ScalableTC == MainLoopVF.isScalable()) {
4484 ScalableRemIter = ScalableTC;
4485 RemainingIterations =
4486 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
4487 } else if (ScalableTC) {
4488 const SCEV *EstimatedTC = SE.getMulExpr(
4489 LHS: KnownMinTC,
4490 RHS: SE.getConstant(Ty: TCType, V: CM.getVScaleForTuning().value_or(u: 1)));
4491 RemainingIterations = SE.getURemExpr(
4492 LHS: EstimatedTC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
4493 } else
4494 RemainingIterations =
4495 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: EstimatedRuntimeVF * IC));
4496
4497 // No iterations left to process in the epilogue.
4498 if (RemainingIterations->isZero())
4499 return Result;
4500
4501 if (MainLoopVF.isFixed()) {
4502 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4503 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
4504 RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
4505 MaxTripCount = SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
4506 }
4507 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4508 << MaxTripCount << "\n");
4509 }
4510
4511 auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
4512 return SE.isKnownPredicate(Pred: CmpInst::ICMP_UGT, LHS: VF, RHS: RemIter);
4513 };
4514 for (auto &NextVF : ProfitableVFs) {
4515 // Skip candidate VFs without a corresponding VPlan.
4516 if (!hasPlanWithVF(VF: NextVF.Width))
4517 continue;
4518
4519 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4520 // vectors) or > the VF of the main loop (fixed vectors).
4521 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4522 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
4523 (NextVF.Width.isScalable() &&
4524 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) ||
4525 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4526 ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF)))
4527 continue;
4528
4529 // If NextVF is greater than the number of remaining iterations, the
4530 // epilogue loop would be dead. Skip such factors.
4531 // TODO: We should also consider comparing against a scalable
4532 // RemainingIterations when SCEV be able to evaluate non-canonical
4533 // vscale-based expressions.
4534 if (!ScalableRemIter) {
4535 // Handle the case where NextVF and RemainingIterations are in different
4536 // numerical spaces.
4537 ElementCount EC = NextVF.Width;
4538 if (NextVF.Width.isScalable())
4539 EC = ElementCount::getFixed(
4540 MinVal: estimateElementCount(VF: NextVF.Width, VScale: CM.getVScaleForTuning()));
4541 if (SkipVF(SE.getElementCount(Ty: TCType, EC), RemainingIterations))
4542 continue;
4543 }
4544
4545 if (Result.Width.isScalar() ||
4546 isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking(),
4547 /*IsEpilogue*/ true))
4548 Result = NextVF;
4549 }
4550
4551 if (Result != VectorizationFactor::Disabled())
4552 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4553 << Result.Width << "\n");
4554 return Result;
4555}
4556
4557std::pair<unsigned, unsigned>
4558LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4559 unsigned MinWidth = -1U;
4560 unsigned MaxWidth = 8;
4561 const DataLayout &DL = TheFunction->getDataLayout();
4562 // For in-loop reductions, no element types are added to ElementTypesInLoop
4563 // if there are no loads/stores in the loop. In this case, check through the
4564 // reduction variables to determine the maximum width.
4565 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4566 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4567 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4568 // When finding the min width used by the recurrence we need to account
4569 // for casts on the input operands of the recurrence.
4570 MinWidth = std::min(
4571 a: MinWidth,
4572 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4573 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4574 MaxWidth = std::max(a: MaxWidth,
4575 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4576 }
4577 } else {
4578 for (Type *T : ElementTypesInLoop) {
4579 MinWidth = std::min<unsigned>(
4580 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4581 MaxWidth = std::max<unsigned>(
4582 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4583 }
4584 }
4585 return {MinWidth, MaxWidth};
4586}
4587
4588void LoopVectorizationCostModel::collectElementTypesForWidening() {
4589 ElementTypesInLoop.clear();
4590 // For each block.
4591 for (BasicBlock *BB : TheLoop->blocks()) {
4592 // For each instruction in the loop.
4593 for (Instruction &I : BB->instructionsWithoutDebug()) {
4594 Type *T = I.getType();
4595
4596 // Skip ignored values.
4597 if (ValuesToIgnore.count(Ptr: &I))
4598 continue;
4599
4600 // Only examine Loads, Stores and PHINodes.
4601 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4602 continue;
4603
4604 // Examine PHI nodes that are reduction variables. Update the type to
4605 // account for the recurrence type.
4606 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4607 if (!Legal->isReductionVariable(PN))
4608 continue;
4609 const RecurrenceDescriptor &RdxDesc =
4610 Legal->getRecurrenceDescriptor(PN);
4611 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4612 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4613 Ty: RdxDesc.getRecurrenceType()))
4614 continue;
4615 T = RdxDesc.getRecurrenceType();
4616 }
4617
4618 // Examine the stored values.
4619 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4620 T = ST->getValueOperand()->getType();
4621
4622 assert(T->isSized() &&
4623 "Expected the load/store/recurrence type to be sized");
4624
4625 ElementTypesInLoop.insert(Ptr: T);
4626 }
4627 }
4628}
4629
/// Select the interleave (unroll) count to use when executing the loop in
/// \p Plan with vectorization factor \p VF. \p LoopCost is the previously
/// computed cost of the loop body for \p VF, or 0 if it has not been computed
/// yet (e.g. when the user forced the VF). Returns a count >= 1, where 1
/// means "do not interleave".
unsigned
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                                InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave tail-folded loops if wide lane masks are requested, as the
  // overhead of multiple instructions to calculate the predicate is likely
  // not beneficial. If a scalar epilogue is not allowed for any other reason,
  // do not interleave.
  if (!CM.isScalarEpilogueAllowed() &&
      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
    return 1;

  // Loops with a current-iteration PHI need a variable-length step and cannot
  // be unrolled.
  if (any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPCurrentIterationPHIRecipe>)) {
    LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
                         "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Plan.hasEarlyExit())
    return 1;

  const bool HasReductions =
      any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPReductionPHIRecipe>);

  // FIXME: implement interleaving for FindLast transform correctly.
  if (hasFindLastReductionPhi(Plan))
    return 1;

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    if (VF.isScalar())
      LoopCost = CM.expectedCost(VF);
    else
      LoopCost = cost(Plan, VF);
    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore: CM.ValuesToIgnore)[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(a: Pair.second, b: 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // Per register class: how many interleaved copies fit without spilling?
  // The final IC is the minimum over all classes.
  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    // Command-line overrides for the register budget, scalar vs. vector.
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Key: Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
                                    MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(a: 1U, b: (MaxLocalUsers - 1)));
    }

    IC = std::min(a: IC, b: TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
  LLVM_DEBUG(dbgs() << "LV: MaxInterleaveFactor for the target is "
                    << MaxInterleaveCount << "\n");

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  auto BestKnownTC = getSmallBestKnownTC(PSE, L: OrigLoop);

  // For fixed length VFs treat a scalable trip count as unknown.
  if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
    // Re-evaluate trip counts and VFs to be in the same numerical space.
    unsigned AvailableTC =
        estimateElementCount(VF: *BestKnownTC, VScale: CM.getVScaleForTuning());
    unsigned EstimatedVF = estimateElementCount(VF, VScale: CM.getVScaleForTuning());

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    if (CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()))
      --AvailableTC;

    unsigned InterleaveCountLB = bit_floor(Value: std::max(
        a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));

    if (getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(Value: std::max(
          a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce same scalar tail, maximize the IC to do the same work
        // in fewer vector loop iterations
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(a: 1u, b: IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(Range: OrigLoop->blocks(), P: [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleave =
      TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
                                   Value: SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = 0;
    unsigned NumLoads = 0;
    // Count memory operations recipe-by-recipe; interleave groups contribute
    // one operation per member, histograms count as both a load and a store.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
      for (VPRecipeBase &R : *VPBB) {
        if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: &R)) {
          NumLoads++;
          continue;
        }
        if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(Val: &R)) {
          NumStores++;
          continue;
        }

        if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R)) {
          if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
            NumStores += StoreOps;
          else
            NumLoads += InterleaveR->getNumDefinedValues();
          continue;
        }
        if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
          NumLoads += isa<LoadInst>(Val: RepR->getUnderlyingInstr());
          NumStores += isa<StoreInst>(Val: RepR->getUnderlyingInstr());
          continue;
        }
        if (isa<VPHistogramRecipe>(Val: &R)) {
          NumLoads++;
          NumStores++;
          continue;
        }
      }
    }
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
               P: [](VPRecipeBase &R) {
                 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
                 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()) ||
                                 RecurrenceDescriptor::isFindIVRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()));
               });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && OrigLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
                 P: [](VPRecipeBase &R) {
                   auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);

                   return RedR && RedR->isOrdered();
                 });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(a: SmallIC, b: F);
      StoresIC = std::min(a: StoresIC, b: F);
      LoadsIC = std::min(a: LoadsIC, b: F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(a: StoresIC, b: LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleave) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(a: IC / 2, b: SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleave) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4971
4972bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4973 ElementCount VF) {
4974 // TODO: Cost model for emulated masked load/store is completely
4975 // broken. This hack guides the cost model to use an artificially
4976 // high enough value to practically disable vectorization with such
4977 // operations, except where previously deployed legality hack allowed
4978 // using very low cost values. This is to avoid regressions coming simply
4979 // from moving "masked load/store" check from legality to cost model.
4980 // Masked Load/Gather emulation was previously never allowed.
4981 // Limited number of Masked Store/Scatter emulation was allowed.
4982 assert((isPredicatedInst(I)) &&
4983 "Expecting a scalar emulated instruction");
4984 return isa<LoadInst>(Val: I) ||
4985 (isa<StoreInst>(Val: I) &&
4986 NumPredStores > NumberOfStoresToPredicate);
4987}
4988
4989void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4990 assert(VF.isVector() && "Expected VF >= 2");
4991
4992 // If we've already collected the instructions to scalarize or the predicated
4993 // BBs after vectorization, there's nothing to do. Collection may already have
4994 // occurred if we have a user-selected VF and are now computing the expected
4995 // cost for interleaving.
4996 if (InstsToScalarize.contains(Key: VF) ||
4997 PredicatedBBsAfterVectorization.contains(Val: VF))
4998 return;
4999
5000 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5001 // not profitable to scalarize any instructions, the presence of VF in the
5002 // map will indicate that we've analyzed it already.
5003 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5004
5005 // Find all the instructions that are scalar with predication in the loop and
5006 // determine if it would be better to not if-convert the blocks they are in.
5007 // If so, we also record the instructions to scalarize.
5008 for (BasicBlock *BB : TheLoop->blocks()) {
5009 if (!blockNeedsPredicationForAnyReason(BB))
5010 continue;
5011 for (Instruction &I : *BB)
5012 if (isScalarWithPredication(I: &I, VF)) {
5013 ScalarCostsTy ScalarCosts;
5014 // Do not apply discount logic for:
5015 // 1. Scalars after vectorization, as there will only be a single copy
5016 // of the instruction.
5017 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5018 // 3. Emulated masked memrefs, if a hacked cost is needed.
5019 if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
5020 !useEmulatedMaskMemRefHack(I: &I, VF) &&
5021 computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
5022 for (const auto &[I, IC] : ScalarCosts)
5023 ScalarCostsVF.insert(KV: {I, IC});
5024 // Check if we decided to scalarize a call. If so, update the widening
5025 // decision of the call to CM_Scalarize with the computed scalar cost.
5026 for (const auto &[I, Cost] : ScalarCosts) {
5027 auto *CI = dyn_cast<CallInst>(Val: I);
5028 if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
5029 continue;
5030 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5031 CallWideningDecisions[{CI, VF}].Cost = Cost;
5032 }
5033 }
5034 // Remember that BB will remain after vectorization.
5035 PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
5036 for (auto *Pred : predecessors(BB)) {
5037 if (Pred->getSingleSuccessor() == BB)
5038 PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
5039 }
5040 }
5041 }
5042}
5043
/// Compute the cost discount obtained by scalarizing the single-use
/// expression chain feeding the predicated instruction \p PredInst under
/// vectorization factor \p VF. A non-negative result means the vector form
/// costs at least as much, so scalarizing is beneficial; the per-instruction
/// scalar costs are recorded in \p ScalarCosts. Note the use of
/// VF.getFixedValue() below — callers are expected to pass a fixed-width VF
/// (collectInstsToScalarize excludes scalable VFs before calling this).
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto CanBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Key: I))
      continue;

    // Cannot scalarize fixed-order recurrence phis at the moment.
    if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF);

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
      for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
        ScalarCost += TTI.getScalarizationOverhead(
            Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
            /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      // One phi per scalarized lane to merge the predicated results.
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
          for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
            ScalarCost += TTI.getScalarizationOverhead(
                Ty: cast<VectorType>(Val: VectorTy),
                DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
                /*Extract*/ true, CostKind);
          }
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5165
5166InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5167 InstructionCost Cost;
5168
5169 // If the vector loop gets executed exactly once with the given VF, ignore the
5170 // costs of comparison and induction instructions, as they'll get simplified
5171 // away.
5172 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5173 auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
5174 if (TC == VF && !foldTailByMasking())
5175 addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
5176 InstsToIgnore&: ValuesToIgnoreForVF);
5177
5178 // For each block.
5179 for (BasicBlock *BB : TheLoop->blocks()) {
5180 InstructionCost BlockCost;
5181
5182 // For each instruction in the old loop.
5183 for (Instruction &I : BB->instructionsWithoutDebug()) {
5184 // Skip ignored values.
5185 if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
5186 (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
5187 continue;
5188
5189 InstructionCost C = getInstructionCost(I: &I, VF);
5190
5191 // Check if we should override the cost.
5192 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
5193 // For interleave groups, use ForceTargetInstructionCost once for the
5194 // whole group.
5195 if (VF.isVector() && getWideningDecision(I: &I, VF) == CM_Interleave) {
5196 if (getInterleavedAccessGroup(Instr: &I)->getInsertPos() == &I)
5197 C = InstructionCost(ForceTargetInstructionCost);
5198 else
5199 C = InstructionCost(0);
5200 } else {
5201 C = InstructionCost(ForceTargetInstructionCost);
5202 }
5203 }
5204
5205 BlockCost += C;
5206 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5207 << VF << " For instruction: " << I << '\n');
5208 }
5209
5210 // If we are vectorizing a predicated block, it will have been
5211 // if-converted. This means that the block's instructions (aside from
5212 // stores and instructions that may divide by zero) will now be
5213 // unconditionally executed. For the scalar case, we may not always execute
5214 // the predicated block, if it is an if-else block. Thus, scale the block's
5215 // cost by the probability of executing it.
5216 // getPredBlockCostDivisor will return 1 for blocks that are only predicated
5217 // by the header mask when folding the tail.
5218 if (VF.isScalar())
5219 BlockCost /= getPredBlockCostDivisor(CostKind, BB);
5220
5221 Cost += BlockCost;
5222 }
5223
5224 return Cost;
5225}
5226
5227/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
5228/// according to isAddressSCEVForCost.
5229///
5230/// This SCEV can be sent to the Target in order to estimate the address
5231/// calculation cost.
5232static const SCEV *getAddressAccessSCEV(
5233 Value *Ptr,
5234 PredicatedScalarEvolution &PSE,
5235 const Loop *TheLoop) {
5236 const SCEV *Addr = PSE.getSCEV(V: Ptr);
5237 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L: TheLoop) ? Addr
5238 : nullptr;
5239}
5240
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  // There is no mechanism yet to emit a scalarization loop for scalable
  // vectors, so the cost cannot be computed.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(V: I);
  Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  // that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  // Every lane computes its own address, hence the VF multiplier.
  InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
                                                  PtrTy, SE, Ptr: PtrSCEV, CostKind);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
  // One scalar load/store is issued per lane.
  Cost += VF.getFixedValue() *
          TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy->getScalarType(), Alignment,
                              AddressSpace: AS, CostKind, OpdInfo: OpInfo);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Add the cost of an i1 extract and a branch
    auto *VecI1Ty =
        VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
    Cost += TTI.getScalarizationOverhead(
        Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
5300
5301InstructionCost
5302LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5303 ElementCount VF) {
5304 Type *ValTy = getLoadStoreType(I);
5305 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5306 Value *Ptr = getLoadStorePointerOperand(V: I);
5307 unsigned AS = getLoadStoreAddressSpace(I);
5308 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5309
5310 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5311 "Stride should be 1 or -1 for consecutive memory access");
5312 const Align Alignment = getLoadStoreAlignment(I);
5313 InstructionCost Cost = 0;
5314 if (Legal->isMaskRequired(I)) {
5315 unsigned IID = I->getOpcode() == Instruction::Load
5316 ? Intrinsic::masked_load
5317 : Intrinsic::masked_store;
5318 Cost += TTI.getMemIntrinsicInstrCost(
5319 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS), CostKind);
5320 } else {
5321 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5322 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5323 CostKind, OpdInfo: OpInfo, I);
5324 }
5325
5326 bool Reverse = ConsecutiveStride < 0;
5327 if (Reverse)
5328 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5329 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5330 return Cost;
5331}
5332
5333InstructionCost
5334LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5335 ElementCount VF) {
5336 assert(Legal->isUniformMemOp(*I, VF));
5337
5338 Type *ValTy = getLoadStoreType(I);
5339 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5340 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5341 const Align Alignment = getLoadStoreAlignment(I);
5342 unsigned AS = getLoadStoreAddressSpace(I);
5343 if (isa<LoadInst>(Val: I)) {
5344 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5345 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5346 CostKind) +
5347 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5348 SrcTy: VectorTy, Mask: {}, CostKind);
5349 }
5350 StoreInst *SI = cast<StoreInst>(Val: I);
5351
5352 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5353 // TODO: We have existing tests that request the cost of extracting element
5354 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5355 // the actual generated code, which involves extracting the last element of
5356 // a scalable vector where the lane to extract is unknown at compile time.
5357 InstructionCost Cost =
5358 TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5359 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, CostKind);
5360 if (!IsLoopInvariantStoreValue)
5361 Cost += TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
5362 Val: VectorTy, CostKind, Index: 0);
5363 return Cost;
5364}
5365
5366InstructionCost
5367LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5368 ElementCount VF) {
5369 Type *ValTy = getLoadStoreType(I);
5370 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5371 const Align Alignment = getLoadStoreAlignment(I);
5372 Value *Ptr = getLoadStorePointerOperand(V: I);
5373 Type *PtrTy = Ptr->getType();
5374
5375 if (!Legal->isUniform(V: Ptr, VF))
5376 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
5377
5378 unsigned IID = I->getOpcode() == Instruction::Load
5379 ? Intrinsic::masked_gather
5380 : Intrinsic::masked_scatter;
5381 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5382 TTI.getMemIntrinsicInstrCost(
5383 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Ptr,
5384 Legal->isMaskRequired(I), Alignment, I),
5385 CostKind);
5386}
5387
5388InstructionCost
5389LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5390 ElementCount VF) {
5391 const auto *Group = getInterleavedAccessGroup(Instr: I);
5392 assert(Group && "Fail to get an interleaved access group.");
5393
5394 Instruction *InsertPos = Group->getInsertPos();
5395 Type *ValTy = getLoadStoreType(I: InsertPos);
5396 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5397 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
5398
5399 unsigned InterleaveFactor = Group->getFactor();
5400 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
5401
5402 // Holds the indices of existing members in the interleaved group.
5403 SmallVector<unsigned, 4> Indices;
5404 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5405 if (Group->getMember(Index: IF))
5406 Indices.push_back(Elt: IF);
5407
5408 // Calculate the cost of the whole interleaved group.
5409 bool UseMaskForGaps =
5410 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5411 (isa<StoreInst>(Val: I) && !Group->isFull());
5412 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5413 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
5414 Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I),
5415 UseMaskForGaps);
5416
5417 if (Group->isReverse()) {
5418 // TODO: Add support for reversed masked interleaved access.
5419 assert(!Legal->isMaskRequired(I) &&
5420 "Reverse masked interleaved access not supported.");
5421 Cost += Group->getNumMembers() *
5422 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5423 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5424 }
5425 return Cost;
5426}
5427
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                    ElementCount VF,
                                                    Type *Ty) const {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Val: Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  // Walk from I down to the (candidate) reduction root: first through a
  // single-user extend, then through a single-use mul feeding an add.
  Instruction *RetI = I;
  if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
    if (!RetI->hasOneUser())
      return std::nullopt;
    RetI = RetI->user_back();
  }

  if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
  if (!LastChain)
    return std::nullopt;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(Val: ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);

  const RecurrenceDescriptor &RdxDesc =
      Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));

  // Baseline: the cost of the plain (min/max or arithmetic) reduction with no
  // folded extends or muls.
  InstructionCost BaseCost;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
    Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
    BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
                                          FMF: RdxDesc.getFastMathFlags(), CostKind);
  } else {
    BaseCost = TTI.getArithmeticReductionCost(
        Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
  }

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RK == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
                           ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
                           : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));

  // From here on, VectorTy is the vectorized type of I's own operand.
  VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
      match(V: RedOp,
            P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
      match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
      !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Val: Op0);
    auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
    auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);

    // Component costs: two inner extends, the mul, and the outer extend.
    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);

    InstructionCost RedCost = TTI.getMulAccReductionCost(
        IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        CostKind);

    // Prefer the fused mul-acc reduction if it beats the sum of components.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
             !TheLoop->isLoopInvariant(V: RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
    auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
    InstructionCost RedCost = TTI.getExtendedReductionCost(
        Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        FMF: RdxDesc.getFastMathFlags(), CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
             match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
    if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Val: Op0);
      Type *Op0Ty = Op0->getOperand(i: 0)->getType();
      Type *Op1Ty = Op1->getOperand(i: 0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);

      // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
          CostKind);
      // The smaller operand needs an extra extend up to the largest type.
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
            Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
            CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
      // Matched reduce.add(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned: true, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No fused pattern won: the root gets the base reduction cost; all other
  // instructions in the pattern fall back to the default cost model.
  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
5612
5613InstructionCost
5614LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5615 ElementCount VF) {
5616 // Calculate scalar cost only. Vectorization cost should be ready at this
5617 // moment.
5618 if (VF.isScalar()) {
5619 Type *ValTy = getLoadStoreType(I);
5620 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5621 const Align Alignment = getLoadStoreAlignment(I);
5622 unsigned AS = getLoadStoreAddressSpace(I);
5623
5624 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5625 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5626 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5627 OpdInfo: OpInfo, I);
5628 }
5629 return getWideningCost(I, VF);
5630}
5631
5632InstructionCost
5633LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5634 ElementCount VF) const {
5635
5636 // There is no mechanism yet to create a scalable scalarization loop,
5637 // so this is currently Invalid.
5638 if (VF.isScalable())
5639 return InstructionCost::getInvalid();
5640
5641 if (VF.isScalar())
5642 return 0;
5643
5644 InstructionCost Cost = 0;
5645 Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5646 if (!RetTy->isVoidTy() &&
5647 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5648
5649 TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None;
5650 if (isa<LoadInst>(Val: I))
5651 VIC = TTI::VectorInstrContext::Load;
5652 else if (isa<StoreInst>(Val: I))
5653 VIC = TTI::VectorInstrContext::Store;
5654
5655 for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
5656 Cost += TTI.getScalarizationOverhead(
5657 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5658 /*Insert=*/true, /*Extract=*/false, CostKind,
5659 /*ForPoisonSrc=*/true, VL: {}, VIC);
5660 }
5661 }
5662
5663 // Some targets keep addresses scalar.
5664 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
5665 return Cost;
5666
5667 // Some targets support efficient element stores.
5668 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
5669 return Cost;
5670
5671 // Collect operands to consider.
5672 CallInst *CI = dyn_cast<CallInst>(Val: I);
5673 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5674
5675 // Skip operands that do not require extraction/scalarization and do not incur
5676 // any overhead.
5677 SmallVector<Type *> Tys;
5678 for (auto *V : filterExtractingOperands(Ops, VF))
5679 Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));
5680
5681 TTI::VectorInstrContext OperandVIC = isa<StoreInst>(Val: I)
5682 ? TTI::VectorInstrContext::Store
5683 : TTI::VectorInstrContext::None;
5684 return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC: OperandVIC);
5685}
5686
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // Decide, for every memory instruction in the loop and this VF, how it will
  // be widened (wide op, reverse, interleave, gather/scatter, or scalarize)
  // and record the decision together with its cost.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores.  Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts  and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't necessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF) ?
          getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
                                 : InstructionCost::getInvalid();

        // Choose better solution for the current VF,  Note that Invalid
        // costs compare as maximally large.  If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        const auto *Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      // Gather/scatter and scalarization costs are per-access, so multiply by
      // the number of group members to compare against the group-wide
      // interleave cost.
      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (const auto *Group = getInterleavedAccessGroup(Instr: &I)) {
        if (Decision == CM_Scalarize) {
          // When scalarizing, each member is costed individually.
          for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
            if (auto *I = Group->getMember(Index: Idx)) {
              setWideningDecision(I, VF, W: Decision,
                                  Cost: getMemInstScalarizationCost(I, VF));
            }
          }
        } else {
          setWideningDecision(Grp: Group, VF, W: Decision, Cost);
        }
      } else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // (Transitive closure over in-loop operands, stopping at phis.)
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if (TheLoop->contains(Inst: InstOp) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
    // If there are direct memory op users of the newly scalarized load,
    // their cost may have changed because there's no scalarization
    // overhead for the operand. Update it.
    for (User *U : LI->users()) {
      if (!isa<LoadInst, StoreInst>(Val: U))
        continue;
      if (getWideningDecision(I: cast<Instruction>(Val: U), VF) != CM_Scalarize)
        continue;
      setWideningDecision(
          I: cast<Instruction>(Val: U), VF, W: CM_Scalarize,
          Cost: getMemInstScalarizationCost(I: cast<Instruction>(Val: U), VF));
    }
  };
  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (!isPredicatedInst(I) &&
          (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
           (!Legal->isUniformMemOp(I&: *I, VF) && Decision == CM_Scalarize))) {
        // Scalarize a widened load of address or update the cost of a scalar
        // load of an address. The cost is VF scalar loads (no scalarization
        // overhead, since the loaded values feed scalar address computation).
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
        UpdateMemOpUserCost(cast<LoadInst>(Val: I));
      } else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize all members of this interleaved group when any member
        // is used as an address. The address-used load skips scalarization
        // overhead, other members include it.
        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
          if (Instruction *Member = Group->getMember(Index: Idx)) {
            InstructionCost Cost =
                AddrDefs.contains(Ptr: Member)
                    ? (VF.getKnownMinValue() *
                       getMemoryInstructionCost(I: Member,
                                                VF: ElementCount::getFixed(MinVal: 1)))
                    : getMemInstScalarizationCost(I: Member, VF);
            setWideningDecision(I: Member, VF, W: CM_Scalarize, Cost);
            UpdateMemOpUserCost(cast<LoadInst>(Val: Member));
          }
        }
      }
    } else {
      // Cannot scalarize fixed-order recurrence phis at the moment.
      if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
        continue;

      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
    }
  }
}
5912
/// Pre-compute and cache the widening decision for every call instruction in
/// the loop at vectorization factor \p VF: scalarize the call, call a vector
/// library variant, or call a vector intrinsic — whichever valid option is
/// cheapest. Results are recorded via setCallWideningDecision.
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF");

  auto ForcedScalar = ForcedScalars.find(Val: VF);
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(Val: &I);

      if (!CI)
        continue;

      // Costs for the three candidate strategies; invalid until computed.
      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(Elt: ArgOp->getType());

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      if (VF.isFixed()) {
        InstructionCost ScalarCallCost =
            TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);

        // Compute costs of unpacking argument values for the scalar calls and
        // packing the return values to a vector.
        InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);
        ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
      } else {
        // There is no point attempting to calculate the scalar cost for a
        // scalable VF as we know it will be Invalid.
        assert(!getScalarizationOverhead(CI, VF).isValid() &&
               "Unexpected valid cost for scalarizing scalable vectors");
        ScalarCost = InstructionCost::getInvalid();
      }

      // Honor ForcedScalars and UniformAfterVectorization decisions.
      // TODO: For calls, it might still be more profitable to widen. Use
      // VPlan-based cost model to compare different options.
      if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                             ForcedScalar->second.contains(Ptr: CI)) ||
                            isUniformAfterVectorization(I: CI, VF))) {
        setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
                                IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
                                Cost: ScalarCost);
        continue;
      }

      bool MaskRequired = Legal->isMaskRequired(I: CI);
      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
        if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
          setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
                                  IID: getVectorIntrinsicIDForCall(CI, TLI),
                                  MaskPos: std::nullopt, Cost: *RedCost);
          continue;
        }

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
                                              L: TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            if (!match(S: SE->getSCEV(V: ScalarParam),
                       P: m_scev_AffineAddRec(
                           Op0: m_SCEV(), Op1: m_scev_SpecificSInt(V: Param.LinearStepOrPos),
                           L: m_SpecificLoop(L: TheLoop))))
              ParamsOk = false;
            break;
          }
          case VFParamKind::GlobalPredicate:
            break;
          default:
            // Any other parameter kind is unsupported by the cost model.
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
        FuncInfo = Info;
        break;
      }

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Start from scalarization and let the cheaper valid alternatives win.
      // Note the use of <= below: on a tie the vector call / intrinsic is
      // preferred over scalarization.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost.isValid() && VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
                              MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
6069
6070bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6071 if (!Legal->isInvariant(V: Op))
6072 return false;
6073 // Consider Op invariant, if it or its operands aren't predicated
6074 // instruction in the loop. In that case, it is not trivially hoistable.
6075 auto *OpI = dyn_cast<Instruction>(Val: Op);
6076 return !OpI || !TheLoop->contains(Inst: OpI) ||
6077 (!isPredicatedInst(I: OpI) &&
6078 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6079 all_of(Range: OpI->operands(),
6080 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6081}
6082
/// Estimate the cost of vectorizing instruction \p I at vectorization factor
/// \p VF, taking into account earlier uniformity, scalarization-profitability
/// and widening decisions. Returns an invalid cost when the instruction
/// cannot be vectorized at this VF.
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      // Cost is the scalar cost replicated once per lane.
      return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
             VF.getKnownMinValue();
  }

  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto *SE = PSE.getSE();

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    // Sanity-check helper: true when only a single scalar copy of I will be
    // emitted (used only in the assertion below).
    [[maybe_unused]] auto HasSingleCopyAfterVectorization =
        [this](Instruction *I, ElementCount VF) -> bool {
      if (VF.isScalar())
        return true;

      auto Scalarized = InstsToScalarize.find(Key: VF);
      assert(Scalarized != InstsToScalarize.end() &&
             "VF not yet analyzed for scalarization profitability");
      return !Scalarized->second.count(Key: I) &&
             llvm::all_of(Range: I->users(), P: [&](User *U) {
               auto *UI = cast<Instruction>(Val: U);
               return !Scalarized->second.count(Key: UI);
             });
    };

    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);

  // Legalization may not be able to split this vector type; bail out.
  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(Tp: VectorTy))
    return InstructionCost::getInvalid();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::UncondBr:
  case Instruction::CondBr: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced by
    // a single branch controlling the loop, so there is no extra overhead from
    // scalarization.
    bool ScalarPredicatedBB = false;
    CondBrInst *BI = dyn_cast<CondBrInst>(Val: I);
    if (VF.isVector() && BI &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (TTI.getScalarizationOverhead(
                  Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
                  /*Insert*/ false, /*Extract*/ true, CostKind) +
              (TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind) *
               VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::UncondBr, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    if (VF.isScalar())
      return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
    // A vectorized switch becomes one vector compare per case.
    auto *Switch = cast<SwitchInst>(Val: I);
    return Switch->getNumCases() *
           TTI.getCmpSelInstrCost(
               Opcode: Instruction::ICmp,
               ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
               CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
               VecPred: CmpInst::ICMP_EQ, CostKind);
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Splice mask selecting the last lane of the previous iteration's
      // vector followed by the first VF-1 lanes of the current one.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                DstTy: cast<VectorType>(Val: VectorTy),
                                SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(Val: U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(Key: HeaderUser);
        if (Iter != ReductionVars.end() &&
            RecurrenceDescriptor::isAnyOfRecurrenceKind(
                Kind: Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(C&: Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
                 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }

    // When tail folding with EVL, if the phi is part of an out of loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
        Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
      IntrinsicCostAttributes ICA(
          Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
          {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    if (VF.isVector() && isPredicatedInst(I)) {
      // Choose between scalarizing the div/rem and using a safe divisor,
      // whichever the speculation analysis deems cheaper.
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      InstructionCost MulCost = TTI::TCC_Free;
      ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(C&: I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
          PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
         (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
          PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
        PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
        isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
      // SCEV proved Op2 constant; use the constant for operand-info purposes.
      Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        shouldConsiderInvariant(Op: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And,
          Ty: VectorTy, CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: {Op0, Op1}, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
                                  Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();

    if (canTruncateToMinimalBitwidth(I, VF)) {
      [[maybe_unused]] Instruction *Op0AsInstruction =
          dyn_cast<Instruction>(Val: I->getOperand(i: 0));
      assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
              MinBWs[I] == MinBWs[Op0AsInstruction]) &&
             "if both the operand and the compare are marked for "
             "truncation, they must have the same bitwidth");
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
    }

    VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(
        Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
        VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free; they disappear after vectorization.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return isPredicatedInst(I) ? TTI::CastContextHint::Masked
                                   : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;

    if (canTruncateToMinimalBitwidth(I, VF)) {
      // If the result type is <= the source type, there will be no extend
      // after truncating the users to the minimal required bitwidth.
      if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
          (I->getOpcode() == Instruction::ZExt ||
           I->getOpcode() == Instruction::SExt))
        return 0;
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(U: I, CostKind);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: RetTy, CostKind);
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
6520
/// Populate ValuesToIgnore / VecValuesToIgnore with instructions that will
/// not generate code in the vectorized loop: ephemeral values, pointer
/// operands of non-insert-position interleave-group members, trivially-dead
/// computations (propagated via a worklist), branches around dead blocks,
/// and reduction/induction cast instructions.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);

  // Worklists seeded below; both grow while being iterated, which is why the
  // loops over them use an index instead of iterators.
  SmallVector<Value *, 4> DeadInterleavePointerOps;
  SmallVector<Value *, 4> DeadOps;

  // If a scalar epilogue is required, users outside the loop won't use
  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
  // that is the case.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
  };

  // Visit blocks in reverse RPO and instructions bottom-up so users are seen
  // before their operands.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
    for (Instruction &I : reverse(C&: *BB)) {
      if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
        continue;

      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps to seed worklist.
      if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
          all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
            return VecValuesToIgnore.contains(Ptr: U) ||
                   ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
          }))
        DeadOps.push_back(Elt: &I);

      // For interleave groups, we only create a pointer for the start of the
      // interleave group. Queue up addresses of group members except the insert
      // position for further processing.
      if (isAccessInterleaved(Instr: &I)) {
        auto *Group = getInterleavedAccessGroup(Instr: &I);
        if (Group->getInsertPos() == &I)
          continue;
        Value *PointerOp = getLoadStorePointerOperand(V: &I);
        DeadInterleavePointerOps.push_back(Elt: PointerOp);
      }

      // Queue branches for analysis. They are dead, if their successors only
      // contain dead instructions.
      if (isa<CondBrInst>(Val: &I))
        DeadOps.push_back(Elt: &I);
    }

  // Mark ops feeding interleave group members as free, if they are only used
  // by other dead computations.
  for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
    if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
          Instruction *UI = cast<Instruction>(Val: U);
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 (!isAccessInterleaved(Instr: UI) ||
                  getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
        }))
      continue;
    VecValuesToIgnore.insert(Ptr: Op);
    // Op's operands may now be dead too; extend the worklist.
    append_range(C&: DeadInterleavePointerOps, R: Op->operands());
  }

  // Mark ops that would be trivially dead and are only used by ignored
  // instructions as free.
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if the block contains only dead instructions. Such blocks will
  // be removed by VPlan-to-VPlan transforms and won't be considered by the
  // VPlan-based cost model, so skip them in the legacy cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(Range&: *BB, P: [this](Instruction &I) {
      return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
             isa<UncondBrInst>(Val: &I);
    });
  };
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<CondBrInst>(Val: Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(i: 0);
      BasicBlock *ElseBB = Br->getSuccessor(i: 1);
      // Don't consider branches leaving the loop for simplification.
      if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      // The branch is dead if both successors are empty, or one is empty and
      // simply falls through to the other (which must not have phis).
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Ptr: Br);
        DeadOps.push_back(Elt: Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Inst: Op) ||
        (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
        any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
        }))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Range: Op->users(),
               P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
      ValuesToIgnore.insert(Ptr: Op);

    VecValuesToIgnore.insert(Ptr: Op);
    append_range(C&: DeadOps, R: Op->operands());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    VecValuesToIgnore.insert_range(R: IndDes.getCastInsts());
  }
}
6656
// Decide, for each reduction PHI in the loop, whether the reduction should be
// performed "in-loop" (accumulated via a chain of scalar/ordered reduction ops
// inside the vector loop body) instead of as a wide out-of-loop reduction.
// Populates InLoopReductions and InLoopReductionImmediateChains, which the
// cost model consults later.
void LoopVectorizationCostModel::collectInLoopReductions() {
  // Avoid duplicating work finding in-loop reductions.
  if (!InLoopReductions.empty())
    return;

  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
    // separately and should not be considered for in-loop reductions.
    if (RdxDesc.hasUsesOutsideReductionChain())
      continue;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // In-loop AnyOf and FindIV reductions are not yet supported.
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
        RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) ||
        RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such. Ordered (strict FP) reductions and the
    // PreferInLoopReductions flag also force the in-loop form.
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value. An empty chain means the in-loop form is not possible.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, L: TheLoop);
    bool InLoop = !ReductionOperations.empty();

    if (InLoop) {
      InLoopReductions.insert(Ptr: Phi);
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      // Each entry maps a reduction op to its immediate predecessor in the
      // chain, with the PHI as the chain's root.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}
6708
6709// This function will select a scalable VF if the target supports scalable
6710// vectors and a fixed one otherwise.
6711// TODO: we could return a pair of values that specify the max VF and
6712// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6713// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6714// doesn't have a cost model that can choose which plan to execute if
6715// more than one is generated.
6716static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6717 LoopVectorizationCostModel &CM) {
6718 unsigned WidestType;
6719 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6720
6721 TargetTransformInfo::RegisterKind RegKind =
6722 TTI.enableScalableVectorization()
6723 ? TargetTransformInfo::RGK_ScalableVector
6724 : TargetTransformInfo::RGK_FixedWidthVector;
6725
6726 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6727 unsigned N = RegSize.getKnownMinValue() / WidestType;
6728 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6729}
6730
// Plan vectorization for the VPlan-native (outer-loop) path. Returns a
// VectorizationFactor wrapping the chosen VF (user-provided or computed), or
// Disabled() when no plan could be built or the request is unsupported. Cost
// values are dummies: the native path has no cost model yet.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // A scalable user VF cannot be honored on a target without scalable
      // vectors (unless testing forces it); report and bail out.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target",
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(MinVF: VF, MaxVF: VF);

    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
             "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
6785
// Build VPlans for the candidate vectorization factors of the innermost loop:
// compute the maximum safe fixed/scalable VFs, prime the cost-model state
// (values to ignore, widening decisions, in-loop reductions), then construct
// plans either for the user-provided VF or for all power-of-two candidates.
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  // A user VF is honored only if it does not exceed the maximum safe VF of
  // the matching kind (fixed vs. scalable) and has valid costs.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
      reportVectorizationInfo(
          Msg: "UserVF ignored because it may be larger than the maximal safe VF",
          ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
                              ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates: every power of two up to the
  // maximum fixed and scalable factors.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}
6856
6857InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6858 ElementCount VF) const {
6859 InstructionCost Cost = CM.getInstructionCost(I: UI, VF);
6860 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6861 return InstructionCost(ForceTargetInstructionCost);
6862 return Cost;
6863}
6864
6865bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6866 ElementCount VF) const {
6867 return CM.isUniformAfterVectorization(I, VF);
6868}
6869
6870bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6871 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6872 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6873 SkipCostComputation.contains(Ptr: UI);
6874}
6875
6876unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
6877 return CM.getPredBlockCostDivisor(CostKind, BB);
6878}
6879
// Pre-compute, via the legacy cost model, the cost of instructions whose
// VPlan recipes would otherwise be costed inaccurately (induction phis and
// increments, exit conditions, branches, forced-scalar and
// profitable-to-scalarize instructions). Each costed instruction is added to
// CostCtx.SkipCostComputation so the VPlan-based walk does not count it twice.
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
    // Worklist walk: the increment plus any single-use, in-loop instruction
    // operands transitively feeding it.
    SmallVector<Instruction *> IVInsts = {IVInc};
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Val: Op);
        if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(Elt: OpI);
      }
    }
    IVInsts.push_back(Elt: IV);
    // Also include truncates of the IV that will be folded into the widened
    // induction recipe.
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(Val: U);
      if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
        continue;
      IVInsts.push_back(Elt: CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
                                           InstsToIgnore&: CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(Ptr: IVInst);
    }
  }

  // Compute the cost of all exiting conditions of the loop using the legacy
  // cost model. This is to match the legacy behavior, which adds the cost of
  // all exit conditions. Note that this over-estimates the cost, as there will
  // be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<CondBrInst>(Val: EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
      ExitInstrs.insert(X: CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(Inst: CondI) ||
        !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    // Grow the worklist with operands used exclusively by exit conditions.
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Val: Op);
      if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
          any_of(Range: OpI->users(), P: [&ExitInstrs](User *U) {
            return !ExitInstrs.contains(key: cast<Instruction>(Val: U));
          }))
        continue;
      ExitInstrs.insert(X: OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Don't apply special costs when instruction cost is forced to make sure the
  // forced cost is used for each recipe.
  if (ForceTargetInstructionCost.getNumOccurrences())
    return Cost;

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}
7025
// Return the total cost of \p Plan at factor \p VF: the legacy-model
// pre-computed costs (inductions, exit conditions, branches, scalarized
// instructions) plus the VPlan-based recipe costs.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, Ctx&: CostCtx);
#ifndef NDEBUG
  // Debug-only reporting of the estimated cost per lane, using the tuning
  // vscale to estimate the element count for scalable VFs.
  unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}
7046
7047#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplification that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  using namespace VPlanPatternMatch;
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  // Check if a select for a safe divisor was hoisted to the pre-header. If so,
  // the select doesn't need to be considered for the vector loop cost; go with
  // the more accurate VPlan-based cost model.
  for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
    auto *VPI = dyn_cast<VPInstruction>(&R);
    if (!VPI || VPI->getOpcode() != Instruction::Select)
      continue;

    if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
      switch (WR->getOpcode()) {
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
    }
  }

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      // All members of an interleave group are covered by the group's recipe.
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost it whilst the legacy will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        if (none_of(FOR->users(),
                    match_fn(m_VPInstruction<
                             VPInstruction::FirstOrderRecurrenceSplice>())))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reductions and
      // comparing against the legacy cost isn't desirable.
      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
        if (VPR->isPartialReduction())
          return true;

      // The VPlan-based cost model can analyze if recipes are scalar
      // recursively, but the legacy cost model cannot.
      if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
        auto *AddrI = dyn_cast<Instruction>(
            getLoadStorePointerOperand(&WidenMemR->getIngredient()));
        if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
                         CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
          return true;

        if (WidenMemR->isReverse()) {
          // If the stored value of a reverse store is invariant, LICM will
          // hoist the reverse operation to the preheader. In this case, the
          // result of the VPlan-based cost model will diverge from that of
          // the legacy model.
          if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;

          if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;
        }
      }

      // The legacy cost model costs non-header phis with a scalar VF as a phi,
      // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
      if (isa<VPBlendRecipe>(&R) &&
          vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
        return true;

      // If a VPlan transform folded a recipe to one producing a single-scalar,
      // but the original instruction wasn't uniform-after-vectorization in the
      // legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        CmpPredicate Pred;
        if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
            cast<VPRecipeWithIRFlags>(R).getPredicate() !=
                cast<CmpInst>(UI)->getPredicate())
          return true;

        // Recipes with underlying instructions being moved out of the loop
        // region by LICM may cause discrepancies between the legacy cost model
        // and the VPlan-based cost model.
        if (!VPBB->getEnclosingLoopRegion())
          return true;

        SeenInstrs.insert(UI);
      }
    }
  }

  // If a reverse recipe has been sunk to the middle block (e.g., for a load
  // whose result is only used as a live-out), VPlan avoids the per-iteration
  // reverse shuffle cost that the legacy model accounts for.
  if (any_of(*Plan.getMiddleBlock(), [](const VPRecipeBase &R) {
        return match(&R, m_VPInstruction<VPInstruction::Reverse>());
      }))
    return true;

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may
      // not be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
7196#endif
7197
// Select the most profitable vectorization factor across all built VPlans
// using the VPlan-based cost model, falling back to the scalar factor when no
// vector factor beats it (unless vectorization is forced). In assert builds,
// the decision is cross-checked against the legacy cost model.
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                    << (CM.CostKind == TTI::TCK_RecipThroughput
                            ? "Reciprocal Throughput\n"
                        : CM.CostKind == TTI::TCK_Latency
                            ? "Instruction Latency\n"
                        : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                        : CM.CostKind == TTI::TCK_SizeAndLatency
                            ? "Code Size and Latency\n"
                            : "Unknown\n"));

  ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Only compute register usage when at least one candidate VF requires a
    // register-pressure check.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (any_of(Range&: VFs, P: [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      InstructionCost Cost = cost(Plan&: *P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

      if (CM.shouldConsiderRegPressureForVF(VF) &&
          RUs[I].exceedsMaxNumRegs(TTI, OverrideMaxNumRegs: ForceTargetNumVectorRegs)) {
        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                          << VF << " because it uses too many registers\n");
        continue;
      }

      if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
        BestFactor = CurrentFactor;

      // If profitable add it to ProfitableVF list.
      if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
        ProfitableVFs.push_back(Elt: CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost-model.
  // This is now only used to verify the decisions by the new VPlan-based
  // cost-model and will be retired once the VPlan-based cost-model is
  // stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause a
  // different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, CM.PSE,
                        OrigLoop);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // Verify that the VPlan-based and legacy cost models agree, except for
  // * VPlans with early exits,
  // * VPlans with additional VPlan simplifications,
  // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
  //   vp_scatter/vp_gather).
  // The legacy cost model doesn't properly model costs for such loops.
  bool UsesEVLGatherScatter =
      any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
                 BestPlan.getVectorLoopRegion()->getEntry())),
             [](VPBasicBlock *VPBB) {
               return any_of(*VPBB, [](VPRecipeBase &R) {
                 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
                        !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
               });
             });
  assert(
      (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
       !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
       planContainsAdditionalSimplifications(
           getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
       planContainsAdditionalSimplifications(
           getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
      " VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}
7328
7329DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7330 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7331 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7332 assert(BestVPlan.hasVF(BestVF) &&
7333 "Trying to execute plan with unsupported VF");
7334 assert(BestVPlan.hasUF(BestUF) &&
7335 "Trying to execute plan with unsupported UF");
7336 if (BestVPlan.hasEarlyExit())
7337 ++LoopsEarlyExitVectorized;
7338 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7339 // cost model is complete for better cost estimates.
7340 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
7341 RUN_VPLAN_PASS(VPlanTransforms::materializePacksAndUnpacks, BestVPlan);
7342 RUN_VPLAN_PASS(VPlanTransforms::materializeBroadcasts, BestVPlan);
7343 RUN_VPLAN_PASS(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
7344 bool HasBranchWeights =
7345 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
7346 if (HasBranchWeights) {
7347 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7348 RUN_VPLAN_PASS(VPlanTransforms::addBranchWeightToMiddleTerminator,
7349 BestVPlan, BestVF, VScale);
7350 }
7351
7352 // Checks are the same for all VPlans, added to BestVPlan only for
7353 // compactness.
7354 attachRuntimeChecks(Plan&: BestVPlan, RTChecks&: ILV.RTChecks, HasBranchWeights);
7355
7356 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7357 VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());
7358
7359 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7360 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
7361 VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan);
7362 if (BestVPlan.getEntry()->getSingleSuccessor() ==
7363 BestVPlan.getScalarPreheader()) {
7364 // TODO: The vector loop would be dead, should not even try to vectorize.
7365 ORE->emit(RemarkBuilder: [&]() {
7366 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
7367 OrigLoop->getStartLoc(),
7368 OrigLoop->getHeader())
7369 << "Created vector loop never executes due to insufficient trip "
7370 "count.";
7371 });
7372 return DenseMap<const SCEV *, Value *>();
7373 }
7374
7375 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7376
7377 VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan);
7378 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
7379 VPlanTransforms::convertEVLExitCond(Plan&: BestVPlan);
7380 // Regions are dissolved after optimizing for VF and UF, which completely
7381 // removes unneeded loop regions first.
7382 VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
7383 // Expand BranchOnTwoConds after dissolution, when latch has direct access to
7384 // its successors.
7385 VPlanTransforms::expandBranchOnTwoConds(Plan&: BestVPlan);
7386 // Convert loops with variable-length stepping after regions are dissolved.
7387 VPlanTransforms::convertToVariableLengthStep(Plan&: BestVPlan);
7388 VPlanTransforms::materializeBackedgeTakenCount(Plan&: BestVPlan, VectorPH);
7389 VPlanTransforms::materializeVectorTripCount(
7390 Plan&: BestVPlan, VectorPHVPBB: VectorPH, TailByMasking: CM.foldTailByMasking(),
7391 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: BestVF.isVector()), Step: &BestVPlan.getVFxUF());
7392 VPlanTransforms::materializeFactors(Plan&: BestVPlan, VectorPH, VF: BestVF);
7393 VPlanTransforms::cse(Plan&: BestVPlan);
7394 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
7395
7396 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7397 // making any changes to the CFG.
7398 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7399 VPlanTransforms::expandSCEVs(Plan&: BestVPlan, SE&: *PSE.getSE());
7400 if (!ILV.getTripCount()) {
7401 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7402 } else {
7403 assert(VectorizingEpilogue && "should only re-use the existing trip "
7404 "count during epilogue vectorization");
7405 }
7406
7407 // Perform the actual loop transformation.
7408 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7409 OrigLoop->getParentLoop(),
7410 Legal->getWidestInductionType());
7411
7412#ifdef EXPENSIVE_CHECKS
7413 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7414#endif
7415
7416 // 1. Set up the skeleton for vectorization, including vector pre-header and
7417 // middle block. The vector loop is created during VPlan execution.
7418 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7419 replaceVPBBWithIRVPBB(VPBB: BestVPlan.getScalarPreheader(),
7420 IRBB: State.CFG.PrevBB->getSingleSuccessor(), Plan: &BestVPlan);
7421 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7422
7423 assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");
7424
7425 // After vectorization, the exit blocks of the original loop will have
7426 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7427 // looked through single-entry phis.
7428 ScalarEvolution &SE = *PSE.getSE();
7429 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7430 if (!Exit->hasPredecessors())
7431 continue;
7432 for (VPRecipeBase &PhiR : Exit->phis())
7433 SE.forgetLcssaPhiWithNewPredecessor(L: OrigLoop,
7434 V: &cast<VPIRPhi>(Val&: PhiR).getIRPhi());
7435 }
7436 // Forget the original loop and block dispositions.
7437 SE.forgetLoop(L: OrigLoop);
7438 SE.forgetBlockAndLoopDispositions();
7439
7440 ILV.printDebugTracesAtStart();
7441
7442 //===------------------------------------------------===//
7443 //
7444 // Notice: any optimization or new instruction that go
7445 // into the code below should also be implemented in
7446 // the cost-model.
7447 //
7448 //===------------------------------------------------===//
7449
7450 // Retrieve loop information before executing the plan, which may remove the
7451 // original loop, if it becomes unreachable.
7452 MDNode *LID = OrigLoop->getLoopID();
7453 unsigned OrigLoopInvocationWeight = 0;
7454 std::optional<unsigned> OrigAverageTripCount =
7455 getLoopEstimatedTripCount(L: OrigLoop, EstimatedLoopInvocationWeight: &OrigLoopInvocationWeight);
7456
7457 BestVPlan.execute(State: &State);
7458
7459 // 2.6. Maintain Loop Hints
7460 // Keep all loop hints from the original loop on the vector loop (we'll
7461 // replace the vectorizer-specific hints below).
7462 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
7463 // Add metadata to disable runtime unrolling a scalar loop when there
7464 // are no runtime checks about strides and memory. A scalar loop that is
7465 // rarely used is not worth unrolling.
7466 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7467 updateLoopMetadataAndProfileInfo(
7468 VectorLoop: HeaderVPBB ? LI->getLoopFor(BB: State.CFG.VPBB2IRBB.lookup(Val: HeaderVPBB))
7469 : nullptr,
7470 HeaderVPBB, Plan: BestVPlan, VectorizingEpilogue, OrigLoopID: LID, OrigAverageTripCount,
7471 OrigLoopInvocationWeight,
7472 EstimatedVFxUF: estimateElementCount(VF: BestVF * BestUF, VScale: CM.getVScaleForTuning()),
7473 DisableRuntimeUnroll);
7474
7475 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7476 // predication, updating analyses.
7477 ILV.fixVectorizedLoop(State);
7478
7479 ILV.printDebugTracesAtEnd();
7480
7481 return ExpandedSCEVs;
7482}
7483
7484//===--------------------------------------------------------------------===//
7485// EpilogueVectorizerMainLoop
7486//===--------------------------------------------------------------------===//
7487
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// It emits two iteration-count checks in front of the vector preheader: the
/// epilogue check first (so the short path through the vector epilogue stays
/// short), then the main-loop check. Returns the block that will serve as the
/// main vector loop's preheader predecessor.
BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
  BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
  BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Successor 1 is the branch's false destination, i.e. the block that does
  // not bypass to the scalar preheader; emit the main-loop check there.
  VectorPH = cast<CondBrInst>(Val: EPI.EpilogueIterationCountCheck->getTerminator())
                 ->getSuccessor(i: 1);
  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: false);

  // Again take the false (non-bypass) successor of the main-loop check.
  return cast<CondBrInst>(Val: EPI.MainLoopIterationCountCheck->getTerminator())
      ->getSuccessor(i: 1);
}
7514
7515void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7516 LLVM_DEBUG({
7517 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7518 << "Main Loop VF:" << EPI.MainLoopVF
7519 << ", Main Loop UF:" << EPI.MainLoopUF
7520 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7521 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7522 });
7523}
7524
7525void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7526 DEBUG_WITH_TYPE(VerboseDebug, {
7527 dbgs() << "intermediate fn:\n"
7528 << *OrigLoop->getHeader()->getParent() << "\n";
7529 });
7530}
7531
/// Emit a minimum-iteration-count check for either the main vector loop
/// (\p ForEpilogue == false) or the vector epilogue loop
/// (\p ForEpilogue == true) in \p VectorPH, branching to \p Bypass when the
/// check fails. Returns the block containing the check; the actual vector
/// preheader is split off below it.
BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
    BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block.");
  Value *Count = getTripCount();
  // NOTE(review): MinProfitableTripCount is reset to a fixed 0 here,
  // presumably so only the VF/UF-based minimum is enforced by the check
  // below -- confirm against createIterationCountCheck.
  MinProfitableTripCount = ElementCount::getFixed(MinVal: 0);
  Value *CheckMinIters = createIterationCountCheck(
      VectorPH, VF: ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
      UF: ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);

  BasicBlock *const TCCheckBlock = VectorPH;
  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  VectorPH = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                        DT: static_cast<DominatorTree *>(nullptr), LI, MSSAU: nullptr,
                        BBName: "vector.ph");
  if (ForEpilogue) {
    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  } else {
    VectorPHVPBB = replaceVPBBWithIRVPBB(VPBB: VectorPHVPBB, IRBB: VectorPH);
  }

  // Replace the split block's unconditional terminator: branch to Bypass when
  // the minimum-iterations check fails, otherwise into the new vector.ph.
  CondBrInst &BI = *CondBrInst::Create(Cond: CheckMinIters, IfTrue: Bypass, IfFalse: VectorPH);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  // When vectorizing the main loop, its trip-count check is placed in a new
  // block, whereas the overall trip-count check is placed in the VPlan entry
  // block. When vectorizing the epilogue loop, its trip-count check is placed
  // in the VPlan entry block.
  if (!ForEpilogue)
    introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock);
  return TCCheckBlock;
}
7571
7572//===--------------------------------------------------------------------===//
7573// EpilogueVectorizerEpilogueLoop
7574//===--------------------------------------------------------------------===//
7575
/// This function creates a new scalar preheader, using the previous one as
/// entry block to the epilogue VPlan. The minimum iteration check is being
/// represented in VPlan. Returns the old scalar preheader, renamed to
/// "vec.epilog.iter.check".
BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
  BasicBlock *NewScalarPH = createScalarPreheader(Prefix: "vec.epilog.");
  BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
  OriginalScalarPH->setName("vec.epilog.iter.check");
  // Wrap the original scalar preheader in a VPIRBasicBlock and migrate all
  // movable recipes from the plan's current entry into it.
  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: OriginalScalarPH);
  VPBasicBlock *OldEntry = Plan.getEntry();
  for (auto &R : make_early_inc_range(Range&: *OldEntry)) {
    // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable
    // by definition: they wrap fixed IR in their home block.
    if (isa<VPIRInstruction>(Val: &R))
      continue;
    R.moveBefore(BB&: *NewEntry, I: NewEntry->end());
  }

  // Rewire successors/predecessors from OldEntry to NewEntry and make the
  // latter the plan's entry block.
  VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
  Plan.setEntry(NewEntry);
  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.

  return OriginalScalarPH;
}
7599
7600void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7601 LLVM_DEBUG({
7602 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7603 << "Epilogue Loop VF:" << EPI.EpilogueVF
7604 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7605 });
7606}
7607
7608void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7609 DEBUG_WITH_TYPE(VerboseDebug, {
7610 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7611 });
7612}
7613
/// Try to build a widened recipe for the load/store \p VPI, clamping \p Range
/// to the VFs for which the cost model chose widening. Returns nullptr when
/// the access should be handled differently (e.g. scalarized).
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
                                                VFRange &Range) {
  assert((VPI->getOpcode() == Instruction::Load ||
          VPI->getOpcode() == Instruction::Store) &&
         "Must be called with either a load or store");
  Instruction *I = VPI->getUnderlyingInstr();

  // Widen iff the cost model did not decide to scalarize this access.
  // CM_Interleave counts as "will widen" here; interleave-group members are
  // later replaced by a single VPInterleaveRecipe (see buildVPlans...).
  auto WillWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
    return nullptr;

  // If a mask is not required, drop it - use unmasked version for safe loads.
  // TODO: Determine if mask is needed in VPlan.
  VPValue *Mask = Legal->isMaskRequired(I) ? VPI->getMask() : nullptr;

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive. The decision for Range.Start is representative for
  // the whole (already clamped) range.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // The pointer is operand 0 for loads and operand 1 for stores.
  VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(N: 0)
                                                       : VPI->getOperand(N: 1);
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop: drop the GEP no-wrap flags in this case.
      // Otherwise preserve existing flags without no-unsigned-wrap, as we will
      // emit negative indices.
      GEPNoWrapFlags Flags =
          CM.foldTailByMasking() || !GEP
              ? GEPNoWrapFlags::none()
              : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
      VectorPtr = new VPVectorEndPointerRecipe(
          Ptr, &Plan.getVF(), getLoadStoreType(I),
          /*Stride*/ -1, Flags, VPI->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            VPI->getDebugLoc());
    }
    Builder.insert(R: VectorPtr);
    Ptr = VectorPtr;
  }

  if (VPI->getOpcode() == Instruction::Load) {
    auto *Load = cast<LoadInst>(Val: I);
    auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                        *VPI, Load->getDebugLoc());
    if (Reverse) {
      // Reverse accesses load in vector order and then flip the result with an
      // explicit VPInstruction::Reverse.
      Builder.insert(R: LoadR);
      return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
                               LoadR->getDebugLoc());
    }
    return LoadR;
  }

  // Stores: for reverse accesses, flip the value to be stored first.
  StoreInst *Store = cast<StoreInst>(Val: I);
  VPValue *StoredVal = VPI->getOperand(N: 0);
  if (Reverse)
    StoredVal = Builder.createNaryOp(Opcode: VPInstruction::Reverse, Operands: StoredVal,
                                     DL: Store->getDebugLoc());
  return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
                                Reverse, *VPI, Store->getDebugLoc());
}
7697
/// Try to fold the truncate \p VPI of a widened induction into a new
/// VPWidenIntOrFpInductionRecipe, for the VFs in \p Range where the cost model
/// deems the truncate optimizable. Returns nullptr otherwise.
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
                                                VFRange &Range) {
  auto *I = cast<TruncInst>(Val: VPI->getUnderlyingInstr());
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto IsOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(I: K, VF);
    };
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: IsOptimizableIVTruncate(I), Range))
    return nullptr;

  // The truncate's single operand is expected to be defined by a widened
  // induction recipe; reuse its phi, start value and induction descriptor.
  auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
      Val: VPI->getOperand(N: 0)->getDefiningRecipe());
  PHINode *Phi = WidenIV->getPHINode();
  VPIRValue *Start = WidenIV->getStartValue();
  const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();

  // It is always safe to copy over the NoWrap and FastMath flags. In
  // particular, when folding tail by masking, the masked-off lanes are never
  // used, so it is safe.
  VPIRFlags Flags = vputils::getFlagsFromIndDesc(ID: IndDesc);
  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());
  return new VPWidenIntOrFpInductionRecipe(
      Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
}
7735
/// Try to widen the call \p VPI either as a vector intrinsic or as a call to a
/// vector library variant, for the VFs in \p Range where the cost model chose
/// such a widening. Returns nullptr if the call should be handled otherwise
/// (e.g. scalarized).
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
                                                   VFRange &Range) {
  CallInst *CI = cast<CallInst>(Val: VPI->getUnderlyingInstr());
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  // Calls that must be scalarized with predication are not widened here.
  if (IsPredicated)
    return nullptr;

  // Don't widen these intrinsics here; handleReplication has dedicated logic
  // for e.g. assume and lifetime markers.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Collect only the call's argument operands (the VPInstruction may carry
  // extra operands beyond the arguments).
  SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
                                VPI->op_begin() + CI->arg_size());

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
                                      VPI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The call needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the call instruction, but the only
      //      available vector variant at this VF requires a mask, so we
      //      synthesize an all-true mask.
      VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();

      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    // Forward the trailing non-mask operand as well -- presumably the callee
    // operand of the call VPInstruction; confirm against its layout.
    Ops.push_back(Elt: VPI->getOperand(N: VPI->getNumOperandsWithoutMask() - 1));
    return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  return nullptr;
}
7822
7823bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7824 assert((!isa<UncondBrInst, CondBrInst, PHINode, LoadInst, StoreInst>(I)) &&
7825 "Instruction should have been handled earlier");
7826 // Instruction should be widened, unless it is scalar after vectorization,
7827 // scalarization is profitable or it is predicated.
7828 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7829 return CM.isScalarAfterVectorization(I, VF) ||
7830 CM.isProfitableToScalarize(I, VF) ||
7831 CM.isScalarWithPredication(I, VF);
7832 };
7833 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7834 Range);
7835}
7836
/// Build a generic VPWidenRecipe for \p VPI when its opcode is widenable;
/// returns nullptr for unsupported opcodes. Predicated div/rem first get a
/// safe divisor via a select that substitutes 1 on masked-off lanes.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
  auto *I = VPI->getUnderlyingInstr();
  switch (VPI->getOpcode()) {
  default:
    return nullptr;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before
    // widening the div/rem operation itself. Otherwise fall through to the
    // general handling below.
    if (CM.isPredicatedInst(I)) {
      SmallVector<VPValue *> Ops(VPI->operandsWithoutMask());
      VPValue *Mask = VPI->getMask();
      VPValue *One = Plan.getConstantInt(Ty: I->getType(), Val: 1u);
      auto *SafeRHS =
          Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: VPI->getDebugLoc());
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze:
    return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
                             VPI->getDebugLoc());
  case Instruction::ExtractValue: {
    SmallVector<VPValue *> NewOps(VPI->operandsWithoutMask());
    auto *EVI = cast<ExtractValueInst>(Val: I);
    assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
    unsigned Idx = EVI->getIndices()[0];
    // Encode the single extractvalue index as an extra i32 constant operand.
    NewOps.push_back(Elt: Plan.getConstantInt(BitWidth: 32, Val: Idx));
    return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
  }
  };
}
7890
7891VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7892 VPInstruction *VPI) {
7893 // FIXME: Support other operations.
7894 unsigned Opcode = HI->Update->getOpcode();
7895 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7896 "Histogram update operation must be an Add or Sub");
7897
7898 SmallVector<VPValue *, 3> HGramOps;
7899 // Bucket address.
7900 HGramOps.push_back(Elt: VPI->getOperand(N: 1));
7901 // Increment value.
7902 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
7903
7904 // In case of predicated execution (due to tail-folding, or conditional
7905 // execution, or both), pass the relevant mask.
7906 if (Legal->isMaskRequired(I: HI->Store))
7907 HGramOps.push_back(Elt: VPI->getMask());
7908
7909 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7910}
7911
/// Build a VPReplicateRecipe that executes \p VPI's underlying instruction per
/// lane (or only for the first lane when uniform), attaching the block mask
/// when the instruction must be predicated.
VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
                                                      VFRange &Range) {
  auto *I = VPI->getUnderlyingInstr();
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //  1. For the assume intrinsic generating the instruction for the first
      //     lane is still better than not generating any at all. For
      //     example, the input may be a splat across all lanes.
      //  2. For the lifetime start/end intrinsics the pointer operand only
      //     does anything useful when the input comes from a stack object,
      //     which suggests it should always be uniform. For non-stack objects
      //     the effect is to poison the object, which still allows us to
      //     remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = VPI->getMask();
  }

  // Note that there is some custom logic to mark some intrinsics as uniform
  // manually above for scalable vectors, which this assert needs to account for
  // as well.
  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
         "Should not predicate a uniform recipe");
  auto *Recipe =
      new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
                            BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
  return Recipe;
}
7975
/// Dispatch the non-phi VPInstruction \p R to the matching widening builder:
/// induction-truncate, call, histogram, memory access, GEP, cast, or the
/// generic tryToWiden. Returns nullptr when no widening applies for the
/// (clamped) \p Range.
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
                                              VFRange &Range) {
  assert(!R->isPhi() && "phis must be handled earlier");
  // First, check for specific widening recipes that deal with optimizing
  // truncates, calls and memory operations.

  VPRecipeBase *Recipe;
  auto *VPI = cast<VPInstruction>(Val: R);
  if (VPI->getOpcode() == Instruction::Trunc &&
      (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::Call)
    return tryToWidenCall(VPI, Range);

  // Stores feeding a recognized histogram pattern take precedence over the
  // generic memory widening below.
  Instruction *Instr = R->getUnderlyingInstr();
  if (VPI->getOpcode() == Instruction::Store)
    if (auto HistInfo = Legal->getHistogramInfo(I: cast<StoreInst>(Val: Instr)))
      return tryToWidenHistogram(HI: *HistInfo, VPI);

  if (VPI->getOpcode() == Instruction::Load ||
      VPI->getOpcode() == Instruction::Store)
    return tryToWidenMemory(VPI, Range);

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::GetElementPtr)
    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Val: Instr),
                                VPI->operandsWithoutMask(), *VPI,
                                VPI->getDebugLoc());

  if (Instruction::isCast(Opcode: VPI->getOpcode())) {
    auto *CI = cast<CastInst>(Val: Instr);
    auto *CastR = cast<VPInstructionWithType>(Val: VPI);
    return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(N: 0),
                                 CastR->getResultType(), CI, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  return tryToWiden(VPI);
}
8024
/// Build and optimize VPlans covering the VF range [MinVF, MaxVF], appending
/// the candidates to VPlans. Each plan covers a sub-range of VFs.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

  // Create initial base VPlan0, to serve as common starting point for all
  // candidates built later for specific VF ranges.
  auto VPlan0 = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE, LVer: &LVer);

  // Shared preprocessing applied once to VPlan0, before per-range duplication.
  VPlanTransforms::simplifyRecipes(Plan&: *VPlan0);
  VPlanTransforms::handleEarlyExits(Plan&: *VPlan0, HasUncountableExit: Legal->hasUncountableEarlyExit());
  VPlanTransforms::addMiddleCheck(Plan&: *VPlan0, TailFolded: CM.foldTailByMasking());
  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::createLoopRegions, *VPlan0);

  // Create recipes for header phis.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *VPlan0, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: Legal->getReductionVars(), FixedOrderRecurrences: Legal->getFixedOrderRecurrences(),
      InLoopReductions: CM.getInLoopReductions(), AllowReordering: Hints.allowReordering());

  if (CM.foldTailByMasking())
    RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *VPlan0);
  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize,
                           *VPlan0);

  // Build one plan per VF sub-range; tryToBuildVPlanWithVPRecipes may clamp
  // SubRange.End, and the next iteration resumes from the clamped end.
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
      // Now optimize the initial VPlan.
      VPlanTransforms::hoistPredicatedLoads(Plan&: *Plan, PSE, L: OrigLoop);
      VPlanTransforms::sinkPredicatedStores(Plan&: *Plan, PSE, L: OrigLoop);
      RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
                     CM.getMinimalBitwidths());
      RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
      // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
      if (CM.foldTailWithEVL()) {
        RUN_VPLAN_PASS(VPlanTransforms::addExplicitVectorLength, *Plan,
                       CM.getMaxSafeElements());
        RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
      }

      // narrowInterleaveGroups may produce an additional candidate plan; keep
      // both it and the base plan.
      if (auto P = VPlanTransforms::narrowInterleaveGroups(Plan&: *Plan, TTI))
        VPlans.push_back(Elt: std::move(P));

      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(Elt: std::move(Plan));
    }
    VF = SubRange.End;
  }
}
8091
8092VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8093 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8094
8095 using namespace llvm::VPlanPatternMatch;
8096 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8097
8098 // ---------------------------------------------------------------------------
8099 // Build initial VPlan: Scan the body of the loop in a topological order to
8100 // visit each basic block after having visited its predecessor basic blocks.
8101 // ---------------------------------------------------------------------------
8102
8103 bool RequiresScalarEpilogueCheck =
8104 LoopVectorizationPlanner::getDecisionAndClampRange(
8105 Predicate: [this](ElementCount VF) {
8106 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
8107 },
8108 Range);
8109 // Update the branch in the middle block if a scalar epilogue is required.
8110 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8111 if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
8112 auto *BranchOnCond = cast<VPInstruction>(Val: MiddleVPBB->getTerminator());
8113 assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
8114 "second successor must be scalar preheader");
8115 BranchOnCond->setOperand(I: 0, New: Plan->getFalse());
8116 }
8117
8118 // Don't use getDecisionAndClampRange here, because we don't know the UF
8119 // so this function is better to be conservative, rather than to split
8120 // it up into different VPlans.
8121 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8122 bool IVUpdateMayOverflow = false;
8123 for (ElementCount VF : Range)
8124 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8125
8126 TailFoldingStyle Style = CM.getTailFoldingStyle();
8127 // Use NUW for the induction increment if we proved that it won't overflow in
8128 // the vector loop or when not folding the tail. In the later case, we know
8129 // that the canonical induction increment will not overflow as the vector trip
8130 // count is >= increment and a multiple of the increment.
8131 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8132 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8133 if (!HasNUW) {
8134 auto *IVInc =
8135 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(N: 0);
8136 assert(match(IVInc,
8137 m_VPInstruction<Instruction::Add>(
8138 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8139 "Did not find the canonical IV increment");
8140 cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
8141 }
8142
8143 // ---------------------------------------------------------------------------
8144 // Pre-construction: record ingredients whose recipes we'll need to further
8145 // process after constructing the initial VPlan.
8146 // ---------------------------------------------------------------------------
8147
8148 // For each interleave group which is relevant for this (possibly trimmed)
8149 // Range, add it to the set of groups to be later applied to the VPlan and add
8150 // placeholders for its members' Recipes which we'll be replacing with a
8151 // single VPInterleaveRecipe.
8152 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8153 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8154 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8155 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8156 LoopVectorizationCostModel::CM_Interleave);
8157 // For scalable vectors, the interleave factors must be <= 8 since we
8158 // require the (de)interleaveN intrinsics instead of shufflevectors.
8159 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8160 "Unsupported interleave factor for scalable vectors");
8161 return Result;
8162 };
8163 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
8164 continue;
8165 InterleaveGroups.insert(Ptr: IG);
8166 }
8167
8168 // ---------------------------------------------------------------------------
8169 // Construct wide recipes and apply predication for original scalar
8170 // VPInstructions in the loop.
8171 // ---------------------------------------------------------------------------
8172 VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);
8173
8174 // Scan the body of the loop in a topological order to visit each basic block
8175 // after having visited its predecessor basic blocks.
8176 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8177 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8178 HeaderVPBB);
8179
8180 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8181
8182 // Collect blocks that need predication for in-loop reduction recipes.
8183 DenseSet<BasicBlock *> BlocksNeedingPredication;
8184 for (BasicBlock *BB : OrigLoop->blocks())
8185 if (CM.blockNeedsPredicationForAnyReason(BB))
8186 BlocksNeedingPredication.insert(V: BB);
8187
8188 VPlanTransforms::createInLoopReductionRecipes(Plan&: *Plan, BlocksNeedingPredication,
8189 MinVF: Range.Start);
8190
8191 // Now process all other blocks and instructions.
8192 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
8193 // Convert input VPInstructions to widened recipes.
8194 for (VPRecipeBase &R : make_early_inc_range(
8195 Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end()))) {
8196 // Skip recipes that do not need transforming.
8197 if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(Val: &R))
8198 continue;
8199 auto *VPI = cast<VPInstruction>(Val: &R);
8200 if (!VPI->getUnderlyingValue())
8201 continue;
8202
8203 // TODO: Gradually replace uses of underlying instruction by analyses on
8204 // VPlan. Migrate code relying on the underlying instruction from VPlan0
8205 // to construct recipes below to not use the underlying instruction.
8206 Instruction *Instr = cast<Instruction>(Val: VPI->getUnderlyingValue());
8207 Builder.setInsertPoint(VPI);
8208
8209 // The stores with invariant address inside the loop will be deleted, and
8210 // in the exit block, a uniform store recipe will be created for the final
8211 // invariant store of the reduction.
8212 StoreInst *SI;
8213 if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
8214 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
8215 // Only create recipe for the final invariant store of the reduction.
8216 if (Legal->isInvariantStoreOfReduction(SI)) {
8217 auto *Recipe = new VPReplicateRecipe(
8218 SI, VPI->operandsWithoutMask(), true /* IsUniform */,
8219 nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
8220 Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
8221 }
8222 R.eraseFromParent();
8223 continue;
8224 }
8225
8226 VPRecipeBase *Recipe =
8227 RecipeBuilder.tryToCreateWidenNonPhiRecipe(R: VPI, Range);
8228 if (!Recipe)
8229 Recipe =
8230 RecipeBuilder.handleReplication(VPI: cast<VPInstruction>(Val: VPI), Range);
8231
8232 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8233 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
8234 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8235 // moved to the phi section in the header.
8236 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8237 } else {
8238 Builder.insert(R: Recipe);
8239 }
8240 if (Recipe->getNumDefinedValues() == 1) {
8241 VPI->replaceAllUsesWith(New: Recipe->getVPSingleValue());
8242 } else {
8243 assert(Recipe->getNumDefinedValues() == 0 &&
8244 "Unexpected multidef recipe");
8245 }
8246 R.eraseFromParent();
8247 }
8248 }
8249
8250 assert(isa<VPRegionBlock>(LoopRegion) &&
8251 !LoopRegion->getEntryBasicBlock()->empty() &&
8252 "entry block must be set to a VPRegionBlock having a non-empty entry "
8253 "VPBasicBlock");
8254
8255 // TODO: We can't call runPass on these transforms yet, due to verifier
8256 // failures.
8257 VPlanTransforms::addExitUsersForFirstOrderRecurrences(Plan&: *Plan, Range);
8258
8259 // ---------------------------------------------------------------------------
8260 // Transform initial VPlan: Apply previously taken decisions, in order, to
8261 // bring the VPlan to its final state.
8262 // ---------------------------------------------------------------------------
8263
8264 addReductionResultComputation(Plan, RecipeBuilder, MinVF: Range.Start);
8265
8266 // Optimize FindIV reductions to use sentinel-based approach when possible.
8267 RUN_VPLAN_PASS(VPlanTransforms::optimizeFindIVReductions, *Plan, PSE,
8268 *OrigLoop);
8269 VPlanTransforms::optimizeInductionLiveOutUsers(Plan&: *Plan, PSE,
8270 FoldTail: CM.foldTailByMasking());
8271
8272 // Apply mandatory transformation to handle reductions with multiple in-loop
8273 // uses if possible, bail out otherwise.
8274 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMultiUseReductions, *Plan, ORE,
8275 OrigLoop))
8276 return nullptr;
8277 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8278 // NaNs if possible, bail out otherwise.
8279 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMaxMinNumReductions, *Plan))
8280 return nullptr;
8281
8282 // Create whole-vector selects for find-last recurrences.
8283 if (!RUN_VPLAN_PASS(VPlanTransforms::handleFindLastReductions, *Plan))
8284 return nullptr;
8285
8286 // Create partial reduction recipes for scaled reductions and transform
8287 // recipes to abstract recipes if it is legal and beneficial and clamp the
8288 // range for better cost estimation.
8289 // TODO: Enable following transform when the EVL-version of extended-reduction
8290 // and mulacc-reduction are implemented.
8291 if (!CM.foldTailWithEVL()) {
8292 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
8293 OrigLoop);
8294 RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
8295 Range);
8296 RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
8297 Range);
8298 }
8299
8300 for (ElementCount VF : Range)
8301 Plan->addVF(VF);
8302 Plan->setName("Initial VPlan");
8303
8304 // Interleave memory: for each Interleave Group we marked earlier as relevant
8305 // for this VPlan, replace the Recipes widening its memory instructions with a
8306 // single VPInterleaveRecipe at its insertion point.
8307 RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
8308 InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
8309
8310 // Replace VPValues for known constant strides.
8311 RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
8312 Legal->getLAI()->getSymbolicStrides());
8313
8314 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8315 return Legal->blockNeedsPredication(BB);
8316 };
8317 RUN_VPLAN_PASS(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
8318 BlockNeedsPredication);
8319
8320 // Sink users of fixed-order recurrence past the recipe defining the previous
8321 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8322 if (!RUN_VPLAN_PASS(VPlanTransforms::adjustFixedOrderRecurrences, *Plan,
8323 Builder))
8324 return nullptr;
8325
8326 if (useActiveLaneMask(Style)) {
8327 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8328 // TailFoldingStyle is visible there.
8329 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8330 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow);
8331 }
8332
8333 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8334 return Plan;
8335}
8336
/// Build a VPlan for \p Range using the VPlan-native path for outer loops.
/// Returns nullptr if the input VPInstructions cannot all be converted to
/// widened recipes.
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Build the initial, unoptimized VPlan skeleton directly from the outer
  // loop's IR.
  auto Plan = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE);

  // The native path is built here without uncountable early exits and without
  // tail folding.
  VPlanTransforms::handleEarlyExits(Plan&: *Plan,
                                    /*HasUncountableExit*/ false);
  VPlanTransforms::addMiddleCheck(Plan&: *Plan, /*TailFolded*/ false);

  VPlanTransforms::createLoopRegions(Plan&: *Plan);

  // Only induction phis are modeled on this path: reductions, fixed-order
  // recurrences and in-loop reductions are passed as empty sets, so no
  // reduction recipes are created here.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *Plan, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: MapVector<PHINode *, RecurrenceDescriptor>(),
      FixedOrderRecurrences: SmallPtrSet<const PHINode *, 1>(), InLoopReductions: SmallPtrSet<PHINode *, 1>(),
      /*AllowReordering=*/false);

  // Record all candidate VFs of the clamped range in the plan.
  for (ElementCount VF : Range)
    Plan->addVF(VF);

  // Bail out (returning no plan) if the widening conversion fails.
  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(Plan&: *Plan, TLI: *TLI))
    return nullptr;

  // Optimize induction live-out users to use precomputed end values.
  VPlanTransforms::optimizeInductionLiveOutUsers(Plan&: *Plan, PSE,
                                                 /*FoldTail=*/false);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
8374
/// Create the final reduction-result computations in the middle block for all
/// reduction header phis, connect them to users outside the vector region,
/// and apply the related adjustments: tail-folding selects, AnyOf boolean
/// phi rewrite, and truncation of reductions computable in a narrower type.
void LoopVectorizationPlanner::addReductionResultComputation(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
  using namespace VPlanPatternMatch;
  VPTypeAnalysis TypeInfo(*Plan);
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
  SmallVector<VPRecipeBase *> ToDelete;
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  // Default insert point: just before the latch terminator, where the
  // tail-folding selects below are created.
  Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    // TODO: Remove check for constant incoming value once removeDeadRecipes is
    // used on VPlan0.
    if (!PhiR || isa<VPIRValue>(Val: PhiR->getOperand(N: 1)))
      continue;

    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
    Type *PhiTy = TypeInfo.inferScalarType(V: PhiR);
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
    auto *RR = dyn_cast<VPReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe());
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        (!RR || !RR->isPartialReduction())) {
      VPValue *Cond = vputils::findHeaderMask(Plan&: *Plan);
      NewExitingVPV =
          Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", Flags: *PhiR);
      // Only the final result computations should see the masked exiting
      // value; other users keep the unmasked one.
      OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
        using namespace VPlanPatternMatch;
        return match(
            U: &U, P: m_CombineOr(
                     L: m_VPInstruction<VPInstruction::ComputeAnyOfResult>(),
                     R: m_VPInstruction<VPInstruction::ComputeReductionResult>()));
      });

      if (CM.usePredicatedReductionSelect(RecurrenceKind))
        PhiR->setOperand(I: 1, New: NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
    // For AnyOf reductions, find the select among PhiR's users. This is used
    // both to find NewVal for ComputeAnyOfResult and to adjust the reduction.
    VPRecipeBase *AnyOfSelect = nullptr;
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      AnyOfSelect = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
        return match(U, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()));
      }));
    }
    if (AnyOfSelect) {
      VPValue *Start = PhiR->getStartValue();
      // NewVal is the non-phi operand of the select.
      VPValue *NewVal = AnyOfSelect->getOperand(N: 1) == PhiR
                            ? AnyOfSelect->getOperand(N: 2)
                            : AnyOfSelect->getOperand(N: 1);
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                               Operands: {Start, NewVal, NewExitingVPV}, DL: ExitDL);
    } else {
      // All other recurrence kinds use ComputeReductionResult, carrying the
      // kind/ordered/in-loop/FMF information as flags.
      VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
                      PhiR->getFastMathFlags());
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                               Operands: {NewExitingVPV}, Flags, DL: ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      VPWidenCastRecipe *Trunc;
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      VPWidenCastRecipe *Extnd;
      {
        // Emit the trunc/extend pair directly after the recipe defining the
        // exiting value, restoring the outer insert point afterwards.
        VPBuilder::InsertPointGuard Guard(Builder);
        Builder.setInsertPoint(
            TheBB: NewExitingVPV->getDefiningRecipe()->getParent(),
            IP: std::next(x: NewExitingVPV->getDefiningRecipe()->getIterator()));
        Trunc =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op: NewExitingVPV, ResultTy: RdxTy);
        Extnd = Builder.createWidenCast(Opcode: ExtendOpc, Op: Trunc, ResultTy: PhiTy);
      }
      if (PhiR->getOperand(N: 1) == NewExitingVPV)
        PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result. Operand 0 provides the values to be reduced.
      FinalReductionResult->setOperand(I: 0, New: Trunc);
      FinalReductionResult =
          Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
    }

    // Update all users outside the vector region. Also replace redundant
    // extracts.
    for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
      // Skip the result computation itself and any user still inside a
      // region (i.e. inside the vector loop).
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      // Skip FindIV reduction chain recipes (ComputeReductionResult, icmp).
      if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind) &&
          match(U, P: m_CombineOr(
                       L: m_VPInstruction<VPInstruction::ComputeReductionResult>(),
                       R: m_VPInstruction<Instruction::ICmp>())))
        continue;
      U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);

      // Look through ExtractLastPart.
      if (match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
        U = cast<VPInstruction>(Val: U)->getSingleUser();

      // Extract recipes applied to the result are redundant after the
      // rewrite; forward the final result to their users directly.
      if (match(U, P: m_CombineOr(L: m_ExtractLane(Op0: m_VPValue(), Op1: m_VPValue()),
                               R: m_ExtractLastLane(Op0: m_VPValue()))))
        cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
    if (AnyOfSelect) {
      VPValue *Cmp = AnyOfSelect->getOperand(N: 0);
      // If the compare is checking the reduction PHI node, adjust it to check
      // the start value.
      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
        CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
      Builder.setInsertPoint(AnyOfSelect);

      // If the true value of the select is the reduction phi, the new value is
      // selected if the negated condition is true in any iteration.
      if (AnyOfSelect->getOperand(N: 1) == PhiR)
        Cmp = Builder.createNot(Operand: Cmp);
      VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
      AnyOfSelect->getVPSingleValue()->replaceAllUsesWith(New: Or);
      // Delete AnyOfSelect now that it has invalid types.
      ToDelete.push_back(Elt: AnyOfSelect);

      // Convert the reduction phi to operate on bools.
      PhiR->setOperand(I: 0, New: Plan->getFalse());
      continue;
    }

    RecurKind RK = PhiR->getRecurrenceKind();
    // For the remaining (plain) reduction kinds, seed the phi with a
    // ReductionStartVector built in the preheader from the start value, the
    // recurrence identity, and a scale factor of 1.
    if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RK))) {
      VPBuilder PHBuilder(Plan->getVectorPreheader());
      VPValue *Iden = Plan->getOrAddLiveIn(
          V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: PhiR->getFastMathFlags()));
      auto *ScaleFactorVPV = Plan->getConstantInt(BitWidth: 32, Val: 1);
      VPValue *StartV = PHBuilder.createNaryOp(
          Opcode: VPInstruction::ReductionStartVector,
          Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV}, Flags: *PhiR);
      PhiR->setOperand(I: 0, New: StartV);
    }
  }
  // Deferred deletion: recipes could not be erased while still iterating
  // over the phis / users above.
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();

  RUN_VPLAN_PASS(VPlanTransforms::clearReductionWrapFlags, *Plan);
}
8564
8565void LoopVectorizationPlanner::attachRuntimeChecks(
8566 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8567 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8568 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(N: 0)) {
8569 assert((!CM.OptForSize ||
8570 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8571 "Cannot SCEV check stride or overflow when optimizing for size");
8572 VPlanTransforms::attachCheckBlock(Plan, Cond: SCEVCheckCond, CheckBlock: SCEVCheckBlock,
8573 AddBranchWeights: HasBranchWeights);
8574 }
8575 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8576 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(N: 0)) {
8577 // VPlan-native path does not do any analysis for runtime checks
8578 // currently.
8579 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8580 "Runtime checks are not supported for outer loops yet");
8581
8582 if (CM.OptForSize) {
8583 assert(
8584 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8585 "Cannot emit memory checks when optimizing for size, unless forced "
8586 "to vectorize.");
8587 ORE->emit(RemarkBuilder: [&]() {
8588 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8589 OrigLoop->getStartLoc(),
8590 OrigLoop->getHeader())
8591 << "Code-size may be reduced by not forcing "
8592 "vectorization, or by source-code modifications "
8593 "eliminating the need for runtime checks "
8594 "(e.g., adding 'restrict').";
8595 });
8596 }
8597 VPlanTransforms::attachCheckBlock(Plan, Cond: MemCheckCond, CheckBlock: MemCheckBlock,
8598 AddBranchWeights: HasBranchWeights);
8599 }
8600}
8601
8602void LoopVectorizationPlanner::addMinimumIterationCheck(
8603 VPlan &Plan, ElementCount VF, unsigned UF,
8604 ElementCount MinProfitableTripCount) const {
8605 const uint32_t *BranchWeights =
8606 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())
8607 ? &MinItersBypassWeights[0]
8608 : nullptr;
8609 VPlanTransforms::addMinimumIterationCheck(
8610 Plan, VF, UF, MinProfitableTripCount,
8611 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()), TailFolded: CM.foldTailByMasking(),
8612 OrigLoop, MinItersBypassWeights: BranchWeights,
8613 DL: OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
8614}
8615
8616// Determine how to lower the scalar epilogue, which depends on 1) optimising
8617// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8618// predication, and 4) a TTI hook that analyses whether the loop is suitable
8619// for predication.
8620static ScalarEpilogueLowering getScalarEpilogueLowering(
8621 Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
8622 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8623 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
8624 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8625 // don't look at hints or options, and don't request a scalar epilogue.
8626 if (F->hasOptSize() ||
8627 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8628 return CM_ScalarEpilogueNotAllowedOptSize;
8629
8630 // 2) If set, obey the directives
8631 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8632 switch (PreferPredicateOverEpilogue) {
8633 case PreferPredicateTy::ScalarEpilogue:
8634 return CM_ScalarEpilogueAllowed;
8635 case PreferPredicateTy::PredicateElseScalarEpilogue:
8636 return CM_ScalarEpilogueNotNeededUsePredicate;
8637 case PreferPredicateTy::PredicateOrDontVectorize:
8638 return CM_ScalarEpilogueNotAllowedUsePredicate;
8639 };
8640 }
8641
8642 // 3) If set, obey the hints
8643 switch (Hints.getPredicate()) {
8644 case LoopVectorizeHints::FK_Enabled:
8645 return CM_ScalarEpilogueNotNeededUsePredicate;
8646 case LoopVectorizeHints::FK_Disabled:
8647 return CM_ScalarEpilogueAllowed;
8648 };
8649
8650 // 4) if the TTI hook indicates this is profitable, request predication.
8651 TailFoldingInfo TFI(TLI, &LVL, IAI);
8652 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
8653 return CM_ScalarEpilogueNotNeededUsePredicate;
8654
8655 return CM_ScalarEpilogueAllowed;
8656}
8657
8658// Process the loop in the VPlan-native vectorization path. This path builds
8659// VPlan upfront in the vectorization pipeline, which allows to apply
8660// VPlan-to-VPlan transformations from the very beginning without modifying the
8661// input LLVM IR.
/// Drive vectorization of the outer loop \p L on the VPlan-native path.
/// Returns true if vector code was generated, false if the loop was left
/// unmodified (unknown trip count, stress-testing mode, or no profitable VF).
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE,
    std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
    LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {

  // The native path requires a computable trip count for the outer loop.
  if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL&: *LVL, IAI: &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                GetBFI, F, &Hints, IAI, OptForSize);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);

  // Scope block: runtime-check state and the vectorizer are destroyed before
  // the remark is emitted and the function is verified. Note that the native
  // path always uses an unroll factor of 1.
  {
    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                           Checks, BestPlan);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, /*UF=*/1,
                                 MinProfitableTripCount: VF.MinProfitableTripCount);

    LVP.executePlan(BestVF: VF.Width, /*UF=*/BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
  }

  reportVectorization(ORE, TheLoop: L, VF, IC: 1);

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
8722
8723// Emit a remark if there are stores to floats that required a floating point
8724// extension. If the vectorized loop was generated with floating point there
8725// will be a performance penalty from the conversion overhead and the change in
8726// the vector width.
8727static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
8728 SmallVector<Instruction *, 4> Worklist;
8729 for (BasicBlock *BB : L->getBlocks()) {
8730 for (Instruction &Inst : *BB) {
8731 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
8732 if (S->getValueOperand()->getType()->isFloatTy())
8733 Worklist.push_back(Elt: S);
8734 }
8735 }
8736 }
8737
8738 // Traverse the floating point stores upwards searching, for floating point
8739 // conversions.
8740 SmallPtrSet<const Instruction *, 4> Visited;
8741 SmallPtrSet<const Instruction *, 4> EmittedRemark;
8742 while (!Worklist.empty()) {
8743 auto *I = Worklist.pop_back_val();
8744 if (!L->contains(Inst: I))
8745 continue;
8746 if (!Visited.insert(Ptr: I).second)
8747 continue;
8748
8749 // Emit a remark if the floating point store required a floating
8750 // point conversion.
8751 // TODO: More work could be done to identify the root cause such as a
8752 // constant or a function return type and point the user to it.
8753 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
8754 ORE->emit(RemarkBuilder: [&]() {
8755 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
8756 I->getDebugLoc(), L->getHeader())
8757 << "floating point conversion changes vector width. "
8758 << "Mixed floating point precision requires an up/down "
8759 << "cast that will negatively impact performance.";
8760 });
8761
8762 for (Use &Op : I->operands())
8763 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
8764 Worklist.push_back(Elt: OpI);
8765 }
8766}
8767
8768/// For loops with uncountable early exits, find the cost of doing work when
8769/// exiting the loop early, such as calculating the final exit values of
8770/// variables used outside the loop.
8771/// TODO: This is currently overly pessimistic because the loop may not take
8772/// the early exit, but better to keep this conservative for now. In future,
8773/// it might be possible to relax this by using branch probabilities.
8774static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
8775 VPlan &Plan, ElementCount VF) {
8776 InstructionCost Cost = 0;
8777 for (auto *ExitVPBB : Plan.getExitBlocks()) {
8778 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
8779 // If the predecessor is not the middle.block, then it must be the
8780 // vector.early.exit block, which may contain work to calculate the exit
8781 // values of variables used outside the loop.
8782 if (PredVPBB != Plan.getMiddleBlock()) {
8783 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
8784 << PredVPBB->getName() << ":\n");
8785 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
8786 }
8787 }
8788 }
8789 return Cost;
8790}
8791
/// This function determines whether or not it's still profitable to vectorize
/// the loop given the extra work we have to do outside of the loop:
///  1. Perform the runtime checks before entering the loop to ensure it's safe
///     to vectorize.
///  2. In the case of loops with uncountable early exits, we may have to do
///     extra work when exiting the loop early, such as calculating the final
///     exit values of variables used outside the loop.
///  3. The middle block.
/// As a side effect, computes and stores \p VF.MinProfitableTripCount, the
/// minimum trip count for which the vector loop beats the scalar loop.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
                                        VectorizationFactor &VF, Loop *L,
                                        PredicatedScalarEvolution &PSE,
                                        VPCostContext &CostCtx, VPlan &Plan,
                                        ScalarEpilogueLowering SEL,
                                        std::optional<unsigned> VScale) {
  // If the cost of the runtime checks is unknown we cannot reason about
  // profitability; conservatively bail out.
  InstructionCost RtC = Checks.getCost();
  if (!RtC.isValid())
    return false;

  // When interleaving only scalar and vector cost will be equal, which in turn
  // would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    // TODO: Should we rename VectorizeMemoryCheckThreshold?
    if (RtC > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  InstructionCost TotalCost = RtC;
  // Add on the cost of any work required in the vector early exit block, if
  // one exists.
  TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
  // The middle block runs once whenever the vector loop is entered.
  TotalCost += Plan.getMiddleBlock()->cost(VF: VF.Width, Ctx&: CostCtx);

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  //  The total cost of the scalar loop is
  //   ScalarC * TC
  //  where
  //  * TC is the actual trip count of the loop.
  //  * ScalarC is the cost of a single scalar iteration.
  //
  //  The total cost of the vector loop is
  //    TotalCost + VecC * (TC / VF) + EpiC
  //  where
  //  * TotalCost is the sum of the costs of
  //    - the generated runtime checks, i.e. RtC
  //    - performing any additional work in the vector.early.exit block for
  //      loops with uncountable early exits.
  //    - the middle block.
  //  * VecC is the cost of a single vector iteration.
  //  * TC is the actual trip count of the loop
  //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = estimateElementCount(VF: VF.Width, VScale);
  uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
  uint64_t MinTC1 =
      Div == 0 ? 0 : divideCeil(Numerator: TotalCost.getValue() * IntVF, Denominator: Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  // X is hard-coded to 10 below.
  uint64_t MinTC2 = divideCeil(Numerator: RtC.getValue() * 10, Denominator: ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(Value: MinTC, Align: IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
    if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}
8907
// Combine the explicit pass options with the global enable flags: if
// EnableLoopInterleaving/EnableLoopVectorization is off, interleaving resp.
// vectorization is performed only when explicitly forced.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
8913
/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
/// vectorization. Returns the ResumeForEpilogue instructions created for the
/// scalar-header phis of \p MainPlan; they are used as resume values when
/// vectorizing the epilogue.
static SmallVector<VPInstruction *>
preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
  using namespace VPlanPatternMatch;
  // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
  // introduce multiple uses of undef/poison. If the reduction start value may
  // be undef or poison it needs to be frozen and the frozen start has to be
  // used when computing the reduction result. We also need to use the frozen
  // value in the resume phi generated by the main vector loop, as this is also
  // used to compute the reduction result after the epilogue vector loop.
  auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
                                             bool UpdateResumePhis) {
    VPBuilder Builder(Plan.getEntry());
    // Scan the middle block for FindIV result computations.
    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
      auto *VPI = dyn_cast<VPInstruction>(Val: &R);
      if (!VPI)
        continue;
      VPValue *OrigStart;
      if (!matchFindIVResult(VPI, ReducedIV: m_VPValue(), Start: m_VPValue(V&: OrigStart)))
        continue;
      // No freeze needed if the start value cannot be undef or poison.
      if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
        continue;
      VPInstruction *Freeze =
          Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, DL: {}, Name: "fr");
      // Operand 2 of the FindIV result is the start value; replace with the
      // frozen start.
      VPI->setOperand(I: 2, New: Freeze);
      if (UpdateResumePhis)
        // Resume phis must also see the frozen value; skip the freeze's own
        // use of the original start so we don't create a cycle.
        OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
          return Freeze != &U && isa<VPPhi>(Val: &U);
        });
    }
  };
  AddFreezeForFindLastIVReductions(MainPlan, true);
  AddFreezeForFindLastIVReductions(EpiPlan, false);

  // Extract the vector trip count from the branch-on-count terminating the
  // main vector loop region.
  VPValue *VectorTC = nullptr;
  auto *Term =
      MainPlan.getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  [[maybe_unused]] bool MatchedTC =
      match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue(V&: VectorTC)));
  assert(MatchedTC && "must match vector trip count");

  // If there is a suitable resume value for the canonical induction in the
  // scalar (which will become vector) epilogue loop, use it and move it to the
  // beginning of the scalar preheader. Otherwise create it below.
  VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
  auto ResumePhiIter =
      find_if(Range: MainScalarPH->phis(), P: [VectorTC](VPRecipeBase &R) {
        // Look for phi(VectorTC, 0), the canonical IV resume value.
        return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Ops: m_Specific(VPV: VectorTC),
                                                            Ops: m_ZeroInt()));
      });
  VPPhi *ResumePhi = nullptr;
  if (ResumePhiIter == MainScalarPH->phis().end()) {
    using namespace llvm::VPlanPatternMatch;
    assert(
        match(MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue(),
              m_ZeroInt()) &&
        "canonical IV must start at 0");
    Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(V: VectorTC);
    VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
    ResumePhi = ScalarPHBuilder.createScalarPhi(
        IncomingValues: {VectorTC, MainPlan.getZero(Ty)}, DL: {}, Name: "vec.epilog.resume.val");
  } else {
    // Reuse the existing resume phi, making sure it is the first phi in the
    // scalar preheader.
    ResumePhi = cast<VPPhi>(Val: &*ResumePhiIter);
    ResumePhi->setName("vec.epilog.resume.val");
    if (&MainScalarPH->front() != ResumePhi)
      ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->begin());
  }

  // Create a ResumeForEpilogue for the canonical IV resume as the
  // first non-phi, to keep it alive for the epilogue.
  VPBuilder ResumeBuilder(MainScalarPH);
  ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue, Operands: ResumePhi);

  // Create ResumeForEpilogue instructions for the resume phis of the
  // VPIRPhis in the scalar header of the main plan and return them so they can
  // be used as resume values when vectorizing the epilogue.
  return to_vector(
      Range: map_range(C: MainPlan.getScalarHeader()->phis(), F: [&](VPRecipeBase &R) {
        assert(isa<VPIRPhi>(R) &&
               "only VPIRPhis expected in the scalar header");
        return ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue,
                                          Operands: R.getOperand(N: 0));
      }));
}
8999
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
/// reductions require creating new instructions to compute the resume values.
/// They are collected in a vector and returned. They must be moved to the
/// preheader of the vector epilogue loop, after created by the execution of \p
/// Plan.
static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
    VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
    EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
    ScalarEvolution &SE) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
  Header->setName("vec.epilog.vector.body");

  VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
  // When vectorizing the epilogue loop, the canonical induction needs to be
  // adjusted by the value after the main vector loop. Find the resume value
  // created during execution of the main VPlan. It must be the first phi in the
  // loop preheader. Use the value to increment the canonical IV, and update all
  // users in the loop region to use the adjusted value.
  // FIXME: Improve modeling for canonical IV start values in the epilogue
  // loop.
  using namespace llvm::PatternMatch;
  PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
  // The resume phi has zero incoming values from bypass edges and a single
  // non-zero incoming value: the vector trip count of the main loop.
  for (Value *Inc : EPResumeVal->incoming_values()) {
    if (match(V: Inc, P: m_SpecificInt(V: 0)))
      continue;
    assert(!EPI.VectorTripCount &&
           "Must only have a single non-zero incoming value");
    EPI.VectorTripCount = Inc;
  }
  // If we didn't find a non-zero vector trip count, all incoming values
  // must be zero, which also means the vector trip count is zero. Pick the
  // first zero as vector trip count.
  // TODO: We should not choose VF * UF so the main vector loop is known to
  // be dead.
  if (!EPI.VectorTripCount) {
    assert(EPResumeVal->getNumIncomingValues() > 0 &&
           all_of(EPResumeVal->incoming_values(),
                  [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
           "all incoming values must be 0");
    EPI.VectorTripCount = EPResumeVal->getOperand(i_nocapture: 0);
  }
  VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
  assert(all_of(IV->users(),
                [](const VPUser *U) {
                  return isa<VPScalarIVStepsRecipe>(U) ||
                         isa<VPDerivedIVRecipe>(U) ||
                         cast<VPRecipeBase>(U)->isScalarCast() ||
                         cast<VPInstruction>(U)->getOpcode() ==
                             Instruction::Add;
                }) &&
         "the canonical IV should only be used by its increment or "
         "ScalarIVSteps when resetting the start value");
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  // Create Add = IV + resume value and redirect all former IV users to it.
  // The RAUW also rewrote Add's own operand, so restore IV as operand 0.
  VPInstruction *Add = Builder.createAdd(LHS: IV, RHS: VPV);
  IV->replaceAllUsesWith(New: Add);
  Add->setOperand(I: 0, New: IV);

  DenseMap<Value *, Value *> ToFrozen;
  SmallVector<Instruction *> InstsToMove;
  // Ensure that the start values for all header phi recipes are updated before
  // vectorizing the epilogue loop. Skip the canonical IV, which has been
  // handled above.
  for (VPRecipeBase &R : drop_begin(RangeOrContainer: Header->phis())) {
    Value *ResumeV = nullptr;
    // TODO: Move setting of resume values to prepareToExecute.
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
      // Find the reduction result by searching users of the phi or its backedge
      // value.
      auto IsReductionResult = [](VPRecipeBase *R) {
        auto *VPI = dyn_cast<VPInstruction>(Val: R);
        if (!VPI)
          return false;
        return VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
               VPI->getOpcode() == VPInstruction::ComputeReductionResult;
      };
      auto *RdxResult = cast<VPInstruction>(
          Val: vputils::findRecipe(Start: ReductionPhi->getBackedgeValue(), Pred: IsReductionResult));
      assert(RdxResult && "expected to find reduction result");

      // The main loop's resume value for this reduction lives in the scalar
      // phi feeding the epilogue loop's preheader.
      ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
                    ->getIncomingValueForBlock(BB: L->getLoopPreheader());

      // Check for FindIV pattern by looking for icmp user of RdxResult.
      // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
      using namespace VPlanPatternMatch;
      VPValue *SentinelVPV = nullptr;
      bool IsFindIV = any_of(Range: RdxResult->users(), P: [&](VPUser *U) {
        return match(U, P: VPlanPatternMatch::m_SpecificICmp(
                            MatchPred: ICmpInst::ICMP_NE, Op0: m_Specific(VPV: RdxResult),
                            Op1: m_VPValue(V&: SentinelVPV)));
      });

      if (RdxResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
        Value *StartV = RdxResult->getOperand(N: 0)->getLiveInIRValue();
        // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
        // start value; compare the final value from the main vector loop
        // to the start value.
        BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
        ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else if (IsFindIV) {
        assert(SentinelVPV && "expected to find icmp using RdxResult");

        // Get the frozen start value from the main loop.
        Value *FrozenStartV = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
            BB: EPI.MainLoopIterationCountCheck);
        // Remember original -> frozen mapping so Freeze VPInstructions in the
        // epilogue plan can reuse the same frozen value (see loop below).
        if (auto *FreezeI = dyn_cast<FreezeInst>(Val: FrozenStartV))
          ToFrozen[FreezeI->getOperand(i_nocapture: 0)] = FrozenStartV;

        // Adjust resume: select(icmp eq ResumeV, FrozenStartV), Sentinel,
        // ResumeV
        BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
        Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: FrozenStartV);
        if (auto *I = dyn_cast<Instruction>(Val: Cmp))
          InstsToMove.push_back(Elt: I);
        ResumeV =
            Builder.CreateSelect(C: Cmp, True: SentinelVPV->getLiveInIRValue(), False: ResumeV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else {
        VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
        // NOTE(review): PhiR re-derives the same recipe as ReductionPhi above;
        // the two are identical here.
        auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
        if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
          assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
                 "unexpected start value");
          // Partial sub-reductions always start at 0 and account for the
          // reduction start value in a final subtraction. Update it to use the
          // resume value from the main vector loop.
          if (PhiR->getVFScaleFactor() > 1 &&
              PhiR->getRecurrenceKind() == RecurKind::Sub) {
            auto *Sub = cast<VPInstruction>(Val: RdxResult->getSingleUser());
            assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode");
            assert(isa<VPIRValue>(Sub->getOperand(0)) &&
                   "Expected operand to match the original start value of the "
                   "reduction");
            assert(VPlanPatternMatch::match(VPI->getOperand(0),
                                            VPlanPatternMatch::m_ZeroInt()) &&
                   "Expected start value for partial sub-reduction to start at "
                   "zero");
            Sub->setOperand(I: 0, New: StartVal);
          } else
            VPI->setOperand(I: 0, New: StartVal);
          // Start value has been consumed by the ReductionStartVector; no
          // further start-value update needed for this phi.
          continue;
        }
      }
    } else {
      // Retrieve the induction resume values for wide inductions from
      // their original phi nodes in the scalar loop.
      PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
      // Hook up to the PHINode generated by a ResumePhi recipe of main
      // loop VPlan, which feeds the scalar loop.
      ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
    }
    assert(ResumeV && "Must have a resume value");
    VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
    cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
  }

  // For some VPValues in the epilogue plan we must re-use the generated IR
  // values from the main plan. Replace them with live-in VPValues.
  // TODO: This is a workaround needed for epilogue vectorization and it
  // should be removed once induction resume value creation is done
  // directly in VPlan.
  for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
    // Re-use frozen values from the main plan for Freeze VPInstructions in the
    // epilogue plan. This ensures all users use the same frozen value.
    auto *VPI = dyn_cast<VPInstruction>(Val: &R);
    if (VPI && VPI->getOpcode() == Instruction::Freeze) {
      VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
          V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
      continue;
    }

    // Re-use the trip count and steps expanded for the main loop, as
    // skeleton creation needs it as a value that dominates both the scalar
    // and vector epilogue loops
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpandR)
      continue;
    VPValue *ExpandedVal =
        Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
    ExpandR->replaceAllUsesWith(New: ExpandedVal);
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(NewTripCount: ExpandedVal);
    ExpandR->eraseFromParent();
  }

  // Estimate the per-iteration element counts of the main and epilogue loops
  // to size the minimum-iteration check guarding the epilogue vector loop.
  auto VScale = CM.getVScaleForTuning();
  unsigned MainLoopStep =
      estimateElementCount(VF: EPI.MainLoopVF * EPI.MainLoopUF, VScale);
  unsigned EpilogueLoopStep =
      estimateElementCount(VF: EPI.EpilogueVF * EPI.EpilogueUF, VScale);
  VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
      Plan, TripCount: EPI.TripCount, VectorTripCount: EPI.VectorTripCount,
      RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()), EpilogueVF: EPI.EpilogueVF,
      EpilogueUF: EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);

  return InstsToMove;
}
9204
/// Fix up resume values in the scalar preheader of \p L and in the epilogue
/// plan's resume phis so that edges from the additional bypass block \p
/// BypassBlock carry the correct incoming values. \p ResumeValues are the
/// ResumeForEpilogue instructions created for the main plan's scalar-header
/// phis; they correspond 1:1 with the scalar loop header phis of \p
/// BestEpiPlan.
static void
fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
                                VPlan &BestEpiPlan,
                                ArrayRef<VPInstruction *> ResumeValues) {
  // Fix resume values from the additional bypass block.
  BasicBlock *PH = L->getLoopPreheader();
  for (auto *Pred : predecessors(BB: PH)) {
    for (PHINode &Phi : PH->phis()) {
      // Only add incoming values for predecessors the phi does not know
      // about yet, reusing the value flowing in from the bypass block.
      if (Phi.getBasicBlockIndex(BB: Pred) != -1)
        continue;
      Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
    }
  }
  auto *ScalarPH = cast<VPIRBasicBlock>(Val: BestEpiPlan.getScalarPreheader());
  if (ScalarPH->hasPredecessors()) {
    // Fix resume values for inductions and reductions from the additional
    // bypass block using the incoming values from the main loop's resume phis.
    // ResumeValues correspond 1:1 with the scalar loop header phis.
    for (auto [ResumeV, HeaderPhi] :
         zip(t&: ResumeValues, u: BestEpiPlan.getScalarHeader()->phis())) {
      auto *HeaderPhiR = cast<VPIRPhi>(Val: &HeaderPhi);
      // Live-in incoming values need no fixing.
      if (isa<VPIRValue>(Val: HeaderPhiR->getIncomingValueForBlock(VPBB: ScalarPH)))
        continue;
      auto *EpiResumePhi =
          cast<PHINode>(Val: HeaderPhiR->getIRPhi().getIncomingValueForBlock(BB: PH));
      // Skip phis that have no edge from the bypass block.
      if (EpiResumePhi->getBasicBlockIndex(BB: BypassBlock) == -1)
        continue;
      // Forward the main loop's bypass value to the epilogue resume phi.
      auto *MainResumePhi = cast<PHINode>(Val: ResumeV->getUnderlyingValue());
      EpiResumePhi->setIncomingValueForBlock(
          BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
    }
  }
}
9238
/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
/// loop, after both plans have executed, updating branches from the iteration
/// and runtime checks of the main loop, as well as updating various phis. \p
/// InstsToMove contains instructions that need to be moved to the preheader of
/// the epilogue vector loop.
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
                                      EpilogueLoopVectorizationInfo &EPI,
                                      DominatorTree *DT,
                                      GeneratedRTChecks &Checks,
                                      ArrayRef<Instruction *> InstsToMove,
                                      ArrayRef<VPInstruction *> ResumeValues) {
  BasicBlock *VecEpilogueIterationCountCheck =
      cast<VPIRBasicBlock>(Val: EpiPlan.getEntry())->getIRBasicBlock();

  // The false successor of the epilogue iteration-count check is the epilogue
  // vector preheader.
  BasicBlock *VecEpiloguePreHeader =
      cast<CondBrInst>(Val: VecEpilogueIterationCountCheck->getTerminator())
          ->getSuccessor(i: 1);
  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Redirect the main loop's iteration count check straight to the epilogue
  // preheader, bypassing the epilogue iteration count check; keep the
  // dominator tree in sync eagerly.
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: VecEpiloguePreHeader);

  DTU.applyUpdates(Updates: {{DominatorTree::Delete, EPI.MainLoopIterationCountCheck,
                     VecEpilogueIterationCountCheck},
                    {DominatorTree::Insert, EPI.MainLoopIterationCountCheck,
                     VecEpiloguePreHeader}});

  // The epilogue iteration count check now branches to the scalar preheader
  // instead of the (dead) edge into the epilogue entry block.
  BasicBlock *ScalarPH =
      cast<VPIRBasicBlock>(Val: EpiPlan.getScalarPreheader())->getIRBasicBlock();
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: ScalarPH);
  DTU.applyUpdates(
      Updates: {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck,
        VecEpilogueIterationCountCheck},
       {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}});

  // Adjust the terminators of runtime check blocks and phis using them.
  BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
  BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
  if (SCEVCheckBlock) {
    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(Updates: {{DominatorTree::Delete, SCEVCheckBlock,
                       VecEpilogueIterationCountCheck},
                      {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
  }
  if (MemCheckBlock) {
    MemCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(
        Updates: {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
         {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
  }

  // The vec.epilog.iter.check block may contain Phi nodes from inductions
  // or reductions which merge control-flow from the latch block and the
  // middle block. Update the incoming values here and move the Phi into the
  // preheader.
  SmallVector<PHINode *, 4> PhisInBlock(
      llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(InsertPos: VecEpiloguePreHeader->getFirstNonPHIIt());
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the
    // incoming value and also those from other check blocks. This is needed
    // for reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (SCEVCheckBlock)
      Phi->removeIncomingValue(BB: SCEVCheckBlock);
    if (MemCheckBlock)
      Phi->removeIncomingValue(BB: MemCheckBlock);
  }

  // Move the instructions created while computing resume values (see
  // preparePlanForEpilogueVectorLoop) into the epilogue vector preheader.
  auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
  for (auto *I : InstsToMove)
    I->moveBefore(InsertPos: IP);

  // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
  // after executing the main loop. We need to update the resume values of
  // inductions and reductions during epilogue vectorization.
  fixScalarResumeValuesFromBypass(BypassBlock: VecEpilogueIterationCountCheck, L, BestEpiPlan&: EpiPlan,
                                  ResumeValues);

  // Remove dead phis that were moved to the epilogue preheader but are unused
  // (e.g., resume phis for inductions not widened in the epilogue vector loop).
  for (PHINode &Phi : make_early_inc_range(Range: VecEpiloguePreHeader->phis()))
    if (Phi.use_empty())
      Phi.eraseFromParent();
}
9340
9341bool LoopVectorizePass::processLoop(Loop *L) {
9342 assert((EnableVPlanNativePath || L->isInnermost()) &&
9343 "VPlan-native path is not enabled. Only process inner loops.");
9344
9345 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9346 << L->getHeader()->getParent()->getName() << "' from "
9347 << L->getLocStr() << "\n");
9348
9349 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9350
9351 LLVM_DEBUG(
9352 dbgs() << "LV: Loop hints:"
9353 << " force="
9354 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9355 ? "disabled"
9356 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9357 ? "enabled"
9358 : "?"))
9359 << " width=" << Hints.getWidth()
9360 << " interleave=" << Hints.getInterleave() << "\n");
9361
9362 // Function containing loop
9363 Function *F = L->getHeader()->getParent();
9364
9365 // Looking at the diagnostic output is the only way to determine if a loop
9366 // was vectorized (other than looking at the IR or machine code), so it
9367 // is important to generate an optimization remark for each loop. Most of
9368 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9369 // generated as OptimizationRemark and OptimizationRemarkMissed are
9370 // less verbose reporting vectorized loops and unvectorized loops that may
9371 // benefit from vectorization, respectively.
9372
9373 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9374 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9375 return false;
9376 }
9377
9378 PredicatedScalarEvolution PSE(*SE, *L);
9379
9380 // Query this against the original loop and save it here because the profile
9381 // of the original loop header may change as the transformation happens.
9382 bool OptForSize = llvm::shouldOptimizeForSize(
9383 BB: L->getHeader(), PSI,
9384 BFI: PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
9385 QueryType: PGSOQueryType::IRPass);
9386
9387 // Check if it is legal to vectorize the loop.
9388 LoopVectorizationRequirements Requirements;
9389 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9390 &Requirements, &Hints, DB, AC,
9391 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
9392 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9393 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9394 Hints.emitRemarkWithHints();
9395 return false;
9396 }
9397
9398 if (LVL.hasUncountableEarlyExit()) {
9399 if (!EnableEarlyExitVectorization) {
9400 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
9401 "early exit is not enabled",
9402 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
9403 return false;
9404 }
9405 }
9406
9407 if (!LVL.getPotentiallyFaultingLoads().empty()) {
9408 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with potentially "
9409 "faulting load is not supported",
9410 ORETag: "PotentiallyFaultingLoadsNotSupported", ORE, TheLoop: L);
9411 return false;
9412 }
9413
9414 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9415 // here. They may require CFG and instruction level transformations before
9416 // even evaluating whether vectorization is profitable. Since we cannot modify
9417 // the incoming IR, we need to build VPlan upfront in the vectorization
9418 // pipeline.
9419 if (!L->isInnermost())
9420 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9421 ORE, GetBFI, OptForSize, Hints,
9422 Requirements);
9423
9424 assert(L->isInnermost() && "Inner loop expected.");
9425
9426 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9427 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9428
9429 // If an override option has been passed in for interleaved accesses, use it.
9430 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9431 UseInterleaved = EnableInterleavedMemAccesses;
9432
9433 // Analyze interleaved memory accesses.
9434 if (UseInterleaved)
9435 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9436
9437 if (LVL.hasUncountableEarlyExit()) {
9438 BasicBlock *LoopLatch = L->getLoopLatch();
9439 if (IAI.requiresScalarEpilogue() ||
9440 any_of(Range: LVL.getCountableExitingBlocks(),
9441 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9442 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
9443 "requiring a scalar epilogue is unsupported",
9444 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
9445 return false;
9446 }
9447 }
9448
9449 // Check the function attributes and profiles to find out if this function
9450 // should be optimized for size.
9451 ScalarEpilogueLowering SEL =
9452 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, IAI: &IAI);
9453
9454 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9455 // count by optimizing for size, to minimize overheads.
9456 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9457 if (ExpectedTC && ExpectedTC->isFixed() &&
9458 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9459 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9460 << "This loop is worth vectorizing only if no scalar "
9461 << "iteration overheads are incurred.");
9462 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9463 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9464 else {
9465 LLVM_DEBUG(dbgs() << "\n");
9466 // Predicate tail-folded loops are efficient even when the loop
9467 // iteration count is low. However, setting the epilogue policy to
9468 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9469 // with runtime checks. It's more effective to let
9470 // `isOutsideLoopWorkProfitable` determine if vectorization is
9471 // beneficial for the loop.
9472 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9473 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9474 }
9475 }
9476
9477 // Check the function attributes to see if implicit floats or vectors are
9478 // allowed.
9479 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
9480 reportVectorizationFailure(
9481 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
9482 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
9483 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
9484 Hints.emitRemarkWithHints();
9485 return false;
9486 }
9487
9488 // Check if the target supports potentially unsafe FP vectorization.
9489 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9490 // for the target we're vectorizing for, to make sure none of the
9491 // additional fp-math flags can help.
9492 if (Hints.isPotentiallyUnsafe() &&
9493 TTI->isFPVectorizationPotentiallyUnsafe()) {
9494 reportVectorizationFailure(
9495 DebugMsg: "Potentially unsafe FP op prevents vectorization",
9496 OREMsg: "loop not vectorized due to unsafe FP support.",
9497 ORETag: "UnsafeFP", ORE, TheLoop: L);
9498 Hints.emitRemarkWithHints();
9499 return false;
9500 }
9501
9502 bool AllowOrderedReductions;
9503 // If the flag is set, use that instead and override the TTI behaviour.
9504 if (ForceOrderedReductions.getNumOccurrences() > 0)
9505 AllowOrderedReductions = ForceOrderedReductions;
9506 else
9507 AllowOrderedReductions = TTI->enableOrderedReductions();
9508 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
9509 ORE->emit(RemarkBuilder: [&]() {
9510 auto *ExactFPMathInst = Requirements.getExactFPInst();
9511 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9512 ExactFPMathInst->getDebugLoc(),
9513 ExactFPMathInst->getParent())
9514 << "loop not vectorized: cannot prove it is safe to reorder "
9515 "floating-point operations";
9516 });
9517 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9518 "reorder floating-point operations\n");
9519 Hints.emitRemarkWithHints();
9520 return false;
9521 }
9522
9523 // Use the cost model.
9524 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9525 GetBFI, F, &Hints, IAI, OptForSize);
9526 // Use the planner for vectorization.
9527 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9528 ORE);
9529
9530 // Get user vectorization factor and interleave count.
9531 ElementCount UserVF = Hints.getWidth();
9532 unsigned UserIC = Hints.getInterleave();
9533 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9534 UserIC = 1;
9535
9536 // Plan how to best vectorize.
9537 LVP.plan(UserVF, UserIC);
9538 VectorizationFactor VF = LVP.computeBestVF();
9539 unsigned IC = 1;
9540
9541 if (ORE->allowExtraAnalysis(LV_NAME))
9542 LVP.emitInvalidCostRemarks(ORE);
9543
9544 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
9545 if (LVP.hasPlanWithVF(VF: VF.Width)) {
9546 // Select the interleave count.
9547 IC = LVP.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
9548
9549 unsigned SelectedIC = std::max(a: IC, b: UserIC);
9550 // Optimistically generate runtime checks if they are needed. Drop them if
9551 // they turn out to not be profitable.
9552 if (VF.Width.isVector() || SelectedIC > 1) {
9553 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC,
9554 ORE&: *ORE);
9555
9556 // Bail out early if either the SCEV or memory runtime checks are known to
9557 // fail. In that case, the vector loop would never execute.
9558 using namespace llvm::PatternMatch;
9559 if (Checks.getSCEVChecks().first &&
9560 match(V: Checks.getSCEVChecks().first, P: m_One()))
9561 return false;
9562 if (Checks.getMemRuntimeChecks().first &&
9563 match(V: Checks.getMemRuntimeChecks().first, P: m_One()))
9564 return false;
9565 }
9566
9567 // Check if it is profitable to vectorize with runtime checks.
9568 bool ForceVectorization =
9569 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9570 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF: VF.Width), CM,
9571 CM.CostKind, CM.PSE, L);
9572 if (!ForceVectorization &&
9573 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9574 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
9575 VScale: CM.getVScaleForTuning())) {
9576 ORE->emit(RemarkBuilder: [&]() {
9577 return OptimizationRemarkAnalysisAliasing(
9578 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9579 L->getHeader())
9580 << "loop not vectorized: cannot prove it is safe to reorder "
9581 "memory operations";
9582 });
9583 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9584 Hints.emitRemarkWithHints();
9585 return false;
9586 }
9587 }
9588
9589 // Identify the diagnostic messages that should be produced.
9590 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9591 bool VectorizeLoop = true, InterleaveLoop = true;
9592 if (VF.Width.isScalar()) {
9593 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9594 VecDiagMsg = {
9595 "VectorizationNotBeneficial",
9596 "the cost-model indicates that vectorization is not beneficial"};
9597 VectorizeLoop = false;
9598 }
9599
9600 if (UserIC == 1 && Hints.getInterleave() > 1) {
9601 assert(!LVL.isSafeForAnyVectorWidth() &&
9602 "UserIC should only be ignored due to unsafe dependencies");
9603 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9604 IntDiagMsg = {"InterleavingUnsafe",
9605 "Ignoring user-specified interleave count due to possibly "
9606 "unsafe dependencies in the loop."};
9607 InterleaveLoop = false;
9608 } else if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
9609 // Tell the user interleaving was avoided up-front, despite being explicitly
9610 // requested.
9611 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9612 "interleaving should be avoided up front\n");
9613 IntDiagMsg = {"InterleavingAvoided",
9614 "Ignoring UserIC, because interleaving was avoided up front"};
9615 InterleaveLoop = false;
9616 } else if (IC == 1 && UserIC <= 1) {
9617 // Tell the user interleaving is not beneficial.
9618 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9619 IntDiagMsg = {
9620 "InterleavingNotBeneficial",
9621 "the cost-model indicates that interleaving is not beneficial"};
9622 InterleaveLoop = false;
9623 if (UserIC == 1) {
9624 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9625 IntDiagMsg.second +=
9626 " and is explicitly disabled or interleave count is set to 1";
9627 }
9628 } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9630 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
9631 "disabled.\n");
9632 IntDiagMsg = {"InterleavingBeneficialButDisabled",
9633 "the cost-model indicates that interleaving is beneficial "
9634 "but is explicitly disabled or interleave count is set to 1"};
9635 InterleaveLoop = false;
9636 }
9637
9638 // If there is a histogram in the loop, do not just interleave without
9639 // vectorizing. The order of operations will be incorrect without the
9640 // histogram intrinsics, which are only used for recipes with VF > 1.
9641 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
9642 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9643 << "to histogram operations.\n");
9644 IntDiagMsg = {
9645 "HistogramPreventsScalarInterleaving",
9646 "Unable to interleave without vectorization due to constraints on "
9647 "the order of histogram operations"};
9648 InterleaveLoop = false;
9649 }
9650
9651 // Override IC if user provided an interleave count.
9652 IC = UserIC > 0 ? UserIC : IC;
9653
9654 // Emit diagnostic messages, if any.
9655 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9656 if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9658 ORE->emit(RemarkBuilder: [&]() {
9659 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9660 L->getStartLoc(), L->getHeader())
9661 << VecDiagMsg.second;
9662 });
9663 ORE->emit(RemarkBuilder: [&]() {
9664 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9665 L->getStartLoc(), L->getHeader())
9666 << IntDiagMsg.second;
9667 });
9668 return false;
9669 }
9670
9671 if (!VectorizeLoop && InterleaveLoop) {
9672 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9673 ORE->emit(RemarkBuilder: [&]() {
9674 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9675 L->getStartLoc(), L->getHeader())
9676 << VecDiagMsg.second;
9677 });
9678 } else if (VectorizeLoop && !InterleaveLoop) {
9679 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9680 << ") in " << L->getLocStr() << '\n');
9681 ORE->emit(RemarkBuilder: [&]() {
9682 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9683 L->getStartLoc(), L->getHeader())
9684 << IntDiagMsg.second;
9685 });
9686 } else if (VectorizeLoop && InterleaveLoop) {
9687 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9688 << ") in " << L->getLocStr() << '\n');
9689 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9690 }
9691
9692 // Report the vectorization decision.
9693 if (VF.Width.isScalar()) {
9694 using namespace ore;
9695 assert(IC > 1);
9696 ORE->emit(RemarkBuilder: [&]() {
9697 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9698 L->getHeader())
9699 << "interleaved loop (interleaved count: "
9700 << NV("InterleaveCount", IC) << ")";
9701 });
9702 } else {
9703 // Report the vectorization decision.
9704 reportVectorization(ORE, TheLoop: L, VF, IC);
9705 }
9706 if (ORE->allowExtraAnalysis(LV_NAME))
9707 checkMixedPrecision(L, ORE);
9708
9709 // If we decided that it is *legal* to interleave or vectorize the loop, then
9710 // do it.
9711
9712 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9713 // Consider vectorizing the epilogue too if it's profitable.
9714 VectorizationFactor EpilogueVF =
9715 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
9716 if (EpilogueVF.Width.isVector()) {
9717 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
9718
9719 // The first pass vectorizes the main loop and creates a scalar epilogue
9720 // to be vectorized by executing the plan (potentially with a different
9721 // factor) again shortly afterwards.
9722 VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width);
9723 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
9724 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
9725 SmallVector<VPInstruction *> ResumeValues =
9726 preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan);
9727 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
9728 BestEpiPlan);
9729 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9730 Checks, *BestMainPlan);
9731 auto ExpandedSCEVs = LVP.executePlan(BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF,
9732 BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, VectorizingEpilogue: false);
9733 ++LoopsVectorized;
9734
9735 // Second pass vectorizes the epilogue and adjusts the control flow
9736 // edges from the first pass.
9737 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9738 Checks, BestEpiPlan);
9739 SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
9740 Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI, CM, SE&: *PSE.getSE());
9741 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, DT,
9742 VectorizingEpilogue: true);
9743 connectEpilogueVectorLoop(EpiPlan&: BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
9744 ResumeValues);
9745 ++LoopsEpilogueVectorized;
9746 } else {
9747 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
9748 BestPlan);
9749 // TODO: Move to general VPlan pipeline once epilogue loops are also
9750 // supported.
9751 RUN_VPLAN_PASS(VPlanTransforms::materializeConstantVectorTripCount,
9752 BestPlan, VF.Width, IC, PSE);
9753 LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, UF: IC,
9754 MinProfitableTripCount: VF.MinProfitableTripCount);
9755
9756 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
9757 ++LoopsVectorized;
9758 }
9759
9760 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
9761 "DT not preserved correctly");
9762 assert(!verifyFunction(*F, &dbgs()));
9763
9764 return true;
9765}
9766
9767LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
9768
9769 // Don't attempt if
9770 // 1. the target claims to have no vector registers, and
9771 // 2. interleaving won't help ILP.
9772 //
9773 // The second condition is necessary because, even if the target has no
9774 // vector registers, loop vectorization may still enable scalar
9775 // interleaving.
9776 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
9777 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
9778 return LoopVectorizeResult(false, false);
9779
9780 bool Changed = false, CFGChanged = false;
9781
9782 // The vectorizer requires loops to be in simplified form.
9783 // Since simplification may add new inner loops, it has to run before the
9784 // legality and profitability checks. This means running the loop vectorizer
9785 // will simplify all loops, regardless of whether anything end up being
9786 // vectorized.
9787 for (const auto &L : *LI)
9788 Changed |= CFGChanged |=
9789 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
9790
9791 // Build up a worklist of inner-loops to vectorize. This is necessary as
9792 // the act of vectorizing or partially unrolling a loop creates new loops
9793 // and can invalidate iterators across the loops.
9794 SmallVector<Loop *, 8> Worklist;
9795
9796 for (Loop *L : *LI)
9797 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
9798
9799 LoopsAnalyzed += Worklist.size();
9800
9801 // Now walk the identified inner loops.
9802 while (!Worklist.empty()) {
9803 Loop *L = Worklist.pop_back_val();
9804
9805 // For the inner loops we actually process, form LCSSA to simplify the
9806 // transform.
9807 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
9808
9809 Changed |= CFGChanged |= processLoop(L);
9810
9811 if (Changed) {
9812 LAIs->clear();
9813
9814#ifndef NDEBUG
9815 if (VerifySCEV)
9816 SE->verify();
9817#endif
9818 }
9819 }
9820
9821 // Process each loop nest in the function.
9822 return LoopVectorizeResult(Changed, CFGChanged);
9823}
9824
9825PreservedAnalyses LoopVectorizePass::run(Function &F,
9826 FunctionAnalysisManager &AM) {
9827 LI = &AM.getResult<LoopAnalysis>(IR&: F);
9828 // There are no loops in the function. Return before computing other
9829 // expensive analyses.
9830 if (LI->empty())
9831 return PreservedAnalyses::all();
9832 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
9833 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
9834 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
9835 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
9836 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
9837 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
9838 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
9839 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
9840 AA = &AM.getResult<AAManager>(IR&: F);
9841
9842 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
9843 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
9844 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
9845 return AM.getResult<BlockFrequencyAnalysis>(IR&: F);
9846 };
9847 LoopVectorizeResult Result = runImpl(F);
9848 if (!Result.MadeAnyChange)
9849 return PreservedAnalyses::all();
9850 PreservedAnalyses PA;
9851
9852 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
9853 for (auto &BB : F)
9854 RemoveRedundantDbgInstrs(BB: &BB);
9855 }
9856
9857 PA.preserve<LoopAnalysis>();
9858 PA.preserve<DominatorTreeAnalysis>();
9859 PA.preserve<ScalarEvolutionAnalysis>();
9860 PA.preserve<LoopAccessAnalysis>();
9861
9862 if (Result.MadeCFGChange) {
9863 // Making CFG changes likely means a loop got vectorized. Indicate that
9864 // extra simplification passes should be run.
9865 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
9866 // be run if runtime checks have been added.
9867 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
9868 PA.preserve<ShouldRunExtraVectorPasses>();
9869 } else {
9870 PA.preserveSet<CFGAnalyses>();
9871 }
9872 return PA;
9873}
9874
9875void LoopVectorizePass::printPipeline(
9876 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
9877 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
9878 OS, MapClassName2PassName);
9879
9880 OS << '<';
9881 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
9882 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
9883 OS << '>';
9884}
9885