1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
97#include "llvm/Analysis/TargetLibraryInfo.h"
98#include "llvm/Analysis/TargetTransformInfo.h"
99#include "llvm/Analysis/ValueTracking.h"
100#include "llvm/Analysis/VectorUtils.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/DiagnosticInfo.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
117#include "llvm/IR/IntrinsicInst.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/ProfDataUtils.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
131#include "llvm/Support/CommandLine.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/NativeFormatting.h"
137#include "llvm/Support/raw_ostream.h"
138#include "llvm/Transforms/Utils/BasicBlockUtils.h"
139#include "llvm/Transforms/Utils/InjectTLIMappings.h"
140#include "llvm/Transforms/Utils/Local.h"
141#include "llvm/Transforms/Utils/LoopSimplify.h"
142#include "llvm/Transforms/Utils/LoopUtils.h"
143#include "llvm/Transforms/Utils/LoopVersioning.h"
144#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
145#include "llvm/Transforms/Utils/SizeOpts.h"
146#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161
162#define LV_NAME "loop-vectorize"
163#define DEBUG_TYPE LV_NAME
164
165#ifndef NDEBUG
166const char VerboseDebug[] = DEBUG_TYPE "-verbose";
167#endif
168
169STATISTIC(LoopsVectorized, "Number of loops vectorized");
170STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
172STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
173
// Master switch: controls whether a vectorized epilogue loop may be generated
// at all.
174static cl::opt<bool> EnableEpilogueVectorization(
175 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
176 cl::desc("Enable vectorization of epilogue loops."));
177
// When set to a value greater than 1, forces that VF for all applicable
// epilogue loops (the default of 1 means "no forcing").
178static cl::opt<unsigned> EpilogueVectorizationForceVF(
179 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
180 cl::desc("When epilogue vectorization is enabled, and a value greater than "
181 "1 is specified, forces the given VF for all applicable epilogue "
182 "loops."));
183
// Minimum main-loop vectorization factor required before epilogue
// vectorization is considered. No explicit cl::init: defaults to 0.
184static cl::opt<unsigned> EpilogueVectorizationMinVF(
185 "epilogue-vectorization-minimum-VF", cl::Hidden,
186 cl::desc("Only loops with vectorization factor equal to or larger than "
187 "the specified value are considered for epilogue vectorization."));
188
189/// Loops with a known constant trip count below this number are vectorized only
190/// if no scalar iteration overheads are incurred.
191static cl::opt<unsigned> TinyTripCountVectorThreshold(
192 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
193 cl::desc("Loops with a constant trip count that is smaller than this "
194 "value are vectorized only if no scalar iteration overheads "
195 "are incurred."));
196
197static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
198 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
199 cl::desc("The maximum allowed number of runtime memory checks"));
200
201/// Note: This currently only applies to `llvm.masked.load` and
202/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
203static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
204 "force-target-supports-masked-memory-ops", cl::init(Val: false), cl::Hidden,
205 cl::desc("Assume the target supports masked memory operations (used for "
206 "testing)."));
207
208// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
209// that predication is preferred, and this lists all options. I.e., the
210// vectorizer will try to fold the tail-loop (epilogue) into the vector body
211// and predicate the instructions accordingly. If tail-folding fails, there are
212// different fallback strategies depending on these values:
213namespace PreferPredicateTy {
214 enum Option {
215 ScalarEpilogue = 0,
216 PredicateElseScalarEpilogue,
217 PredicateOrDontVectorize
218 };
219} // namespace PreferPredicateTy
220
221static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
222 "prefer-predicate-over-epilogue",
223 cl::init(Val: PreferPredicateTy::ScalarEpilogue),
224 cl::Hidden,
225 cl::desc("Tail-folding and predication preferences over creating a scalar "
226 "epilogue loop."),
227 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
228 "scalar-epilogue",
229 "Don't tail-predicate loops, create scalar epilogue"),
230 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
231 "predicate-else-scalar-epilogue",
232 "prefer tail-folding, create scalar epilogue if tail "
233 "folding fails."),
234 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
235 "predicate-dont-vectorize",
236 "prefers tail-folding, don't attempt vectorization if "
237 "tail-folding fails.")));
238
239static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
240 "force-tail-folding-style", cl::desc("Force the tail folding style"),
241 cl::init(Val: TailFoldingStyle::None),
242 cl::values(
243 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
244 clEnumValN(
245 TailFoldingStyle::Data, "data",
246 "Create lane mask for data only, using active.lane.mask intrinsic"),
247 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
248 "data-without-lane-mask",
249 "Create lane mask with compare/stepvector"),
250 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
251 "Create lane mask using active.lane.mask intrinsic, and use "
252 "it for both data and control flow"),
253 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254 "Use predicated EVL instructions for tail folding. If EVL "
255 "is unsupported, fallback to data-without-lane-mask.")));
256
257cl::opt<bool> llvm::EnableWideActiveLaneMask(
258 "enable-wide-lane-mask", cl::init(Val: false), cl::Hidden,
259 cl::desc("Enable use of wide lane masks when used for control flow in "
260 "tail-folded loops"));
261
262static cl::opt<bool> MaximizeBandwidth(
263 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
264 cl::desc("Maximize bandwidth when selecting vectorization factor which "
265 "will be determined by the smallest type in loop."));
266
267static cl::opt<bool> EnableInterleavedMemAccesses(
268 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
269 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
270
271/// An interleave-group may need masking if it resides in a block that needs
272/// predication, or in order to mask away gaps.
273static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
274 "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
275 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
276
277static cl::opt<unsigned> ForceTargetNumScalarRegs(
278 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
279 cl::desc("A flag that overrides the target's number of scalar registers."));
280
281static cl::opt<unsigned> ForceTargetNumVectorRegs(
282 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
283 cl::desc("A flag that overrides the target's number of vector registers."));
284
285static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
286 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "scalar loops."));
289
290static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
291 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
292 cl::desc("A flag that overrides the target's max interleave factor for "
293 "vectorized loops."));
294
295cl::opt<unsigned> llvm::ForceTargetInstructionCost(
296 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
297 cl::desc("A flag that overrides the target's expected cost for "
298 "an instruction to a single constant value. Mostly "
299 "useful for getting consistent testing."));
300
301static cl::opt<bool> ForceTargetSupportsScalableVectors(
302 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
303 cl::desc(
304 "Pretend that scalable vectors are supported, even if the target does "
305 "not support them. This flag should only be used for testing."));
306
307static cl::opt<unsigned> SmallLoopCost(
308 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
309 cl::desc(
310 "The cost of a loop that is considered 'small' by the interleaver."));
311
312static cl::opt<bool> LoopVectorizeWithBlockFrequency(
313 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
314 cl::desc("Enable the use of the block frequency analysis to access PGO "
315 "heuristics minimizing code growth in cold regions and being more "
316 "aggressive in hot regions."));
317
318// Runtime interleave loops for load/store throughput.
319static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
320 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
321 cl::desc(
322 "Enable runtime interleaving until load/store ports are saturated"));
323
324/// The number of stores in a loop that are allowed to need predication.
325cl::opt<unsigned> NumberOfStoresToPredicate(
326 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
327 cl::desc("Max number of stores to be predicated behind an if."));
328
329static cl::opt<bool> EnableIndVarRegisterHeur(
330 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
331 cl::desc("Count the induction variable only once when interleaving"));
332
333static cl::opt<bool> EnableCondStoresVectorization(
334 "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
335 cl::desc("Enable if predication of stores during vectorization."));
336
337static cl::opt<unsigned> MaxNestedScalarReductionIC(
338 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
339 cl::desc("The maximum interleave count to use when interleaving a scalar "
340 "reduction in a nested loop."));
341
342static cl::opt<bool>
343 PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
344 cl::Hidden,
345 cl::desc("Prefer in-loop vector reductions, "
346 "overriding the targets preference."));
347
348static cl::opt<bool> ForceOrderedReductions(
349 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
350 cl::desc("Enable the vectorisation of loops with in-order (strict) "
351 "FP reductions"));
352
353static cl::opt<bool> PreferPredicatedReductionSelect(
354 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
355 cl::desc(
356 "Prefer predicating a reduction operation over an after loop select."));
357
358cl::opt<bool> llvm::EnableVPlanNativePath(
359 "enable-vplan-native-path", cl::Hidden,
360 cl::desc("Enable VPlan-native vectorization path with "
361 "support for outer loop vectorization."));
362
363cl::opt<bool>
364 llvm::VerifyEachVPlan("vplan-verify-each",
365#ifdef EXPENSIVE_CHECKS
366 cl::init(true),
367#else
368 cl::init(Val: false),
369#endif
370 cl::Hidden,
371 cl::desc("Verify VPlans after VPlan transforms."));
372
373#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
374cl::opt<bool> llvm::VPlanPrintAfterAll(
375 "vplan-print-after-all", cl::init(false), cl::Hidden,
376 cl::desc("Print VPlans after all VPlan transformations."));
377
378cl::list<std::string> llvm::VPlanPrintAfterPasses(
379 "vplan-print-after", cl::Hidden,
380 cl::desc("Print VPlans after specified VPlan transformations (regexp)."));
381
382cl::opt<bool> llvm::VPlanPrintVectorRegionScope(
383 "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
384 cl::desc("Limit VPlan printing to vector loop region in "
385 "`-vplan-print-after*` if the plan has one."));
386#endif
387
388// This flag enables the stress testing of the VPlan H-CFG construction in the
389// VPlan-native vectorization path. It must be used in conjunction with
390// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
391// verification of the H-CFGs built.
392static cl::opt<bool> VPlanBuildStressTest(
393 "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
394 cl::desc(
395 "Build VPlan for every supported loop nest in the function and bail "
396 "out right after the build (stress test the VPlan H-CFG construction "
397 "in the VPlan-native vectorization path)."));
398
// Global on/off switches for interleaving and vectorization. Defined in the
// llvm namespace (non-static), so presumably declared in a header and
// referenced outside this file — confirm before changing linkage.
399cl::opt<bool> llvm::EnableLoopInterleaving(
400 "interleave-loops", cl::init(Val: true), cl::Hidden,
401 cl::desc("Enable loop interleaving in Loop vectorization passes"));
402cl::opt<bool> llvm::EnableLoopVectorization(
403 "vectorize-loops", cl::init(Val: true), cl::Hidden,
404 cl::desc("Run the Loop vectorization passes"));
405
406static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
407 "force-widen-divrem-via-safe-divisor", cl::Hidden,
408 cl::desc(
409 "Override cost based safe divisor widening for div/rem instructions"));
410
411static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
412 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
413 cl::Hidden,
414 cl::desc("Try wider VFs if they enable the use of vector variants"));
415
416static cl::opt<bool> EnableEarlyExitVectorization(
417 "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
418 cl::desc(
419 "Enable vectorization of early exit loops with uncountable exits."));
420
421static cl::opt<bool> ConsiderRegPressure(
422 "vectorizer-consider-reg-pressure", cl::init(Val: false), cl::Hidden,
423 cl::desc("Discard VFs if their register pressure is too high."));
424
425// Likelihood of bypassing the vectorized loop because there are zero trips left
426// after prolog. See `emitIterationCountCheck`.
427static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
428
429/// A helper function that returns true if the given type is irregular. The
430/// type is irregular if its allocated size doesn't equal the store size of an
431/// element of the corresponding vector type.
432static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
433 // Determine if an array of N elements of type Ty is "bitcast compatible"
434 // with a <N x Ty> vector.
435 // This is only true if there is no padding between the array elements.
436 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
437}
438
439/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
440/// ElementCount to include loops whose trip count is a function of vscale.
441static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
442 const Loop *L) {
443 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
444 return ElementCount::getFixed(MinVal: ExpectedTC);
445
446 const SCEV *BTC = SE->getBackedgeTakenCount(L);
447 if (isa<SCEVCouldNotCompute>(Val: BTC))
448 return ElementCount::getFixed(MinVal: 0);
449
450 const SCEV *ExitCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
451 if (isa<SCEVVScale>(Val: ExitCount))
452 return ElementCount::getScalable(MinVal: 1);
453
454 const APInt *Scale;
455 if (match(S: ExitCount, P: m_scev_Mul(Op0: m_scev_APInt(C&: Scale), Op1: m_SCEVVScale())))
456 if (cast<SCEVMulExpr>(Val: ExitCount)->hasNoUnsignedWrap())
457 if (Scale->getActiveBits() <= 32)
458 return ElementCount::getScalable(MinVal: Scale->getZExtValue());
459
460 return ElementCount::getFixed(MinVal: 0);
461}
462
463/// Get the maximum trip count for \p L from the SCEV unsigned range, excluding
464/// zero from the range. Only valid when not folding the tail, as the minimum
465/// iteration count check guards against a zero trip count. Returns 0 if
466/// unknown.
467static unsigned getMaxTCFromNonZeroRange(PredicatedScalarEvolution &PSE,
468 Loop *L) {
469 const SCEV *BTC = PSE.getBackedgeTakenCount();
470 if (isa<SCEVCouldNotCompute>(Val: BTC))
471 return 0;
472 ScalarEvolution *SE = PSE.getSE();
473 const SCEV *TripCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
474 ConstantRange TCRange = SE->getUnsignedRange(S: TripCount);
475 APInt MaxTCFromRange = TCRange.getUnsignedMax();
476 if (!MaxTCFromRange.isZero() && MaxTCFromRange.getActiveBits() <= 32)
477 return MaxTCFromRange.getZExtValue();
478 return 0;
479}
480
481/// Returns "best known" trip count, which is either a valid positive trip count
482/// or std::nullopt when an estimate cannot be made (including when the trip
483/// count would overflow), for the specified loop \p L as defined by the
484/// following procedure:
485/// 1) Returns exact trip count if it is known.
486/// 2) Returns expected trip count according to profile data if any.
487/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
488/// 4) Returns the maximum trip count from the SCEV range excluding zero,
489/// if \p CanUseConstantMax and \p CanExcludeZeroTrips.
490/// 5) Returns std::nullopt if all of the above failed.
491static std::optional<ElementCount>
492getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
493 bool CanUseConstantMax = true,
494 bool CanExcludeZeroTrips = false) {
495 // Check if exact trip count is known.
496 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
497 return ExpectedTC;
498
499 // Check if there is an expected trip count available from profile data.
500 if (LoopVectorizeWithBlockFrequency)
501 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
502 return ElementCount::getFixed(MinVal: *EstimatedTC);
503
504 if (!CanUseConstantMax)
505 return std::nullopt;
506
507 // Check if upper bound estimate is known.
508 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
509 return ElementCount::getFixed(MinVal: ExpectedTC);
510
511 // Get the maximum trip count from the SCEV range excluding zero. This is
512 // only safe when not folding the tail, as the minimum iteration count check
513 // prevents entering the vector loop with a zero trip count.
514 if (CanUseConstantMax && CanExcludeZeroTrips)
515 if (unsigned RefinedTC = getMaxTCFromNonZeroRange(PSE, L))
516 return ElementCount::getFixed(MinVal: RefinedTC);
517
518 return std::nullopt;
519}
520
521namespace {
522// Forward declare GeneratedRTChecks.
523class GeneratedRTChecks;
524
525using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
526} // namespace
527
528namespace llvm {
529
// Out-of-line definition of the analysis key identifying the
// ShouldRunExtraVectorPasses marker analysis.
530AnalysisKey ShouldRunExtraVectorPasses::Key;
531
532/// InnerLoopVectorizer vectorizes loops which contain only one basic
533/// block to a specified vectorization factor (VF).
534/// This class performs the widening of scalars into vectors, or multiple
535/// scalars. This class also implements the following features:
536/// * It inserts an epilogue loop for handling loops that don't have iteration
537/// counts that are known to be a multiple of the vectorization factor.
538/// * It handles the code generation for reduction variables.
539/// * Scalarization (implementation using scalars) of un-vectorizable
540/// instructions.
541/// InnerLoopVectorizer does not perform any vectorization-legality
542/// checks, and relies on the caller to check for the different legality
543/// aspects. The InnerLoopVectorizer relies on the
544/// LoopVectorizationLegality class to provide information about the induction
545/// and reduction variables that were found to a given vectorization factor.
546class InnerLoopVectorizer {
547public:
  /// Construct a vectorizer for \p OrigLoop that widens by \p VecWidth lanes
  /// and unrolls by \p UnrollFactor, executing \p Plan. The vector preheader
  /// block is looked up from \p Plan as the single predecessor of the vector
  /// loop region.
548 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
549 LoopInfo *LI, DominatorTree *DT,
550 const TargetTransformInfo *TTI, AssumptionCache *AC,
551 ElementCount VecWidth, unsigned UnrollFactor,
552 LoopVectorizationCostModel *CM,
553 GeneratedRTChecks &RTChecks, VPlan &Plan)
554 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
555 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
556 Cost(CM), RTChecks(RTChecks), Plan(Plan),
557 VectorPHVPBB(cast<VPBasicBlock>(
558 Val: Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
559
  /// Virtual so instances of the epilogue-vectorizer subclasses below can be
  /// destroyed through a base pointer.
560 virtual ~InnerLoopVectorizer() = default;
561
562 /// Creates a basic block for the scalar preheader. Both
563 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
564 /// the method to create additional blocks and checks needed for epilogue
565 /// vectorization.
566 virtual BasicBlock *createVectorizedLoopSkeleton();
567
568 /// Fix the vectorized code, taking care of header phi's, and more.
569 void fixVectorizedLoop(VPTransformState &State);
570
571 /// Fix the non-induction PHIs in \p Plan.
572 void fixNonInductionPHIs(VPTransformState &State);
573
574protected:
575 friend class LoopVectorizationPlanner;
576
577 /// Create and return a new IR basic block for the scalar preheader whose name
578 /// is prefixed with \p Prefix.
579 BasicBlock *createScalarPreheader(StringRef Prefix);
580
581 /// Allow subclasses to override and print debug traces before/after vplan
582 /// execution, when trace information is requested.
583 virtual void printDebugTracesAtStart() {}
584 virtual void printDebugTracesAtEnd() {}
585
586 /// The original loop.
587 Loop *OrigLoop;
588
589 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
590 /// dynamic knowledge to simplify SCEV expressions and converts them to a
591 /// more usable form.
592 PredicatedScalarEvolution &PSE;
593
594 /// Loop Info.
595 LoopInfo *LI;
596
597 /// Dominator Tree.
598 DominatorTree *DT;
599
600 /// Target Transform Info.
601 const TargetTransformInfo *TTI;
602
603 /// Assumption Cache.
604 AssumptionCache *AC;
605
606 /// The vectorization SIMD factor to use. Each vector will have this many
607 /// vector elements.
608 ElementCount VF;
609
610 /// The vectorization unroll factor to use. Each scalar is vectorized to this
611 /// many different vector instructions.
612 unsigned UF;
613
614 /// The builder that we use
615 IRBuilder<> Builder;
616
617 // --- Vectorization state ---
618
619 /// The profitablity analysis.
620 LoopVectorizationCostModel *Cost;
621
622 /// Structure to hold information about generated runtime checks, responsible
623 /// for cleaning the checks, if vectorization turns out unprofitable.
624 GeneratedRTChecks &RTChecks;
625
  /// The VPlan executed to vectorize this loop.
626 VPlan &Plan;
627
628 /// The vector preheader block of \p Plan, used as target for check blocks
629 /// introduced during skeleton creation.
630 VPBasicBlock *VectorPHVPBB;
631};
632
633/// Encapsulate information regarding vectorization of a loop and its epilogue.
634/// This information is meant to be updated and used across two stages of
635/// epilogue vectorization.
636struct EpilogueLoopVectorizationInfo {
637 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
638 unsigned MainLoopUF = 0;
639 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
640 unsigned EpilogueUF = 0;
641 BasicBlock *MainLoopIterationCountCheck = nullptr;
642 BasicBlock *EpilogueIterationCountCheck = nullptr;
643 Value *VectorTripCount = nullptr;
  /// The VPlan executed for the epilogue (second-stage) vector loop.
644 VPlan &EpiloguePlan;
645
  /// \p EUF must be 1: a high unroll factor for the epilogue loop is likely
  /// not beneficial (enforced by the assert below).
646 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
647 ElementCount EVF, unsigned EUF,
648 VPlan &EpiloguePlan)
649 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
650 EpiloguePlan(EpiloguePlan) {
651 assert(EUF == 1 &&
652 "A high UF for the epilogue loop is likely not beneficial.");
653 }
654};
655
656/// An extension of the inner loop vectorizer that creates a skeleton for a
657/// vectorized loop that has its epilogue (residual) also vectorized.
658/// The idea is to run the vplan on a given loop twice, firstly to setup the
659/// skeleton and vectorize the main loop, and secondly to complete the skeleton
660/// from the first step and vectorize the epilogue. This is achieved by
661/// deriving two concrete strategy classes from this base class and invoking
662/// them in succession from the loop vectorizer planner.
663class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
664public:
665 InnerLoopAndEpilogueVectorizer(
666 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
667 DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
668 EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
669 GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
670 ElementCount MinProfitableTripCount, unsigned UnrollFactor)
671 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
672 UnrollFactor, CM, Checks, Plan),
673 EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}
674
675 /// Holds and updates state information required to vectorize the main loop
676 /// and its epilogue in two separate passes. This setup helps us avoid
677 /// regenerating and recomputing runtime safety checks. It also helps us to
678 /// shorten the iteration-count-check path length for the cases where the
679 /// iteration count of the loop is so small that the main vector loop is
680 /// completely skipped.
681 EpilogueLoopVectorizationInfo &EPI;
682
683protected:
  // NOTE(review): presumably the smallest trip count for which running the
  // vector loop is profitable (used by the skeleton's iteration-count
  // checks) — confirm against the subclass constructors, which pass the VF
  // here.
684 ElementCount MinProfitableTripCount;
685};
686
687/// A specialized derived class of inner loop vectorizer that performs
688/// vectorization of *main* loops in the process of vectorizing loops and their
689/// epilogues.
690class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
691public:
  /// Forwards to the base class using the *main* loop's VF/UF from \p EPI.
  /// Note EPI.MainLoopVF is deliberately passed twice: once as VecWidth and
  /// once as MinProfitableTripCount.
692 EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
693 LoopInfo *LI, DominatorTree *DT,
694 const TargetTransformInfo *TTI,
695 AssumptionCache *AC,
696 EpilogueLoopVectorizationInfo &EPI,
697 LoopVectorizationCostModel *CM,
698 GeneratedRTChecks &Check, VPlan &Plan)
699 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
700 Check, Plan, EPI.MainLoopVF,
701 EPI.MainLoopVF, EPI.MainLoopUF) {}
702
703protected:
704 void printDebugTracesAtStart() override;
705 void printDebugTracesAtEnd() override;
706};
707
708// A specialized derived class of inner loop vectorizer that performs
709// vectorization of *epilogue* loops in the process of vectorizing loops and
710// their epilogues.
711class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
712public:
  /// Forwards to the base class using the *epilogue* loop's VF/UF from
  /// \p EPI. As in EpilogueVectorizerMainLoop, the VF doubles as the
  /// minimum profitable trip count.
713 EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
714 LoopInfo *LI, DominatorTree *DT,
715 const TargetTransformInfo *TTI,
716 AssumptionCache *AC,
717 EpilogueLoopVectorizationInfo &EPI,
718 LoopVectorizationCostModel *CM,
719 GeneratedRTChecks &Checks, VPlan &Plan)
720 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
721 Checks, Plan, EPI.EpilogueVF,
722 EPI.EpilogueVF, EPI.EpilogueUF) {}
723 /// Implements the interface for creating a vectorized skeleton using the
724 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
725 BasicBlock *createVectorizedLoopSkeleton() final;
726
727protected:
728 void printDebugTracesAtStart() override;
729 void printDebugTracesAtEnd() override;
730};
731} // end namespace llvm
732
733/// Look for a meaningful debug location on the instruction or its operands.
734static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
735 if (!I)
736 return DebugLoc::getUnknown();
737
738 DebugLoc Empty;
739 if (I->getDebugLoc() != Empty)
740 return I->getDebugLoc();
741
742 for (Use &Op : I->operands()) {
743 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
744 if (OpInst->getDebugLoc() != Empty)
745 return OpInst->getDebugLoc();
746 }
747
748 return I->getDebugLoc();
749}
750
751/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
752/// is passed, the message relates to that particular instruction.
753#ifndef NDEBUG
754static void debugVectorizationMessage(const StringRef Prefix,
755 const StringRef DebugMsg,
756 Instruction *I) {
757 dbgs() << "LV: " << Prefix << DebugMsg;
758 if (I != nullptr)
759 dbgs() << " " << *I;
760 else
761 dbgs() << '.';
762 dbgs() << '\n';
763}
764#endif
765
766/// Create an analysis remark that explains why vectorization failed
767///
768/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
769/// RemarkName is the identifier for the remark. If \p I is passed it is an
770/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
771/// the location of the remark. If \p DL is passed, use it as debug location for
772/// the remark. \return the remark object that can be streamed to.
773static OptimizationRemarkAnalysis
774createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
775 Instruction *I, DebugLoc DL = {}) {
776 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
777 // If debug location is attached to the instruction, use it. Otherwise if DL
778 // was not provided, use the loop's.
779 if (I && I->getDebugLoc())
780 DL = I->getDebugLoc();
781 else if (!DL)
782 DL = TheLoop->getStartLoc();
783
784 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
785}
786
namespace llvm {

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, EC: VF);
}

/// Report that vectorization failed: print \p DebugMsg to the debug stream
/// and emit an analysis remark with tag \p ORETag carrying \p OREMsg. The
/// remark is attached to \p I if non-null, otherwise to \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
                             I, DL)
            << Msg);
}

/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  ORE->emit(RemarkBuilder: [&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}

} // end namespace llvm
838
839namespace llvm {
840
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
863
864/// LoopVectorizationCostModel - estimates the expected speedups due to
865/// vectorization.
866/// In many cases vectorization is not profitable. This can happen because of
867/// a number of reasons. In this class we mainly attempt to predict the
868/// expected speedup/slowdowns due to the supported instruction set. We use the
869/// TargetTransformInfo to query the different backends for the cost of
870/// different operations.
871class LoopVectorizationCostModel {
872 friend class LoopVectorizationPlanner;
873
874public:
  /// Construct a cost model for loop \p L. Initializes the vscale used for
  /// tuning when scalable vectors are supported (or forced), and selects the
  /// cost kind: code size when \p F has minsize, reciprocal throughput
  /// otherwise.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE,
                             std::function<BlockFrequencyInfo &()> GetBFI,
                             const Function *F, const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI, bool OptForSize)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
        TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
        OptForSize(OptForSize) {
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
  }
893
  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen, i.e. the
  /// expected cost at \p UserVF is valid.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
    return expectedCost(VF: UserVF).isValid();
  }

  /// \return True if maximizing vector bandwidth is enabled by the target or
  /// user options, for the given register kind.
  bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

  /// \return True if register pressure should be considered for the given VF.
  bool shouldConsiderRegPressureForVF(ElementCount VF);

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }
955
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }

  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF. Requires the scalarization-profitability
  /// analysis (collectInstsToScalarize) to have run for \p VF already.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    auto Scalars = InstsToScalarize.find(Key: VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(Key: I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(Val: I))
      return false;

    // With a scalar VF every value is trivially uniform per lane.
    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(Val: VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(Ptr: I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // With a scalar VF everything stays scalar.
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(Val: VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(Ptr: I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    // Truncs must truncate at most to their destination type.
    if (isa_and_nonnull<TruncInst>(Val: I) && MinBWs.contains(Key: I) &&
        I->getType()->getScalarSizeInBits() < MinBWs.lookup(Key: I))
      return false;
    // Only narrow instructions that will actually be widened (not scalarized).
    return VF.isVector() && MinBWs.contains(Key: I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1023
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Part of an interleaved-access group.
    CM_GatherScatter,
    CM_Scalarize,
    CM_VectorCall,    // Call lowered to a vector variant.
    CM_IntrinsicCall  // Call lowered to a vector intrinsic.
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[{I, VF}] = {W, Cost};
  }
1043
1044 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1045 /// interleaving group \p Grp and vector width \p VF.
1046 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1047 ElementCount VF, InstWidening W,
1048 InstructionCost Cost) {
1049 assert(VF.isVector() && "Expected VF >=2");
1050 /// Broadcast this decicion to all instructions inside the group.
1051 /// When interleaving, the cost will only be assigned one instruction, the
1052 /// insert position. For other cases, add the appropriate fraction of the
1053 /// total cost to each instruction. This ensures accurate costs are used,
1054 /// even if the insert position instruction is not used.
1055 InstructionCost InsertPosCost = Cost;
1056 InstructionCost OtherMemberCost = 0;
1057 if (W != CM_Interleave)
1058 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1059 ;
1060 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1061 if (auto *I = Grp->getMember(Index: Idx)) {
1062 if (Grp->getInsertPos() == I)
1063 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1064 else
1065 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1066 }
1067 }
1068 }
1069
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // Decisions are keyed on (instruction, VF) pairs.
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    auto Itr = WideningDecisions.find(Val: InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }
1085
1086 /// Return the vectorization cost for the given instruction \p I and vector
1087 /// width \p VF.
1088 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1089 assert(VF.isVector() && "Expected VF >=2");
1090 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1091 assert(WideningDecisions.contains(InstOnVF) &&
1092 "The cost is not calculated");
1093 return WideningDecisions[InstOnVF].second;
1094 }
1095
  /// The result of a call-widening decision for a call at a given VF, plus
  /// the data needed to lower it (see setVectorizedCallDecision).
  struct CallWideningDecision {
    InstWidening Kind;               // How the call will be lowered.
    Function *Variant;               // Vector variant to call, if any.
    Intrinsic::ID IID;               // Vector intrinsic to use, if any.
    std::optional<unsigned> MaskPos; // Position of the mask argument, if any.
    InstructionCost Cost;            // Cost computed for this decision.
  };

  /// Record the call-widening decision for \p CI at width \p VF.
  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost};
  }

  /// Return the recorded call-widening decision for \p CI at width \p VF, or
  /// a CM_Unknown decision if none was recorded.
  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    auto I = CallWideningDecisions.find(Val: {CI, VF});
    if (I == CallWideningDecisions.end())
      return {.Kind: CM_Unknown, .Variant: nullptr, .IID: Intrinsic::not_intrinsic, .MaskPos: std::nullopt, .Cost: 0};
    return I->second;
  }
1120
  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(Val: I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate, widened to \p VF.
    Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
    Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(i_nocapture: 0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(V: Op);
  }
1146
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect values that will not be widened, including Uniforms, Scalars, and
  /// Instructions to Scalarize for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
    // Do the analysis once; a Uniforms entry for \p VF marks it as done.
    // Scalar VFs need none of this analysis.
    if (VF.isScalar() || Uniforms.contains(Val: VF))
      return;
    // Order matters: later steps consume the decisions made by earlier ones.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
    collectInstsToScalarize(VF);
  }
1168
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
                          unsigned AddressSpace) const {
    // ForceTargetSupportsMaskedMemoryOps overrides the TTI legality query;
    // consecutive access is required either way.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           (ForceTargetSupportsMaskedMemoryOps ||
            TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace));
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
                         unsigned AddressSpace) const {
    // Mirrors isLegalMaskedStore, with the load-side TTI query.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           (ForceTargetSupportsMaskedMemoryOps ||
            TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace));
  }

  /// Returns true if the target machine can represent \p V as a masked gather
  /// or scatter operation.
  bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
    bool LI = isa<LoadInst>(Val: V);
    bool SI = isa<StoreInst>(Val: V);
    if (!LI && !SI)
      return false;
    auto *Ty = getLoadStoreType(I: V);
    Align Align = getLoadStoreAlignment(I: V);
    // Query legality on the widened type for vector VFs.
    if (VF.isVector())
      Ty = VectorType::get(ElementType: Ty, EC: VF);
    return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
           (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
  }
1201
  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }

  /// Given costs for both strategies, return true if the scalar predication
  /// lowering should be used for div/rem. This incorporates an override
  /// option so it is not simply a cost comparison.
  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
                                     InstructionCost SafeDivisorCost) const {
    switch (ForceSafeDivisor) {
    case cl::BOU_UNSET:
      // No override: pick whichever strategy is cheaper.
      return ScalarCost < SafeDivisorCost;
    case cl::BOU_TRUE:
      // Safe-divisor lowering forced on: never choose scalarization.
      return false;
    case cl::BOU_FALSE:
      // Safe-divisor lowering forced off: always choose scalarization.
      return true;
    }
    llvm_unreachable("impossible case value");
  }
1226
  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF);

  /// Wrapper function for LoopVectorizationLegality::isMaskRequired,
  /// that passes the Instruction \p I and if we fold tail.
  bool isMaskRequired(Instruction *I) const;

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime. The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// A helper function that returns how much we should divide the cost of a
  /// predicated block by. Typically this is the reciprocal of the block
  /// probability, i.e. if we return X we are assuming the predicated block will
  /// execute once for every X iterations of the loop header so the block should
  /// only contribute 1/X of its cost to the total cost calculation, but when
  /// optimizing for code size it will just be 1 as code size costs don't depend
  /// on execution probabilities.
  ///
  /// Note that if a block wasn't originally predicated but was predicated due
  /// to tail folding, the divisor will still be 1 because it will execute for
  /// every iteration of the loop header.
  inline uint64_t
  getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
                          const BasicBlock *BB);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
  std::pair<InstructionCost, InstructionCost>
  getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1267
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) const {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) const {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }

  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop. \p IsVectorizing indicates
  /// whether we are currently vectorizing; the interleave-group constraint
  /// only applies in that case.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    // If no scalar epilogue is allowed at all, we cannot require one.
    if (!isScalarEpilogueAllowed()) {
      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
      return false;
    }
    // If we might exit from anywhere but the latch and early exit vectorization
    // is disabled, we must run the exiting iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
                           "from latch block\n");
      return true;
    }
    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                           "interleaved group requires scalar epilogue\n");
      return true;
    }
    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
    return false;
  }
1311
  /// Returns true if a scalar epilogue is allowed (e.g., not prevented by
  /// optsize or a loop hint annotation).
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if tail-folding is preferred over a scalar epilogue.
  bool preferPredicatedLoop() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
           ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
  }

  /// Returns the TailFoldingStyle that is best for the current loop, as
  /// previously chosen by setTailFoldingStyle().
  TailFoldingStyle getTailFoldingStyle() const {
    return ChosenTailFoldingStyle;
  }
1328
  /// Selects and saves TailFoldingStyle. Must be called at most once.
  /// \param IsScalableVF true if scalable vector factors enabled.
  /// \param UserIC User specific interleave count.
  void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC) {
    assert(ChosenTailFoldingStyle == TailFoldingStyle::None &&
           "Tail folding must not be selected yet.");
    // If the tail cannot legally be folded by masking, no style applies.
    if (!Legal->canFoldTailByMasking()) {
      ChosenTailFoldingStyle = TailFoldingStyle::None;
      return;
    }

    // Default to TTI preference, but allow command line override.
    ChosenTailFoldingStyle = TTI.getPreferredTailFoldingStyle();
    if (ForceTailFoldingStyle.getNumOccurrences())
      ChosenTailFoldingStyle = ForceTailFoldingStyle.getValue();

    if (ChosenTailFoldingStyle != TailFoldingStyle::DataWithEVL)
      return;
    // Override EVL styles if needed.
    // FIXME: Investigate opportunity for fixed vector factor.
    // EVL requires no interleaving, scalable VFs, target support for active
    // vector length, and the inner-loop (non-VPlan-native) path.
    bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
    if (EVLIsLegal)
      return;
    // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
    // if it's allowed, or DataWithoutLaneMask otherwise.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
      ChosenTailFoldingStyle = TailFoldingStyle::None;
    else
      ChosenTailFoldingStyle = TailFoldingStyle::DataWithoutLaneMask;

    LLVM_DEBUG(
        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
                  "not try to generate VP Intrinsics "
               << (UserIC > 1
                       ? "since interleave count specified is greater than 1.\n"
                       : "due to non-interleaving reasons.\n"));
  }
1368
  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const {
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }

  /// Returns true if the use of wide lane masks is requested and the loop is
  /// using tail-folding with a lane mask for control flow.
  bool useWideActiveLaneMask() const {
    if (!EnableWideActiveLaneMask)
      return false;

    return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow;
  }

  /// Return maximum safe number of elements to be processed per vector
  /// iteration, which do not prevent store-load forwarding and are safe with
  /// regard to the memory dependencies. Required for EVL-based VPlans to
  /// correctly calculate AVL (application vector length) as min(remaining AVL,
  /// MaxSafeElements).
  /// TODO: need to consider adjusting cost model to use this value as a
  /// vectorization factor for EVL-based vectorization.
  std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Returns true if VP intrinsics with explicit vector length support should
  /// be generated in the tail folded loop.
  bool foldTailWithEVL() const {
    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Ptr: Phi);
  }

  /// Returns the set of in-loop reduction PHIs.
  const SmallPtrSetImpl<PHINode *> &getInLoopReductions() const {
    return InLoopReductions;
  }

  /// Returns true if the predicated reduction select should be used to set the
  /// incoming value for the reduction phi.
  bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const {
    // Force to use predicated reduction select since the EVL of the
    // second-to-last iteration might not be VF*UF.
    if (foldTailWithEVL())
      return true;

    // Note: For FindLast recurrences we prefer a predicated select to simplify
    // matching in handleFindLastReductions(), rather than handle multiple
    // cases.
    if (RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RecurrenceKind))
      return true;

    return PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect();
  }
1432
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  InstructionCost expectedCost(ElementCount VF);

  /// Returns true if any predicated stores were recorded by the cost model.
  bool hasPredStores() const { return NumPredStores > 0; }

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is the interleave count chosen for the original loop; an
  /// additional scaling factor applied to VF before comparing to
  /// EpilogueVectorizationMinVF.
  bool isEpilogueVectorizationProfitable(const ElementCount VF,
                                         const unsigned IC) const;

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
                                                         ElementCount VF,
                                                         Type *VectorTy) const;

  /// Returns true if \p Op should be considered invariant and if it is
  /// trivially hoistable.
  bool shouldConsiderInvariant(Value *Op);

  /// Return the value of vscale used for tuning the cost model.
  std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1483
private:
  /// Number of predicated stores found in the loop; queried via
  /// hasPredStores().
  unsigned NumPredStores = 0;

  /// Used to store the value of vscale used for tuning the cost model. It is
  /// initialized during object construction.
  std::optional<unsigned> VScaleForTuning;
1490
1491 /// Initializes the value of vscale used for tuning the cost model. If
1492 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1493 /// return the value returned by the corresponding TTI method.
1494 void initializeVScaleForTuning() {
1495 const Function *Fn = TheLoop->getHeader()->getParent();
1496 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1497 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1498 auto Min = Attr.getVScaleRangeMin();
1499 auto Max = Attr.getVScaleRangeMax();
1500 if (Max && Min == Max) {
1501 VScaleForTuning = Max;
1502 return;
1503 }
1504 }
1505
1506 VScaleForTuning = TTI.getVScaleForTuning();
1507 }
1508
  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF, unsigned UserIC,
                                           bool FoldTailByMasking);

  /// If \p VF * \p UserIC > \p MaxTripCount, clamps VF to the next lower VF
  /// that results in VF * UserIC <= MaxTripCount.
  ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
                                     unsigned UserIC,
                                     bool FoldTailByMasking) const;

  /// \return the maximized element count based on the targets vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF, unsigned UserIC,
                                       bool FoldTailByMasking);

  /// Checks if scalable vectorization is supported and enabled. Caches the
  /// result to avoid repeated debug dumps for repeated queries.
  bool isScalableVectorizationAllowed();

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for a scalarized memory instruction \p I.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for an interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for a Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform
  /// pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing instruction \p I. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;
1567
  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block, collected per VF.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// Control finally chosen tail folding style.
  TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;

  /// True if scalable vectorization is supported and enabled; unset until the
  /// result is cached by isScalableVectorizationAllowed().
  std::optional<bool> IsScalableVectorizationAllowed;

  /// Maximum safe number of elements to be processed per vector iteration,
  /// which do not prevent store-load forwarding and are safe with regard to the
  /// memory dependencies. Required for EVL-based vectorization, where this
  /// value is used as the upper bound of the safe AVL.
  std::optional<unsigned> MaxSafeElements;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1629
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// Widening decisions and their costs, keyed by (instruction, VF).
  DecisionList WideningDecisions;

  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  /// Call-widening decisions, keyed by (call, VF).
  CallDecisionList CallWideningDecisions;
1670
1671 /// Returns true if \p V is expected to be vectorized and it needs to be
1672 /// extracted.
1673 bool needsExtract(Value *V, ElementCount VF) const {
1674 Instruction *I = dyn_cast<Instruction>(Val: V);
1675 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1676 TheLoop->isLoopInvariant(V: I) ||
1677 getWideningDecision(I, VF) == CM_Scalarize ||
1678 (isa<CallInst>(Val: I) &&
1679 getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize))
1680 return false;
1681
1682 // Assume we can vectorize V (and hence we need extraction) if the
1683 // scalars are not computed yet. This can happen, because it is called
1684 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1685 // the scalars are collected. That should be a safe assumption in most
1686 // cases, because we check if the operands have vectorizable types
1687 // beforehand in LoopVectorizationLegality.
1688 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1689 };
1690
1691 /// Returns a range containing only operands needing to be extracted.
1692 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1693 ElementCount VF) const {
1694
1695 SmallPtrSet<const Value *, 4> UniqueOperands;
1696 SmallVector<Value *, 4> Res;
1697 for (Value *Op : Ops) {
1698 if (isa<Constant>(Val: Op) || !UniqueOperands.insert(Ptr: Op).second ||
1699 !needsExtract(V: Op, VF))
1700 continue;
1701 Res.push_back(Elt: Op);
1702 }
1703 return Res;
1704 }
1705
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
  /// unless necessary, e.g. when the loop isn't legal to vectorize or when
  /// there is no predication.
  std::function<BlockFrequencyInfo &()> GetBFI;
  /// The BlockFrequencyInfo returned from GetBFI. Lazily populated by
  /// getBFI(); stays null until first queried.
  BlockFrequencyInfo *BFI = nullptr;
1740 /// Returns the BlockFrequencyInfo for the function if cached, otherwise
1741 /// fetches it via GetBFI. Avoids an indirect call to the std::function.
1742 BlockFrequencyInfo &getBFI() {
1743 if (!BFI)
1744 BFI = &GetBFI();
1745 return *BFI;
1746 }
1747
  /// The enclosing function; presumably the parent of TheLoop (set at a
  /// construction site outside this view — confirm there).
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

  /// Whether this loop should be optimized for size based on function attribute
  /// or profile information.
  bool OptForSize;

  /// The highest VF possible for this loop, without using MaxBandwidth.
  FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
};
1776} // end namespace llvm
1777
1778namespace {
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr no SCEV checks have been generated.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr no memory runtime checks have been generated.
  Value *MemRuntimeCheckCond = nullptr;

  /// Analyses kept up to date while the temporary check blocks are linked and
  /// unlinked.
  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  /// Expanders used to materialize the SCEV predicate and the memory runtime
  /// check conditions, respectively.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  /// Set by create() when the number of runtime pointer checks exceeds
  /// VectorizeMemoryCheckThreshold; the check conditions are then constant
  /// true (always failing) and getCost() reports an invalid cost.
  bool CostTooHigh = false;

  /// Parent loop of the vectorized loop, if any; used by getCost() to
  /// discount memory checks that are invariant in the outer loop.
  Loop *OuterLoop = nullptr;

  PredicatedScalarEvolution &PSE;

  /// The kind of cost that we are calculating
  TTI::TargetCostKind CostKind;

public:
  /// Sets up the two expanders; no IR is created until create() is called.
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    LoopInfo *LI, TargetTransformInfo *TTI,
                    TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI),
        SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        PSE(PSE), CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
              OptimizationRemarkEmitter &ORE) {

    // Hard cutoff to limit compile-time increase in case a very large number of
    // runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh) {
      // Mark runtime checks as never succeeding when they exceed the threshold.
      MemRuntimeCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      SCEVCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      ORE.emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: too many memory checks needed";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      return;
    }

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
                                  MSSAU: nullptr, BBName: "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
      if (isa<Constant>(Val: SCEVCheckCond)) {
        // Clean up directly after expanding the predicate to a constant, to
        // avoid further expansions re-using anything left over from SCEVExp.
        SCEVExpanderCleaner SCEVCleaner(SCEVExp);
        SCEVCleaner.cleanup();
      }
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
                                 BBName: "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        // The runtime VF is computed once and shared by all diff checks.
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
            GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
            Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    // Drop expander instructions that ended up unused by the final condition.
    SCEVExp.eraseDeadInstructions(Root: SCEVCheckCond);

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(V: Preheader);

    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(BB: MemCheckBlock);
      LI->removeBlock(BB: MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(BB: SCEVCheckBlock);
      LI->removeBlock(BB: SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  /// Returns the summed cost of the instructions in both check blocks, or an
  /// invalid cost when the number of checks exceeded the threshold. Memory
  /// checks invariant in the outer loop are discounted by its trip count.
  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        // The block terminator is not part of the check computation itself.
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
        if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
            if (EstimatedTC->isFixed())
              BestTripCount = EstimatedTC->getFixedValue();

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
                                     b: (InstructionCost::CostType)1);

          if (BestTripCount > 1)
            LLVM_DEBUG(dbgs()
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A check block with predecessors was linked back into the IR and must be
    // kept; a block with no predecessors was never used and can be deleted.
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(I: &I))
          continue;
        SE.forgetValue(V: &I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }

  /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getSCEVChecks() const {
    using namespace llvm::PatternMatch;
    // A null or constant-false condition means no SCEV check is needed.
    if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};

    return {SCEVCheckCond, SCEVCheckBlock};
  }

  /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
    using namespace llvm::PatternMatch;
    if (MemRuntimeCheckCond && match(V: MemRuntimeCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};
    return {MemRuntimeCheckCond, MemCheckBlock};
  }

  /// Return true if any runtime checks have been added
  bool hasChecks() const {
    return getSCEVChecks().first || getMemRuntimeChecks().first;
  }
};
2078} // namespace
2079
2080static bool useActiveLaneMask(TailFoldingStyle Style) {
2081 return Style == TailFoldingStyle::Data ||
2082 Style == TailFoldingStyle::DataAndControlFlow;
2083}
2084
2085static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2086 return Style == TailFoldingStyle::DataAndControlFlow;
2087}
2088
2089// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2090// vectorization. The loop needs to be annotated with #pragma omp simd
2091// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2092// vector length information is not provided, vectorization is not considered
2093// explicit. Interleave hints are not allowed either. These limitations will be
2094// relaxed in the future.
2095// Please, note that we are currently forced to abuse the pragma 'clang
2096// vectorize' semantics. This pragma provides *auto-vectorization hints*
2097// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2098// provides *explicit vectorization hints* (LV can bypass legal checks and
2099// assume that vectorization is legal). However, both hints are implemented
2100// using the same metadata (llvm.loop.vectorize, processed by
2101// LoopVectorizeHints). This will be fixed in the future when the native IR
2102// representation for pragma 'omp simd' is introduced.
2103static bool isExplicitVecOuterLoop(Loop *OuterLp,
2104 OptimizationRemarkEmitter *ORE) {
2105 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2106 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2107
2108 // Only outer loops with an explicit vectorization hint are supported.
2109 // Unannotated outer loops are ignored.
2110 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2111 return false;
2112
2113 Function *Fn = OuterLp->getHeader()->getParent();
2114 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2115 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2116 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2117 return false;
2118 }
2119
2120 if (Hints.getInterleave() > 1) {
2121 // TODO: Interleave support is future work.
2122 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2123 "outer loops.\n");
2124 Hints.emitRemarkWithHints();
2125 return false;
2126 }
2127
2128 return true;
2129}
2130
2131static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2132 OptimizationRemarkEmitter *ORE,
2133 SmallVectorImpl<Loop *> &V) {
2134 // Collect inner loops and outer loops without irreducible control flow. For
2135 // now, only collect outer loops that have explicit vectorization hints. If we
2136 // are stress testing the VPlan H-CFG construction, we collect the outermost
2137 // loop of every loop nest.
2138 if (L.isInnermost() || VPlanBuildStressTest ||
2139 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2140 LoopBlocksRPO RPOT(&L);
2141 RPOT.perform(LI);
2142 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2143 V.push_back(Elt: &L);
2144 // TODO: Collect inner loops inside marked outer loops in case
2145 // vectorization fails for the outer loop. Do not invoke
2146 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2147 // already known to be reducible. We can use an inherited attribute for
2148 // that.
2149 return;
2150 }
2151 }
2152 for (Loop *InnerL : L)
2153 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2154}
2155
2156//===----------------------------------------------------------------------===//
2157// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2158// LoopVectorizationCostModel and LoopVectorizationPlanner.
2159//===----------------------------------------------------------------------===//
2160
/// Emit code computing the value of the given induction at \p Index:
///   - integer induction: StartValue + Index * Step (a plain sub when Step is
///     the constant -1),
///   - pointer induction: StartValue ptradd (Index * Step),
///   - FP induction:      StartValue <fadd|fsub> (Step * Index), with the
///     opcode taken from \p InductionBinOp.
/// Returns nullptr for IK_NoInduction. \p Index is first sign-extended,
/// truncated, or converted int-to-FP to match the type of \p Step.
///
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
Value *
llvm::emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                           Value *Step,
                           InductionDescriptor::InductionKind InductionKind,
                           const BinaryOperator *InductionBinOp) {
  using namespace llvm::PatternMatch;
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Adds X and Y, folding away an addition of a constant zero.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (match(V: X, P: m_ZeroInt()))
      return Y;
    if (match(V: Y, P: m_ZeroInt()))
      return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count. Multiplications by a
  // constant one are folded away.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (match(V: X, P: m_One()))
      return Y;
    if (match(V: Y, P: m_One()))
      return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step of -1: emit StartValue - Index instead of a mul/add pair.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2239
2240static std::optional<unsigned> getMaxVScale(const Function &F,
2241 const TargetTransformInfo &TTI) {
2242 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2243 return MaxVScale;
2244
2245 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2246 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2247
2248 return std::nullopt;
2249}
2250
/// For the given \p VF and \p UF and maximum trip count computed for the loop,
/// return true if the runtime check for induction-variable overflow is known
/// to always evaluate to false (i.e. the IV cannot overflow in the vectorized
/// loop), in which case the check can be removed. If \p UF is not provided,
/// the target's maximum interleave factor for \p VF is assumed.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = IdxTy->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    // For scalable VFs the per-iteration element count additionally depends on
    // the largest possible vscale; without a bound we must stay conservative.
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // The distance from TC to the type's maximum value must exceed the widest
    // possible per-iteration step (VF * UF) for the increment never to wrap.
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2282
2283// Return whether we allow using masked interleave-groups (for dealing with
2284// strided loads/stores that reside in predicated blocks, or for dealing
2285// with gaps).
2286static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2287 // If an override option has been passed in for interleaved accesses, use it.
2288 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2289 return EnableMaskedInterleavedMemAccesses;
2290
2291 return TTI.enableMaskedInterleavedAccessVectorization();
2292}
2293
2294/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
2295/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
2296/// predecessors and successors of VPBB, if any, are rewired to the new
2297/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
                                             BasicBlock *IRBB,
                                             VPlan *Plan = nullptr) {
  // When VPBB may be unreachable it has no plan parent, so the caller must
  // supply the plan explicitly.
  if (!Plan)
    Plan = VPBB->getPlan();
  VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
  // Move phi recipes first, to the start of the new block, so they stay ahead
  // of any recipes the VPIRBasicBlock was created with.
  auto IP = IRVPBB->begin();
  for (auto &R : make_early_inc_range(Range: VPBB->phis()))
    R.moveBefore(BB&: *IRVPBB, I: IP);

  // Append all remaining (non-phi) recipes at the end of the new block.
  for (auto &R :
       make_early_inc_range(Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end())))
    R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());

  // Rewire all predecessor/successor edges from VPBB to the new block.
  VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  return IRVPBB;
}
2316
/// Split the original loop preheader to create the scalar preheader; the part
/// of the original preheader before the split becomes the vector preheader.
/// Returns the newly created scalar preheader, named \p Prefix + "scalar.ph".
BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  assert(VectorPH && "Invalid loop structure");
  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
         "loops not exiting via the latch without required epilogue?");

  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
  // wrapping the newly created scalar preheader here at the moment, because the
  // Plan's scalar preheader may be unreachable at this point. Instead it is
  // replaced in executePlan.
  return SplitBlock(Old: VectorPH, SplitPt: VectorPH->getTerminator(), DT, LI, MSSAU: nullptr,
                    BBName: Twine(Prefix) + "scalar.ph");
}
2331
2332/// Knowing that loop \p L executes a single vector iteration, add instructions
2333/// that will get simplified and thus should not have any cost to \p
2334/// InstsToIgnore.
2335static void addFullyUnrolledInstructionsToIgnore(
2336 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2337 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2338 auto *Cmp = L->getLatchCmpInst();
2339 if (Cmp)
2340 InstsToIgnore.insert(Ptr: Cmp);
2341 for (const auto &KV : IL) {
2342 // Extract the key by hand so that it can be used in the lambda below. Note
2343 // that captured structured bindings are a C++20 extension.
2344 const PHINode *IV = KV.first;
2345
2346 // Get next iteration value of the induction variable.
2347 Instruction *IVInst =
2348 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2349 if (all_of(Range: IVInst->users(),
2350 P: [&](const User *U) { return U == IV || U == Cmp; }))
2351 InstsToIgnore.insert(Ptr: IVInst);
2352 }
2353}
2354
2355BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2356 // Create a new IR basic block for the scalar preheader.
2357 BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
2358 return ScalarPH->getSinglePredecessor();
2359}
2360
2361namespace {
2362
2363struct CSEDenseMapInfo {
2364 static bool canHandle(const Instruction *I) {
2365 return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
2366 isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
2367 }
2368
2369 static inline Instruction *getEmptyKey() {
2370 return DenseMapInfo<Instruction *>::getEmptyKey();
2371 }
2372
2373 static inline Instruction *getTombstoneKey() {
2374 return DenseMapInfo<Instruction *>::getTombstoneKey();
2375 }
2376
2377 static unsigned getHashValue(const Instruction *I) {
2378 assert(canHandle(I) && "Unknown instruction!");
2379 return hash_combine(args: I->getOpcode(),
2380 args: hash_combine_range(R: I->operand_values()));
2381 }
2382
2383 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2384 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2385 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2386 return LHS == RHS;
2387 return LHS->isIdenticalTo(I: RHS);
2388 }
2389};
2390
2391} // end anonymous namespace
2392
2393/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2394/// removal, in favor of the VPlan-based one.
2395static void legacyCSE(BasicBlock *BB) {
2396 // Perform simple cse.
2397 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2398 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2399 if (!CSEDenseMapInfo::canHandle(I: &In))
2400 continue;
2401
2402 // Check if we can replace this instruction with any of the
2403 // visited instructions.
2404 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2405 In.replaceAllUsesWith(V);
2406 In.eraseFromParent();
2407 continue;
2408 }
2409
2410 CSEMap[&In] = &In;
2411 }
2412}
2413
/// This function returns an estimate of the number of lanes the given
/// ElementCount corresponds to at runtime. For fixed-width VFs this is known
/// precisely at compile time; for scalable VFs it is calculated from an
/// estimate of the vscale value.
2418static unsigned estimateElementCount(ElementCount VF,
2419 std::optional<unsigned> VScale) {
2420 unsigned EstimatedVF = VF.getKnownMinValue();
2421 if (VF.isScalable())
2422 if (VScale)
2423 EstimatedVF *= *VScale;
2424 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2425 return EstimatedVF;
2426}
2427
2428InstructionCost
2429LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2430 ElementCount VF) const {
2431 // We only need to calculate a cost if the VF is scalar; for actual vectors
2432 // we should already have a pre-calculated cost at each VF.
2433 if (!VF.isScalar())
2434 return getCallWideningDecision(CI, VF).Cost;
2435
2436 Type *RetTy = CI->getType();
2437 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
2438 if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
2439 return *RedCost;
2440
2441 SmallVector<Type *, 4> Tys;
2442 for (auto &ArgOp : CI->args())
2443 Tys.push_back(Elt: ArgOp->getType());
2444
2445 InstructionCost ScalarCallCost =
2446 TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);
2447
2448 // If this is an intrinsic we may have a lower cost for it.
2449 if (getVectorIntrinsicIDForCall(CI, TLI)) {
2450 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2451 return std::min(a: ScalarCallCost, b: IntrinsicCost);
2452 }
2453 return ScalarCallCost;
2454}
2455
2456static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2457 if (VF.isScalar() || !canVectorizeTy(Ty))
2458 return Ty;
2459 return toVectorizedTy(Ty, EC: VF);
2460}
2461
2462InstructionCost
2463LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2464 ElementCount VF) const {
2465 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2466 assert(ID && "Expected intrinsic call!");
2467 Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
2468 FastMathFlags FMF;
2469 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
2470 FMF = FPMO->getFastMathFlags();
2471
2472 SmallVector<const Value *> Arguments(CI->args());
2473 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2474 SmallVector<Type *> ParamTys;
2475 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
2476 result: std::back_inserter(x&: ParamTys),
2477 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2478
2479 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2480 dyn_cast<IntrinsicInst>(Val: CI),
2481 InstructionCost::getInvalid());
2482 return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
2483}
2484
/// Finish off the vectorized loop body: complete the widened non-induction
/// phis and clean up redundant instructions in the vector loop header.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  fixNonInductionPHIs(State);

  // Don't apply optimizations below when no (vector) loop remains, as they all
  // require one at the moment.
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
  if (!HeaderVPBB)
    return;

  // Map the header VPBB to the IR basic block generated for it.
  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];

  // Remove redundant induction instructions.
  legacyCSE(BB: HeaderBB);
}
2501
2502void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
2503 auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
2504 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2505 for (VPRecipeBase &P : VPBB->phis()) {
2506 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
2507 if (!VPPhi)
2508 continue;
2509 PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
2510 // Make sure the builder has a valid insert point.
2511 Builder.SetInsertPoint(NewPhi);
2512 for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
2513 NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
2514 }
2515 }
2516}
2517
/// Compute, for \p VF, the set of instructions that will remain scalar (one
/// value per lane) after vectorization. The set is seeded with uniform values
/// and pointers with only scalar memory uses, then grown transitively through
/// GEP chains and induction variables whose users are all scalar.
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert_range(R&: Uniforms[VF]);
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a getelementptr
  // instruction contained in the loop.
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert_range(R&: Uniforms[VF]);

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar unless the operation is a gather or scatter.
  // The value operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // Pointers that are never used non-scalarly are definitely scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    // Src is scalar if all its in-loop users are already scalar or use it as
    // the scalar pointer of a load/store.
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  IsScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert_range(R&: Worklist);
}
2706
2707bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
2708 ElementCount VF) {
2709 if (!isPredicatedInst(I))
2710 return false;
2711
2712 // Do we have a non-scalar lowering for this predicated
2713 // instruction? No - it is scalar with predication.
2714 switch(I->getOpcode()) {
2715 default:
2716 return true;
2717 case Instruction::Call:
2718 if (VF.isScalar())
2719 return true;
2720 return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
2721 case Instruction::Load:
2722 case Instruction::Store: {
2723 auto *Ptr = getLoadStorePointerOperand(V: I);
2724 auto *Ty = getLoadStoreType(I);
2725 unsigned AS = getLoadStoreAddressSpace(I);
2726 Type *VTy = Ty;
2727 if (VF.isVector())
2728 VTy = VectorType::get(ElementType: Ty, EC: VF);
2729 const Align Alignment = getLoadStoreAlignment(I);
2730 return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
2731 TTI.isLegalMaskedGather(DataType: VTy, Alignment))
2732 : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
2733 TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
2734 }
2735 case Instruction::UDiv:
2736 case Instruction::SDiv:
2737 case Instruction::SRem:
2738 case Instruction::URem: {
2739 // We have the option to use the safe-divisor idiom to avoid predication.
2740 // The cost based decision here will always select safe-divisor for
2741 // scalable vectors as scalarization isn't legal.
2742 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
2743 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
2744 }
2745 }
2746}
2747
2748bool LoopVectorizationCostModel::isMaskRequired(Instruction *I) const {
2749 return Legal->isMaskRequired(I, TailFolded: foldTailByMasking());
2750}
2751
2752// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
/// Return true if \p I must execute under a mask in the vectorized loop,
/// either because its block was conditionally executed in the original loop or
/// because tail-folding introduces possibly-inactive lanes whose side-effects
/// matter for this instruction.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // TODO: We can use the loop-preheader as context point here and get
  // context sensitive reasoning for isSafeToSpeculativelyExecute.
  if (isSafeToSpeculativelyExecute(I) ||
      (isa<LoadInst, StoreInst, CallInst>(Val: I) && !isMaskRequired(I)) ||
      isa<UncondBrInst, CondBrInst, SwitchInst, PHINode, AllocaInst>(Val: I))
    return false;

  // If the instruction was executed conditionally in the original scalar loop,
  // predication is needed with a mask whose lanes are all possibly inactive.
  if (Legal->blockNeedsPredication(BB: I->getParent()))
    return true;

  // If we're not folding the tail by masking, predication is unnecessary.
  if (!foldTailByMasking())
    return false;

  // All that remain are instructions with side-effects originally executed in
  // the loop unconditionally, but now execute under a tail-fold mask (only)
  // having at least one active lane (the first). If the side-effects of the
  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
  // - it will cause the same side-effects as when masked.
  switch(I->getOpcode()) {
  default:
    llvm_unreachable(
        "instruction should have been considered by earlier checks");
  case Instruction::Call:
    // Side-effects of a Call are assumed to be non-invariant, needing a
    // (fold-tail) mask.
    assert(isMaskRequired(I) &&
           "should have returned earlier for calls not needing a mask");
    return true;
  case Instruction::Load:
    // If the address is loop invariant no predication is needed.
    return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
  case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads), but also must prove the value being stored
    // is correct. The easiest form of the later is to require that all values
    // stored are the same.
    return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
             TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
  }
  case Instruction::UDiv:
  case Instruction::URem:
    // If the divisor is loop-invariant no predication is needed.
    return !Legal->isInvariant(V: I->getOperand(i: 1));
  case Instruction::SDiv:
  case Instruction::SRem:
    // Conservative for now, since masked-off lanes may be poison and could
    // trigger signed overflow.
    return true;
  }
}
2807
2808uint64_t LoopVectorizationCostModel::getPredBlockCostDivisor(
2809 TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
2810 if (CostKind == TTI::TCK_CodeSize)
2811 return 1;
2812 // If the block wasn't originally predicated then return early to avoid
2813 // computing BlockFrequencyInfo unnecessarily.
2814 if (!Legal->blockNeedsPredication(BB))
2815 return 1;
2816
2817 uint64_t HeaderFreq =
2818 getBFI().getBlockFreq(BB: TheLoop->getHeader()).getFrequency();
2819 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2820 assert(HeaderFreq >= BBFreq &&
2821 "Header has smaller block freq than dominated BB?");
2822 return std::round(x: (double)HeaderFreq / BBFreq);
2823}
2824
/// For a div/rem \p I that needs predication, return the costs of the two
/// possible lowerings at \p VF: {scalarization-with-predication cost,
/// safe-divisor (select + wide op) cost}. The scalarization cost is invalid
/// for scalable VFs, forcing the safe-divisor choice there.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost +=
        VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost +=
        VF.getFixedValue() *
        TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost =
        ScalarizationCost / getPredBlockCostDivisor(CostKind, BB: I->getParent());
  }

  InstructionCost SafeDivisorCost = 0;
  auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);
  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost +=
      TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
                             CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
                             VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // The wide (unpredicated) div/rem itself, fed by the guarded operands.
  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      Opcode: I->getOpcode(), Ty: VecTy, CostKind,
      Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Args: Operands, CxtI: I);
  return {ScalarizationCost, SafeDivisorCost};
}
2882
/// Return true if the interleave group containing \p I can be vectorized with
/// wide loads/stores at \p VF, i.e. without scalarizing any member, taking
/// type legality and masking requirements into account.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) const {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Must have a group.");
  unsigned InterleaveFactor = Group->getFactor();

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(Ty: ScalarTy, DL))
    return false;

  // For scalable vectors, the interleave factors must be <= 8 since we require
  // the (de)interleaveN intrinsics instead of shufflevectors.
  if (VF.isScalable() && InterleaveFactor > 8)
    return false;

  // If the group involves a non-integral pointer, we may not be able to
  // losslessly cast all values to a common type.
  bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
  for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
    Instruction *Member = Group->getMember(Index: Idx);
    if (!Member)
      continue;
    auto *MemberTy = getLoadStoreType(I: Member);
    bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
    // Don't coerce non-integral pointers to integers or vice versa.
    if (MemberNI != ScalarNI)
      // TODO: Consider adding special nullptr value case here
      return false;
    if (MemberNI && ScalarNI &&
        ScalarTy->getPointerAddressSpace() !=
            MemberTy->getPointerAddressSpace())
      return false;
  }

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(BB: I->getParent()) && isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(Val: I) && !Group->isFull();
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  if (Group->isReverse())
    return false;

  // TODO: Support interleaved access that requires a gap mask for scalable VFs.
  bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
                          StoreAccessWithGapsRequiresMasking;
  if (VF.isScalable() && NeedsMaskForGaps)
    return false;

  // Finally, the target must support a legal masked form of the access.
  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
                            : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
}
2961
2962bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
2963 Instruction *I, ElementCount VF) {
2964 // Get and ensure we have a valid memory instruction.
2965 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
2966
2967 auto *Ptr = getLoadStorePointerOperand(V: I);
2968 auto *ScalarTy = getLoadStoreType(I);
2969
2970 // In order to be widened, the pointer should be consecutive, first of all.
2971 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
2972 return false;
2973
2974 // If the instruction is a store located in a predicated block, it will be
2975 // scalarized.
2976 if (isScalarWithPredication(I, VF))
2977 return false;
2978
2979 // If the instruction's allocated size doesn't equal it's type size, it
2980 // requires padding and will be scalarized.
2981 auto &DL = I->getDataLayout();
2982 if (hasIrregularType(Ty: ScalarTy, DL))
2983 return false;
2984
2985 return true;
2986}
2987
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  SmallVector<BasicBlock *> Exiting;
  TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
    if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  // Uniformity is monotone in VF: anything non-uniform at VF/2 cannot become
  // uniform at VF, which IsUniformMemOpUse below exploits as a fast reject.
  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      // A handful of intrinsics (assume, lifetime markers, etc.) only need a
      // single instance when all of their operands are loop-invariant.
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // If the pointer can be proven to be uniform, always add it to the
      // worklist.
      if (isa<Instruction>(Val: Ptr) && Legal->isUniform(V: Ptr, VF))
        AddToWorklistIfAllowed(cast<Instruction>(Val: Ptr));

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
      auto *UI = cast<Instruction>(Val: U);
      return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  // Commit the final uniform set for this VF.
  Uniforms[VF].insert_range(R&: Worklist);
}
3227
3228bool LoopVectorizationCostModel::runtimeChecksRequired() {
3229 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3230
3231 if (Legal->getRuntimePointerChecking()->Need) {
3232 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3233 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3234 "loop with '#pragma clang loop vectorize(enable)' when "
3235 "compiling with -Os/-Oz",
3236 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3237 return true;
3238 }
3239
3240 if (!PSE.getPredicate().isAlwaysTrue()) {
3241 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3242 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3243 "loop with '#pragma clang loop vectorize(enable)' when "
3244 "compiling with -Os/-Oz",
3245 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3246 return true;
3247 }
3248
3249 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3250 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3251 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3252 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3253 "this loop without such check by compiling with -Os/-Oz",
3254 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3255 return true;
3256 }
3257
3258 return false;
3259}
3260
3261bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3262 if (IsScalableVectorizationAllowed)
3263 return *IsScalableVectorizationAllowed;
3264
3265 IsScalableVectorizationAllowed = false;
3266 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3267 return false;
3268
3269 if (Hints->isScalableVectorizationDisabled()) {
3270 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
3271 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
3272 return false;
3273 }
3274
3275 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3276
3277 auto MaxScalableVF = ElementCount::getScalable(
3278 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3279
3280 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3281 // FIXME: While for scalable vectors this is currently sufficient, this should
3282 // be replaced by a more detailed mechanism that filters out specific VFs,
3283 // instead of invalidating vectorization for a whole set of VFs based on the
3284 // MaxVF.
3285
3286 // Disable scalable vectorization if the loop contains unsupported reductions.
3287 if (!canVectorizeReductions(VF: MaxScalableVF)) {
3288 reportVectorizationInfo(
3289 Msg: "Scalable vectorization not supported for the reduction "
3290 "operations found in this loop.",
3291 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3292 return false;
3293 }
3294
3295 // Disable scalable vectorization if the loop contains any instructions
3296 // with element types not supported for scalable vectors.
3297 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
3298 return !Ty->isVoidTy() &&
3299 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3300 })) {
3301 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
3302 "for all element types found in this loop.",
3303 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3304 return false;
3305 }
3306
3307 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
3308 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
3309 "for safe distance analysis.",
3310 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3311 return false;
3312 }
3313
3314 IsScalableVectorizationAllowed = true;
3315 return true;
3316}
3317
3318ElementCount
3319LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3320 if (!isScalableVectorizationAllowed())
3321 return ElementCount::getScalable(MinVal: 0);
3322
3323 auto MaxScalableVF = ElementCount::getScalable(
3324 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3325 if (Legal->isSafeForAnyVectorWidth())
3326 return MaxScalableVF;
3327
3328 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3329 // Limit MaxScalableVF by the maximum safe dependence distance.
3330 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
3331
3332 if (!MaxScalableVF)
3333 reportVectorizationInfo(
3334 Msg: "Max legal vector width too small, scalable vectorization "
3335 "unfeasible.",
3336 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3337
3338 return MaxScalableVF;
3339}
3340
// Compute the maximum feasible fixed and scalable VFs, honoring dependence
// distances, a user-specified VF hint, and the target's register widths.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
    bool FoldTailByMasking) {
  // Analyze value sizes in the loop (MinBWs) and the smallest/widest scalar
  // types it uses; the widest type bounds how many elements fit per register.
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElementsPowerOf2 =
      bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
  // Store-to-load forwarding may impose an even tighter element limit.
  if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
    MaxSafeElementsPowerOf2 =
        std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
  }
  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);

  // Record the element limit for other cost-model queries when the loop is
  // not safe for arbitrary vector widths.
  if (!Legal->isSafeForAnyVectorWidth())
    this->MaxSafeElements = MaxSafeElementsPowerOf2;

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Unsafe scalable hint: remark on why it is ignored, then fall through to
    // the normal VF computation below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Start from trivial factors (fixed VF 1, no scalable VF); the target
  // queries below may raise either component.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, UserIC, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, UserIC, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
3453
3454FixedScalableVFPair
3455LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3456 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3457 // TODO: It may be useful to do since it's still likely to be dynamically
3458 // uniform if the target can skip.
3459 reportVectorizationFailure(
3460 DebugMsg: "Not inserting runtime ptr check for divergent target",
3461 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3462 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3463 return FixedScalableVFPair::getNone();
3464 }
3465
3466 ScalarEvolution *SE = PSE.getSE();
3467 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3468 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3469 if (!MaxTC && ScalarEpilogueStatus == CM_ScalarEpilogueAllowed)
3470 MaxTC = getMaxTCFromNonZeroRange(PSE, L: TheLoop);
3471 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3472 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3473 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3474 if (TC.isScalar()) {
3475 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3476 OREMsg: "loop trip count is one, irrelevant for vectorization",
3477 ORETag: "SingleIterationLoop", ORE, TheLoop);
3478 return FixedScalableVFPair::getNone();
3479 }
3480
3481 // If BTC matches the widest induction type and is -1 then the trip count
3482 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3483 // to vectorize.
3484 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3485 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3486 BTC->getType()->getScalarSizeInBits() >=
3487 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3488 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3489 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3490 reportVectorizationFailure(
3491 DebugMsg: "Trip count computation wrapped",
3492 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3493 ORETag: "TripCountWrapped", ORE, TheLoop);
3494 return FixedScalableVFPair::getNone();
3495 }
3496
3497 switch (ScalarEpilogueStatus) {
3498 case CM_ScalarEpilogueAllowed:
3499 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: false);
3500 case CM_ScalarEpilogueNotAllowedUsePredicate:
3501 [[fallthrough]];
3502 case CM_ScalarEpilogueNotNeededUsePredicate:
3503 LLVM_DEBUG(
3504 dbgs() << "LV: vector predicate hint/switch found.\n"
3505 << "LV: Not allowing scalar epilogue, creating predicated "
3506 << "vector loop.\n");
3507 break;
3508 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3509 // fallthrough as a special case of OptForSize
3510 case CM_ScalarEpilogueNotAllowedOptSize:
3511 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3512 LLVM_DEBUG(
3513 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3514 else
3515 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3516 << "count.\n");
3517
3518 // Bail if runtime checks are required, which are not good when optimising
3519 // for size.
3520 if (runtimeChecksRequired())
3521 return FixedScalableVFPair::getNone();
3522
3523 break;
3524 }
3525
3526 // Now try the tail folding
3527
3528 // Invalidate interleave groups that require an epilogue if we can't mask
3529 // the interleave-group.
3530 if (!useMaskedInterleavedAccesses(TTI)) {
3531 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3532 "No decisions should have been taken at this point");
3533 // Note: There is no need to invalidate any cost modeling decisions here, as
3534 // none were taken so far.
3535 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3536 }
3537
3538 FixedScalableVFPair MaxFactors =
3539 computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: true);
3540
3541 // Avoid tail folding if the trip count is known to be a multiple of any VF
3542 // we choose.
3543 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3544 MaxFactors.FixedVF.getFixedValue();
3545 if (MaxFactors.ScalableVF) {
3546 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3547 if (MaxVScale) {
3548 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3549 a: *MaxPowerOf2RuntimeVF,
3550 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3551 } else
3552 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3553 }
3554
3555 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3556 // Return false if the loop is neither a single-latch-exit loop nor an
3557 // early-exit loop as tail-folding is not supported in that case.
3558 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3559 !Legal->hasUncountableEarlyExit())
3560 return false;
3561 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3562 ScalarEvolution *SE = PSE.getSE();
3563 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3564 // with uncountable exits. For countable loops, the symbolic maximum must
3565 // remain identical to the known back-edge taken count.
3566 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3567 assert((Legal->hasUncountableEarlyExit() ||
3568 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3569 "Invalid loop count");
3570 const SCEV *ExitCount = SE->getAddExpr(
3571 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3572 const SCEV *Rem = SE->getURemExpr(
3573 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3574 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3575 return Rem->isZero();
3576 };
3577
3578 if (MaxPowerOf2RuntimeVF > 0u) {
3579 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3580 "MaxFixedVF must be a power of 2");
3581 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3582 // Accept MaxFixedVF if we do not have a tail.
3583 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3584 return MaxFactors;
3585 }
3586 }
3587
3588 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3589 if (ExpectedTC && ExpectedTC->isFixed() &&
3590 ExpectedTC->getFixedValue() <=
3591 TTI.getMinTripCountTailFoldingThreshold()) {
3592 if (MaxPowerOf2RuntimeVF > 0u) {
3593 // If we have a low-trip-count, and the fixed-width VF is known to divide
3594 // the trip count but the scalable factor does not, use the fixed-width
3595 // factor in preference to allow the generation of a non-predicated loop.
3596 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3597 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3598 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3599 "remain for any chosen VF.\n");
3600 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3601 return MaxFactors;
3602 }
3603 }
3604
3605 reportVectorizationFailure(
3606 DebugMsg: "The trip count is below the minial threshold value.",
3607 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3608 ORE, TheLoop);
3609 return FixedScalableVFPair::getNone();
3610 }
3611
3612 // If we don't know the precise trip count, or if the trip count that we
3613 // found modulo the vectorization factor is not zero, try to fold the tail
3614 // by masking.
3615 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3616 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3617 setTailFoldingStyle(IsScalableVF: ContainsScalableVF, UserIC);
3618 if (foldTailByMasking()) {
3619 if (foldTailWithEVL()) {
3620 LLVM_DEBUG(
3621 dbgs()
3622 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3623 "try to generate VP Intrinsics with scalable vector "
3624 "factors only.\n");
3625 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3626 // for now.
3627 // TODO: extend it for fixed vectors, if required.
3628 assert(ContainsScalableVF && "Expected scalable vector factor.");
3629
3630 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3631 }
3632 return MaxFactors;
3633 }
3634
3635 // If there was a tail-folding hint/switch, but we can't fold the tail by
3636 // masking, fallback to a vectorization with a scalar epilogue.
3637 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3638 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3639 "scalar epilogue instead.\n");
3640 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3641 return MaxFactors;
3642 }
3643
3644 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3645 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3646 return FixedScalableVFPair::getNone();
3647 }
3648
3649 if (TC.isZero()) {
3650 reportVectorizationFailure(
3651 DebugMsg: "unable to calculate the loop count due to complex control flow",
3652 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3653 return FixedScalableVFPair::getNone();
3654 }
3655
3656 reportVectorizationFailure(
3657 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3658 OREMsg: "cannot optimize for size and vectorize at the same time. "
3659 "Enable vectorization of this loop with '#pragma clang loop "
3660 "vectorize(enable)' when compiling with -Os/-Oz",
3661 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3662 return FixedScalableVFPair::getNone();
3663}
3664
3665bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3666 ElementCount VF) {
3667 if (ConsiderRegPressure.getNumOccurrences())
3668 return ConsiderRegPressure;
3669
3670 // TODO: We should eventually consider register pressure for all targets. The
3671 // TTI hook is temporary whilst target-specific issues are being fixed.
3672 if (TTI.shouldConsiderVectorizationRegPressure())
3673 return true;
3674
3675 if (!useMaxBandwidth(RegKind: VF.isScalable()
3676 ? TargetTransformInfo::RGK_ScalableVector
3677 : TargetTransformInfo::RGK_FixedWidthVector))
3678 return false;
3679 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3680 return ElementCount::isKnownGT(
3681 LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3682 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3683}
3684
3685bool LoopVectorizationCostModel::useMaxBandwidth(
3686 TargetTransformInfo::RegisterKind RegKind) {
3687 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3688 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3689 (UseWiderVFIfCallVariantsPresent &&
3690 Legal->hasVectorCallVariants())));
3691}
3692
// Clamp \p VF so that VF * UserIC does not exceed the (adjusted) maximum trip
// count, returning a smaller power-of-two VF when the loop would otherwise be
// dead. Returns \p VF unchanged when no clamping is needed.
ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
    ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
    bool FoldTailByMasking) const {
  // Estimate the runtime lane count. For scalable VFs, refine the known
  // minimum using the lower bound of the function's vscale_range attribute.
  unsigned EstimatedVF = VF.getKnownMinValue();
  if (VF.isScalable() && TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
    auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
    auto Min = Attr.getVScaleRangeMin();
    EstimatedVF *= Min;
  }

  // When a scalar epilogue is required, at least one iteration of the scalar
  // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
  // max VF that results in a dead vector loop.
  if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
    MaxTripCount -= 1;

  // When the user specifies an interleave count, we need to ensure that
  // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
  unsigned IC = UserIC > 0 ? UserIC : 1;
  unsigned EstimatedVFTimesIC = EstimatedVF * IC;

  if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
      (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
    // If upper bound loop trip count (TC) is known at compile time there is no
    // point in choosing VF greater than TC / IC (as done in the loop below).
    // Select maximum power of two which doesn't exceed TC / IC. If VF is
    // scalable, we only fall back on a fixed VF when the TC is less than or
    // equal to the known number of lanes.
    auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC);
    // Guard against MaxTripCount / IC == 0; a zero ElementCount is invalid.
    if (ClampedUpperTripCount == 0)
      ClampedUpperTripCount = 1;
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count"
                      << (UserIC > 0 ? " divided by UserIC" : "") << ": "
                      << ClampedUpperTripCount << "\n");
    // Keep the result scalable only when folding the tail; otherwise a fixed
    // VF equal to the clamped trip count covers the loop exactly.
    return ElementCount::get(MinVal: ClampedUpperTripCount,
                             Scalable: FoldTailByMasking ? VF.isScalable() : false);
  }
  return VF;
}
3733
// Compute the maximum VF the target can support, starting from the widest
// register and the widest element type, optionally widened further when
// maximizing vector bandwidth, and clamped by the maximum trip count.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
    ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
  // Whether we search for a scalable or fixed-width maximum follows the kind
  // of the dependence-analysis bound MaxSafeVF.
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  const TypeSize WidestRegister = TTI.getRegisterBitWidth(
      K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                             : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
      Scalable: ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  // Not even one widest-type element fits in the widest register: fall back
  // to a scalar VF of 1.
  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(MinVal: 1);
  }

  ElementCount MaxVF = clampVFByMaxTripCount(
      VF: MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking);
  // If the MaxVF was already clamped, there's no point in trying to pick a
  // larger one.
  if (MaxVF != MaxVectorElementCount)
    return MaxVF;

  TargetTransformInfo::RegisterKind RegKind =
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector;

  // Record the largest permissible VF before max-bandwidth widening; it is
  // later used to limit register-pressure checks to VFs that only exist
  // because of max-bandwidth (see shouldConsiderRegPressureForVF).
  if (MaxVF.isScalable())
    MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
  else
    MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;

  if (useMaxBandwidth(RegKind)) {
    // When maximizing bandwidth, size the VF by the smallest element type so
    // that narrow elements fully populate the widest register.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
        Scalable: ComputeScalableMaxVF);
    MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Honor the target's minimum VF for the smallest element type, if any.
    if (ElementCount MinVF =
            TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }

    MaxVF =
        clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC, FoldTailByMasking);

    if (MaxVectorElementCount != MaxVF) {
      // Invalidate any widening decisions we might have made, in case the loop
      // requires prediction (decided later), but we have already made some
      // load/store widening decisions.
      invalidateCostModelingDecisions();
    }
  }
  return MaxVF;
}
3808
// Returns true if vectorization factor A is more profitable than B, comparing
// per-lane costs (and, when a constant max trip count is known, whole-loop
// costs including the scalar remainder).
bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
                                                const VectorizationFactor &B,
                                                const unsigned MaxTripCount,
                                                bool HasTail,
                                                bool IsEpilogue) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
    if (A.Width.isScalable())
      EstimatedWidthA *= *VScale;
    if (B.Width.isScalable())
      EstimatedWidthB *= *VScale;
  }

  // When optimizing for size choose whichever is smallest, which will be the
  // one with the smallest cost for the whole loop. On a tie pick the larger
  // vector width, on the assumption that throughput will be greater.
  if (CM.CostKind == TTI::TCK_CodeSize)
    return CostA < CostB ||
           (CostA == CostB && EstimatedWidthA > EstimatedWidthB);

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
                        A.Width.isScalable() && !B.Width.isScalable();

  // When scalable A is preferred, it also wins cost ties (<=); otherwise A
  // must be strictly cheaper (<).
  auto CmpFn = [PreferScalable](const InstructionCost &LHS,
                                const InstructionCost &RHS) {
    return PreferScalable ? LHS <= RHS : LHS < RHS;
  };

  // To avoid the need for FP division:
  //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
  // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
  bool LowerCostWithoutTC =
      CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
  if (!MaxTripCount)
    return LowerCostWithoutTC;

  auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
                                              InstructionCost VectorCost,
                                              InstructionCost ScalarCost) {
    // If the trip count is a known (possibly small) constant, the trip count
    // will be rounded up to an integer number of iterations under
    // FoldTailByMasking. The total cost in that case will be
    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
    // some extra overheads, but for the purpose of comparing the costs of
    // different VFs we can use this to compare the total loop-body cost
    // expected after vectorization.
    if (HasTail)
      return VectorCost * (MaxTripCount / VF) +
             ScalarCost * (MaxTripCount % VF);
    return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
  };

  auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
  auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
  bool LowerCostWithTC = CmpFn(RTCostA, RTCostB);
  LLVM_DEBUG(if (LowerCostWithTC != LowerCostWithoutTC) {
    dbgs() << "LV: VF " << (LowerCostWithTC ? A.Width : B.Width)
           << " has lower cost than VF "
           << (LowerCostWithTC ? B.Width : A.Width)
           << " when taking the cost of the remaining scalar loop iterations "
              "into consideration for a maximum trip count of "
           << MaxTripCount << ".\n";
  });
  return LowerCostWithTC;
}
3883
3884bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3885 const VectorizationFactor &B,
3886 bool HasTail,
3887 bool IsEpilogue) const {
3888 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3889 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3890 IsEpilogue);
3891}
3892
// Emit optimization remarks for recipes whose VPlan-based cost is invalid at
// some VF, grouping the VFs per recipe so each recipe gets a single remark.
void LoopVectorizationPlanner::emitInvalidCostRemarks(
    OptimizationRemarkEmitter *ORE) {
  using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
  SmallVector<RecipeVFPair> InvalidCosts;
  // Collect every (recipe, VF) pair with an invalid cost across all plans.
  for (const auto &Plan : VPlans) {
    for (ElementCount VF : Plan->vectorFactors()) {
      // The VPlan-based cost model is designed for computing vector cost.
      // Querying VPlan-based cost model with a scalar VF will cause some
      // errors because we expect the VF is vector for most of the widen
      // recipes.
      if (VF.isScalar())
        continue;

      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      precomputeCosts(Plan&: *Plan, VF, CostCtx);
      auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
        for (auto &R : *VPBB) {
          if (!R.cost(VF, Ctx&: CostCtx).isValid())
            InvalidCosts.emplace_back(Args: &R, Args&: VF);
        }
      }
    }
  }
  if (InvalidCosts.empty())
    return;

  // Emit a report of VFs with invalid costs in the loop.

  // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
  DenseMap<VPRecipeBase *, unsigned> Numbering;
  unsigned I = 0;
  for (auto &Pair : InvalidCosts)
    if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
      ++I;

  // Sort the list, first on recipe(number) then on VF.
  sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
    unsigned NA = Numbering[A.first];
    unsigned NB = Numbering[B.first];
    if (NA != NB)
      return NA < NB;
    return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
  });

  // For a list of ordered recipe-VF pairs:
  //   [(load, VF1), (load, VF2), (store, VF1)]
  // group the recipes together to emit separate remarks for:
  //   load  (VF1, VF2)
  //   store (VF1)
  auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
  auto Subset = ArrayRef<RecipeVFPair>();
  do {
    if (Subset.empty())
      Subset = Tail.take_front(N: 1);

    VPRecipeBase *R = Subset.front().first;

    // Map the recipe back to an IR-level opcode for reporting purposes.
    unsigned Opcode =
        TypeSwitch<const VPRecipeBase *, unsigned>(R)
            .Case(caseFn: [](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
            .Case(
                caseFn: [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
            .Case(caseFn: [](const VPWidenLoadRecipe *R) { return Instruction::Load; })
            .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
                caseFn: [](const auto *R) { return Instruction::Call; })
            .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                  VPWidenCastRecipe>(
                caseFn: [](const auto *R) { return R->getOpcode(); })
            .Case(caseFn: [](const VPInterleaveRecipe *R) {
              return R->getStoredValues().empty() ? Instruction::Load
                                                  : Instruction::Store;
            })
            .Case(caseFn: [](const VPReductionRecipe *R) {
              return RecurrenceDescriptor::getOpcode(Kind: R->getRecurrenceKind());
            });

    // If the next recipe is different, or if there are no other pairs,
    // emit a remark for the collated subset. e.g.
    //   [(load, VF1), (load, VF2))]
    // to emit:
    //  remark: invalid costs for 'load' at VF=(VF1, VF2)
    if (Subset == Tail || Tail[Subset.size()].first != R) {
      std::string OutString;
      raw_string_ostream OS(OutString);
      assert(!Subset.empty() && "Unexpected empty range");
      OS << "Recipe with invalid costs prevented vectorization at VF=(";
      for (const auto &Pair : Subset)
        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
      OS << "):";
      if (Opcode == Instruction::Call) {
        // Name the callee: intrinsic name, called scalar function, or the
        // callee operand (always the last operand) for other call recipes.
        StringRef Name = "";
        if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
          Name = Int->getIntrinsicName();
        } else {
          auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
          Function *CalledFn =
              WidenCall ? WidenCall->getCalledScalarFunction()
                        : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
                                             ->getLiveInIRValue());
          Name = CalledFn->getName();
        }
        OS << " call to " << Name;
      } else
        OS << " " << Instruction::getOpcodeName(Opcode);
      reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
                              DL: R->getDebugLoc());
      Tail = Tail.drop_front(N: Subset.size());
      Subset = {};
    } else
      // Grow the subset by one element
      Subset = Tail.take_front(N: Subset.size() + 1);
  } while (!Tail.empty());
}
4008
4009/// Check if any recipe of \p Plan will generate a vector value, which will be
4010/// assigned a vector register.
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
                                const TargetTransformInfo &TTI) {
  assert(VF.isVector() && "Checking a scalar VF?");
  VPTypeAnalysis TypeInfo(Plan);
  // Ephemeral recipes (feeding only assumes) never materialize real vectors.
  DenseSet<VPRecipeBase *> EphemeralRecipes;
  collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
  // Set of already visited types.
  DenseSet<Type *> Visited;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      if (EphemeralRecipes.contains(V: &R))
        continue;
      // Continue early if the recipe is considered to not produce a vector
      // result. Note that this includes VPInstruction where some opcodes may
      // produce a vector, to preserve existing behavior as VPInstructions model
      // aspects not directly mapped to existing IR instructions.
      switch (R.getVPRecipeID()) {
      // Recipes that only produce scalar values (or no values at all).
      case VPRecipeBase::VPDerivedIVSC:
      case VPRecipeBase::VPScalarIVStepsSC:
      case VPRecipeBase::VPReplicateSC:
      case VPRecipeBase::VPInstructionSC:
      case VPRecipeBase::VPCanonicalIVPHISC:
      case VPRecipeBase::VPCurrentIterationPHISC:
      case VPRecipeBase::VPVectorPointerSC:
      case VPRecipeBase::VPVectorEndPointerSC:
      case VPRecipeBase::VPExpandSCEVSC:
      case VPRecipeBase::VPPredInstPHISC:
      case VPRecipeBase::VPBranchOnMaskSC:
        continue;
      // Recipes that may produce a vector value; checked below.
      case VPRecipeBase::VPReductionSC:
      case VPRecipeBase::VPActiveLaneMaskPHISC:
      case VPRecipeBase::VPWidenCallSC:
      case VPRecipeBase::VPWidenCanonicalIVSC:
      case VPRecipeBase::VPWidenCastSC:
      case VPRecipeBase::VPWidenGEPSC:
      case VPRecipeBase::VPWidenIntrinsicSC:
      case VPRecipeBase::VPWidenSC:
      case VPRecipeBase::VPBlendSC:
      case VPRecipeBase::VPFirstOrderRecurrencePHISC:
      case VPRecipeBase::VPHistogramSC:
      case VPRecipeBase::VPWidenPHISC:
      case VPRecipeBase::VPWidenIntOrFpInductionSC:
      case VPRecipeBase::VPWidenPointerInductionSC:
      case VPRecipeBase::VPReductionPHISC:
      case VPRecipeBase::VPInterleaveEVLSC:
      case VPRecipeBase::VPInterleaveSC:
      case VPRecipeBase::VPWidenLoadEVLSC:
      case VPRecipeBase::VPWidenLoadSC:
      case VPRecipeBase::VPWidenStoreEVLSC:
      case VPRecipeBase::VPWidenStoreSC:
        break;
      default:
        llvm_unreachable("unhandled recipe");
      }

      // A type counts as "vectorized" when legalizing the vectorized type for
      // the target still leaves parts wider than one element.
      auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
        unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
        if (!NumLegalParts)
          return false;
        if (VF.isScalable()) {
          // <vscale x 1 x iN> is assumed to be profitable over iN because
          // scalable registers are a distinct register class from scalar
          // ones. If we ever find a target which wants to lower scalable
          // vectors back to scalars, we'll need to update this code to
          // explicitly ask TTI about the register class uses for each part.
          return NumLegalParts <= VF.getKnownMinValue();
        }
        // Two or more elements that share a register - are vectorized.
        return NumLegalParts < VF.getFixedValue();
      };

      // If no def nor is a store, e.g., branches, continue - no value to check.
      if (R.getNumDefinedValues() == 0 &&
          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(Val: &R))
        continue;
      // For multi-def recipes, currently only interleaved loads, suffice to
      // check first def only.
      // For stores check their stored value; for interleaved stores suffice
      // the check first stored value only. In all cases this is the second
      // operand.
      VPValue *ToCheck =
          R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
      Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
      // Each scalar type only needs to be checked once.
      if (!Visited.insert(V: {ScalarTy}).second)
        continue;
      Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
      if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
        return true;
    }
  }

  return false;
}
4105
4106static bool hasReplicatorRegion(VPlan &Plan) {
4107 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4108 G: Plan.getVectorLoopRegion()->getEntry())),
4109 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4110}
4111
4112#ifndef NDEBUG
// Select the most profitable vectorization factor among all candidate VFs in
// all VPlans, comparing each candidate's (legacy + VPlan-adjusted) cost
// against the scalar baseline. Debug-build only (legacy cost-model path).
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
  // Baseline: cost of executing the loop fully scalar (VF = 1).
  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(
      any_of(VPlans,
             [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
      "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization &&
      (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is only computed when at least one candidate VF needs a
    // pressure check, since the computation is not free.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (any_of(VFs, [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The cost for scalar VF=1 is already calculated, so ignore it.
      if (VF.isScalar())
        continue;

      InstructionCost C = CM.expectedCost(VF);

      // Add on other costs that are modelled in VPlan, but not in the legacy
      // cost model.
      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
      assert(VectorRegion && "Expected to have a vector region!");
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
               vp_depth_first_shallow(VectorRegion->getEntry()))) {
        for (VPRecipeBase &R : *VPBB) {
          auto *VPI = dyn_cast<VPInstruction>(&R);
          if (!VPI)
            continue;
          switch (VPI->getOpcode()) {
          // Selects are only modelled in the legacy cost model for safe
          // divisors.
          case Instruction::Select: {
            // Skip selects that guard a division/remainder; those are already
            // accounted for by the legacy model.
            if (auto *WR =
                    dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
              switch (WR->getOpcode()) {
              case Instruction::UDiv:
              case Instruction::SDiv:
              case Instruction::URem:
              case Instruction::SRem:
                continue;
              default:
                break;
              }
            }
            C += VPI->cost(VF, CostCtx);
            break;
          }
          case VPInstruction::ActiveLaneMask: {
            // The lane mask spans VF * Multiplier lanes; cost at that width.
            unsigned Multiplier =
                cast<VPConstantInt>(VPI->getOperand(2))->getZExtValue();
            C += VPI->cost(VF * Multiplier, CostCtx);
            break;
          }
          case VPInstruction::ExplicitVectorLength:
          case VPInstruction::AnyOf:
            C += VPI->cost(VF, CostCtx);
            break;
          default:
            break;
          }
        }
      }

      // Add the cost of any spills due to excess register usage
      if (CM.shouldConsiderRegPressureForVF(VF))
        C += RUs[I].spillCost(CostCtx, ForceTargetNumVectorRegs);

      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
      unsigned Width =
          estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                        << " costs: " << (Candidate.Cost / Width));
      if (VF.isScalable())
        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                          << CM.getVScaleForTuning().value_or(1) << ")");
      LLVM_DEBUG(dbgs() << ".\n");

      // Reject candidates that would not emit any actual vector code, unless
      // the user forced vectorization.
      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }

      // Replicated (predicated scalar) blocks are not allowed under optsize.
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
        ChosenFactor = Candidate;
    }
  }

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost,
                                   !CM.foldTailByMasking())) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  return ChosenFactor;
}
4251#endif
4252
4253/// Returns true if the VPlan contains a VPReductionPHIRecipe with
4254/// FindLast recurrence kind.
4255static bool hasFindLastReductionPhi(VPlan &Plan) {
4256 return any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4257 P: [](VPRecipeBase &R) {
4258 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4259 return RedPhi &&
4260 RecurrenceDescriptor::isFindLastRecurrenceKind(
4261 Kind: RedPhi->getRecurrenceKind());
4262 });
4263}
4264
4265/// Returns true if the VPlan contains header phi recipes that are not currently
4266/// supported for epilogue vectorization.
4267static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan) {
4268 return any_of(
4269 Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4270 P: [](VPRecipeBase &R) {
4271 if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R))
4272 return !WidenInd->getPHINode();
4273 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4274 return RedPhi && (RecurrenceDescriptor::isFindLastRecurrenceKind(
4275 Kind: RedPhi->getRecurrenceKind()) ||
4276 !RedPhi->getUnderlyingValue());
4277 });
4278}
4279
// Returns true if the loop (with main-loop VF \p VF) has no features that are
// currently unsupported by the epilogue-vectorization transformation.
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
    ElementCount VF) const {
  // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
  // reductions need special handling and are currently unsupported.
  if (any_of(Range: OrigLoop->getHeader()->phis(), P: [&](PHINode &Phi) {
        if (!Legal->isReductionVariable(PN: &Phi))
          return Legal->isFixedOrderRecurrence(Phi: &Phi);
        RecurKind Kind =
            Legal->getRecurrenceDescriptor(PN: &Phi).getRecurrenceKind();
        return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
      }))
    return false;

  // FindLast reductions and inductions without underlying PHI require special
  // handling and are currently not supported for epilogue vectorization.
  if (hasUnsupportedHeaderPhiRecipe(Plan&: getPlanFor(VF)))
    return false;

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
  for (const auto &Entry : Legal->getInductionVars()) {
    // Look for uses of the value of the induction at the last iteration.
    Value *PostInc =
        Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
    for (User *U : PostInc->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
    // Look for uses of penultimate value of the induction.
    for (User *U : Entry.first->users())
      if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
        return false;
  }

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs auditing and
  // testing.
  // TODO: Add support for loops with an early exit.
  if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
    return false;

  return true;
}
4322
4323bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4324 const ElementCount VF, const unsigned IC) const {
4325 // FIXME: We need a much better cost-model to take different parameters such
4326 // as register pressure, code size increase and cost of extra branches into
4327 // account. For now we apply a very crude heuristic and only consider loops
4328 // with vectorization factors larger than a certain value.
4329
4330 // Allow the target to opt out.
4331 if (!TTI.preferEpilogueVectorization(Iters: VF * IC))
4332 return false;
4333
4334 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4335 ? EpilogueVectorizationMinVF
4336 : TTI.getEpilogueVectorizationMinVF();
4337 return estimateElementCount(VF: VF * IC, VScale: VScaleForTuning) >= MinVFThreshold;
4338}
4339
// Select a vectorization factor for the epilogue loop, given the main loop's
// VF and interleave count. Returns VectorizationFactor::Disabled() when
// epilogue vectorization is disabled, unsupported, or not profitable.
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  // A scalar epilogue must exist for there to be anything to vectorize.
  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

  // A forced epilogue VF bypasses the profitability checks below.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
    if (hasPlanWithVF(VF: ForcedEC))
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                         "viable.\n");
    return Result;
  }

  if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                      "this loop\n");
    return Result;
  }

  // Check if a plan's vector loop processes fewer iterations than VF (e.g.
  // when interleave groups have been narrowed by narrowInterleaveGroups) and
  // return the adjusted, effective VF.
  using namespace VPlanPatternMatch;
  auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount {
    auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock();
    // A canonical IV stepped by UF (rather than VF * UF) means the loop only
    // consumes one element per part each iteration.
    if (match(V: &Exiting->back(),
              P: m_BranchOnCount(Op0: m_Add(Op0: m_CanonicalIV(), Op1: m_Specific(VPV: &Plan.getUF())),
                               Op1: m_VPValue())))
      return ElementCount::get(MinVal: 1, Scalable: VF.isScalable());
    return VF;
  };

  // Check if the main loop processes fewer than MainLoopVF elements per
  // iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF
  // as needed.
  VPlan &MainPlan = getPlanFor(VF: MainLoopVF);
  MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF);

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
      MinVal: estimateElementCount(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));

  Type *TCType = Legal->getWidestInductionType();
  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
  const SCEV *TC = vputils::getSCEVExprForVPValue(V: MainPlan.getTripCount(), PSE);
  assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
  const SCEV *KnownMinTC;
  // Detect a trip count of the form KnownMinTC * vscale.
  bool ScalableTC = match(S: TC, P: m_scev_c_Mul(Op0: m_SCEV(V&: KnownMinTC), Op1: m_SCEVVScale()));
  bool ScalableRemIter = false;
  ScalarEvolution &SE = *PSE.getSE();
  // Use versions of TC and VF in which both are either scalable or fixed.
  if (ScalableTC == MainLoopVF.isScalable()) {
    ScalableRemIter = ScalableTC;
    RemainingIterations =
        SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
  } else if (ScalableTC) {
    // Scalable TC but fixed VF: estimate the TC using the tuning vscale.
    const SCEV *EstimatedTC = SE.getMulExpr(
        LHS: KnownMinTC,
        RHS: SE.getConstant(Ty: TCType, V: CM.getVScaleForTuning().value_or(u: 1)));
    RemainingIterations = SE.getURemExpr(
        LHS: EstimatedTC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
  } else
    // Fixed TC but scalable VF: estimate the VF using the tuning vscale.
    RemainingIterations =
        SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: EstimatedRuntimeVF * IC));

  // No iterations left to process in the epilogue.
  if (RemainingIterations->isZero())
    return Result;

  if (MainLoopVF.isFixed()) {
    // The epilogue runs at most VF * IC - 1 iterations; tighten the bound when
    // SCEV proves the remainder is smaller than that.
    MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
    if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
                            RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
      MaxTripCount = SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
    }
    LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
                      << MaxTripCount << "\n");
  }

  auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
    return SE.isKnownPredicate(Pred: CmpInst::ICMP_UGT, LHS: VF, RHS: RemIter);
  };
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(VF: NextVF.Width))
      continue;

    ElementCount EffectiveVF =
        GetEffectiveVF(getPlanFor(VF: NextVF.Width), NextVF.Width);
    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(LHS: EffectiveVF, RHS: EstimatedRuntimeVF)) ||
        (EffectiveVF.isScalable() &&
         ElementCount::isKnownGE(LHS: EffectiveVF, RHS: MainLoopVF)) ||
        (!EffectiveVF.isScalable() && !MainLoopVF.isScalable() &&
         ElementCount::isKnownGT(LHS: EffectiveVF, RHS: MainLoopVF)))
      continue;

    // If EffectiveVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors. If the epilogue plan
    // also has narrowed interleave groups, use the effective VF since
    // the epilogue step will be reduced to its IC.
    // TODO: We should also consider comparing against a scalable
    // RemainingIterations when SCEV be able to evaluate non-canonical
    // vscale-based expressions.
    if (!ScalableRemIter) {
      // Handle the case where EffectiveVF and RemainingIterations are in
      // different numerical spaces.
      if (EffectiveVF.isScalable())
        EffectiveVF = ElementCount::getFixed(
            MinVal: estimateElementCount(VF: EffectiveVF, VScale: CM.getVScaleForTuning()));
      if (SkipVF(SE.getElementCount(Ty: TCType, EC: EffectiveVF), RemainingIterations))
        continue;
    }

    if (Result.Width.isScalar() ||
        isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking(),
                         /*IsEpilogue*/ true))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
4496
4497std::pair<unsigned, unsigned>
4498LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4499 unsigned MinWidth = -1U;
4500 unsigned MaxWidth = 8;
4501 const DataLayout &DL = TheFunction->getDataLayout();
4502 // For in-loop reductions, no element types are added to ElementTypesInLoop
4503 // if there are no loads/stores in the loop. In this case, check through the
4504 // reduction variables to determine the maximum width.
4505 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4506 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4507 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4508 // When finding the min width used by the recurrence we need to account
4509 // for casts on the input operands of the recurrence.
4510 MinWidth = std::min(
4511 a: MinWidth,
4512 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4513 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4514 MaxWidth = std::max(a: MaxWidth,
4515 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4516 }
4517 } else {
4518 for (Type *T : ElementTypesInLoop) {
4519 MinWidth = std::min<unsigned>(
4520 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4521 MaxWidth = std::max<unsigned>(
4522 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4523 }
4524 }
4525 return {MinWidth, MaxWidth};
4526}
4527
4528void LoopVectorizationCostModel::collectElementTypesForWidening() {
4529 ElementTypesInLoop.clear();
4530 // For each block.
4531 for (BasicBlock *BB : TheLoop->blocks()) {
4532 // For each instruction in the loop.
4533 for (Instruction &I : *BB) {
4534 Type *T = I.getType();
4535
4536 // Skip ignored values.
4537 if (ValuesToIgnore.count(Ptr: &I))
4538 continue;
4539
4540 // Only examine Loads, Stores and PHINodes.
4541 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4542 continue;
4543
4544 // Examine PHI nodes that are reduction variables. Update the type to
4545 // account for the recurrence type.
4546 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4547 if (!Legal->isReductionVariable(PN))
4548 continue;
4549 const RecurrenceDescriptor &RdxDesc =
4550 Legal->getRecurrenceDescriptor(PN);
4551 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4552 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4553 Ty: RdxDesc.getRecurrenceType()))
4554 continue;
4555 T = RdxDesc.getRecurrenceType();
4556 }
4557
4558 // Examine the stored values.
4559 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4560 T = ST->getValueOperand()->getType();
4561
4562 assert(T->isSized() &&
4563 "Expected the load/store/recurrence type to be sized");
4564
4565 ElementTypesInLoop.insert(Ptr: T);
4566 }
4567 }
4568}
4569
/// Select the interleave count for the loop described by \p Plan when
/// vectorizing with factor \p VF. \p LoopCost is the previously computed cost
/// of a single loop iteration at \p VF, or zero if it has not been computed
/// yet (in which case it is recomputed here). Returns 1 when interleaving is
/// not worthwhile or not supported for this loop.
unsigned
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                                InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave tail-folded loops if wide lane masks are requested, as the
  // overhead of multiple instructions to calculate the predicate is likely
  // not beneficial. If a scalar epilogue is not allowed for any other reason,
  // do not interleave.
  if (!CM.isScalarEpilogueAllowed() &&
      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
    return 1;

  if (any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPCurrentIterationPHIRecipe>)) {
    LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
                         "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Plan.hasEarlyExit())
    return 1;

  const bool HasReductions =
      any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPReductionPHIRecipe>);

  // FIXME: implement interleaving for FindLast transform correctly.
  if (hasFindLastReductionPhi(Plan))
    return 1;

  // Estimate the per-register-class usage of the plan at VF; this is used
  // below to bound the interleave count so interleaving does not cause spills.
  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore: CM.ValuesToIgnore)[0];

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    if (VF.isScalar())
      LoopCost = CM.expectedCost(VF);
    else
      LoopCost = cost(Plan, VF, RU: &R);
    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(a: Pair.second, b: 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Key: Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
                                    MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(a: 1U, b: (MaxLocalUsers - 1)));
    }

    IC = std::min(a: IC, b: TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
  LLVM_DEBUG(dbgs() << "LV: MaxInterleaveFactor for the target is "
                    << MaxInterleaveCount << "\n");

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  auto BestKnownTC =
      getSmallBestKnownTC(PSE, L: OrigLoop,
                          /*CanUseConstantMax=*/true,
                          /*CanExcludeZeroTrips=*/CM.isScalarEpilogueAllowed());

  // For fixed length VFs treat a scalable trip count as unknown.
  if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
    // Re-evaluate trip counts and VFs to be in the same numerical space.
    unsigned AvailableTC =
        estimateElementCount(VF: *BestKnownTC, VScale: CM.getVScaleForTuning());
    unsigned EstimatedVF = estimateElementCount(VF, VScale: CM.getVScaleForTuning());

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    if (CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()))
      --AvailableTC;

    unsigned InterleaveCountLB = bit_floor(Value: std::max(
        a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));

    if (getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(Value: std::max(
          a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce same scalar tail, maximize the IC to do the same work
        // in fewer vector loop iterations
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(a: 1u, b: IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(Range: OrigLoop->blocks(), P: [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleave =
      TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
                                    Value: SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = 0;
    unsigned NumLoads = 0;
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
      for (VPRecipeBase &R : *VPBB) {
        if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: &R)) {
          NumLoads++;
          continue;
        }
        if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(Val: &R)) {
          NumStores++;
          continue;
        }

        if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R)) {
          // An interleave group counts once per member it loads or stores.
          if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
            NumStores += StoreOps;
          else
            NumLoads += InterleaveR->getNumDefinedValues();
          continue;
        }
        if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
          NumLoads += isa<LoadInst>(Val: RepR->getUnderlyingInstr());
          NumStores += isa<StoreInst>(Val: RepR->getUnderlyingInstr());
          continue;
        }
        if (isa<VPHistogramRecipe>(Val: &R)) {
          // A histogram performs both a load and a store.
          NumLoads++;
          NumStores++;
          continue;
        }
      }
    }
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
               P: [](VPRecipeBase &R) {
                 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
                 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()) ||
                                 RecurrenceDescriptor::isFindIVRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()));
               });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && OrigLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
                 P: [](VPRecipeBase &R) {
                   auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);

                   return RedR && RedR->isOrdered();
                 });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(a: SmallIC, b: F);
      StoresIC = std::min(a: StoresIC, b: F);
      LoadsIC = std::min(a: LoadsIC, b: F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(a: StoresIC, b: LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleave) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(a: IC / 2, b: SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleave) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4915
4916bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4917 ElementCount VF) {
4918 // TODO: Cost model for emulated masked load/store is completely
4919 // broken. This hack guides the cost model to use an artificially
4920 // high enough value to practically disable vectorization with such
4921 // operations, except where previously deployed legality hack allowed
4922 // using very low cost values. This is to avoid regressions coming simply
4923 // from moving "masked load/store" check from legality to cost model.
4924 // Masked Load/Gather emulation was previously never allowed.
4925 // Limited number of Masked Store/Scatter emulation was allowed.
4926 assert((isPredicatedInst(I)) &&
4927 "Expecting a scalar emulated instruction");
4928 return isa<LoadInst>(Val: I) ||
4929 (isa<StoreInst>(Val: I) &&
4930 NumPredStores > NumberOfStoresToPredicate);
4931}
4932
/// Collect the instructions that are profitable to scalarize (rather than
/// leave if-converted) when vectorizing with \p VF. Decisions are recorded in
/// InstsToScalarize[VF]; blocks that will remain predicated after
/// vectorization are recorded in PredicatedBBsAfterVectorization[VF].
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  assert(VF.isVector() && "Expected VF >= 2");

  // If we've already collected the instructions to scalarize or the predicated
  // BBs after vectorization, there's nothing to do. Collection may already have
  // occurred if we have a user-selected VF and are now computing the expected
  // cost for interleaving.
  if (InstsToScalarize.contains(Key: VF) ||
      PredicatedBBsAfterVectorization.contains(Val: VF))
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(I: &I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        // of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
        if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
            !useEmulatedMaskMemRefHack(I: &I, VF) &&
            computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
          // A non-negative discount means the scalar form of the chain is no
          // more expensive than the vector form; commit the scalar costs.
          for (const auto &[I, IC] : ScalarCosts)
            ScalarCostsVF.insert(KV: {I, IC});
          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, Cost] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(Val: I);
            if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
              continue;
            CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
            CallWideningDecisions[{CI, VF}].Cost = Cost;
          }
        }
        // Remember that BB will remain after vectorization. Predecessors that
        // branch unconditionally into BB are recorded as predicated too.
        PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
        for (auto *Pred : predecessors(BB)) {
          if (Pred->getSingleSuccessor() == BB)
            PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
        }
      }
  }
}
4987
/// Compute the expected cost discount from scalarizing the single-use chain
/// of instructions feeding \p PredInst at \p VF: the sum over the chain of
/// (vector cost - probability-scaled scalar cost). Instructions that would be
/// scalarized are recorded in \p ScalarCosts with their scalar costs. A
/// non-negative result means scalarizing is at least as cheap as vectorizing.
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto CanBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Key: I))
      continue;

    // Cannot scalarize fixed-order recurrence phis at the moment.
    if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF);

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
      for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
        ScalarCost += TTI.getScalarizationOverhead(
            Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
            /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      // One PHI per scalarized lane merges the predicated results back in.
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
          for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
            ScalarCost += TTI.getScalarizationOverhead(
                Ty: cast<VectorType>(Val: VectorTy),
                DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
                /*Extract*/ true, CostKind);
          }
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5109
/// Return the expected cost of the whole loop at \p VF by summing the cost of
/// every non-ignored instruction in every block. For the scalar case (VF=1),
/// predicated blocks are scaled by their execution probability; for vector
/// VFs they are unconditionally executed after if-conversion. Honors the
/// ForceTargetInstructionCost override when set.
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
  InstructionCost Cost;

  // If the vector loop gets executed exactly once with the given VF, ignore the
  // costs of comparison and induction instructions, as they'll get simplified
  // away.
  SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
  auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
  if (TC == VF && !foldTailByMasking())
    addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
                                         InstsToIgnore&: ValuesToIgnoreForVF);

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    InstructionCost BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Skip ignored values.
      if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
          (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
        continue;

      InstructionCost C = getInstructionCost(I: &I, VF);

      // Check if we should override the cost.
      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
        // For interleave groups, use ForceTargetInstructionCost once for the
        // whole group.
        if (VF.isVector() && getWideningDecision(I: &I, VF) == CM_Interleave) {
          // Only the insert position carries the group's cost; the other
          // members of the group are free.
          if (getInterleavedAccessGroup(Instr: &I)->getInsertPos() == &I)
            C = InstructionCost(ForceTargetInstructionCost);
          else
            C = InstructionCost(0);
        } else {
          C = InstructionCost(ForceTargetInstructionCost);
        }
      }

      BlockCost += C;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
                        << VF << " For instruction: " << I << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it.
    // getPredBlockCostDivisor will return 1 for blocks that are only predicated
    // by the header mask when folding the tail.
    if (VF.isScalar())
      BlockCost /= getPredBlockCostDivisor(CostKind, BB);

    Cost += BlockCost;
  }

  return Cost;
}
5170
5171/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
5172/// according to isAddressSCEVForCost.
5173///
5174/// This SCEV can be sent to the Target in order to estimate the address
5175/// calculation cost.
5176static const SCEV *getAddressAccessSCEV(
5177 Value *Ptr,
5178 PredicatedScalarEvolution &PSE,
5179 const Loop *TheLoop) {
5180 const SCEV *Addr = PSE.getSCEV(V: Ptr);
5181 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L: TheLoop) ? Addr
5182 : nullptr;
5183}
5184
/// Compute the cost of scalarizing the memory instruction \p I at \p VF:
/// VF scalar address computations plus VF scalar memory operations, plus the
/// insert/extract overhead of moving values between vector and scalar form.
/// Predicated accesses are scaled by block probability and charged extra for
/// i1 extracts and conditional branches; emulated masked accesses may be
/// forced to a prohibitively high cost (see useEmulatedMaskMemRefHack).
/// Returns an invalid cost for scalable VFs, which cannot be scalarized.
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(V: I);
  Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  // that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
                                                  PtrTy, SE, Ptr: PtrSCEV, CostKind);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
  Cost += VF.getFixedValue() *
          TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy->getScalarType(), Alignment,
                              AddressSpace: AS, CostKind, OpdInfo: OpInfo);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Add the cost of an i1 extract and a branch
    auto *VecI1Ty =
        VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
    Cost += TTI.getScalarizationOverhead(
        Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
5244
5245InstructionCost
5246LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5247 ElementCount VF) {
5248 Type *ValTy = getLoadStoreType(I);
5249 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5250 Value *Ptr = getLoadStorePointerOperand(V: I);
5251 unsigned AS = getLoadStoreAddressSpace(I);
5252 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5253
5254 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5255 "Stride should be 1 or -1 for consecutive memory access");
5256 const Align Alignment = getLoadStoreAlignment(I);
5257 InstructionCost Cost = 0;
5258 if (isMaskRequired(I)) {
5259 unsigned IID = I->getOpcode() == Instruction::Load
5260 ? Intrinsic::masked_load
5261 : Intrinsic::masked_store;
5262 Cost += TTI.getMemIntrinsicInstrCost(
5263 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS), CostKind);
5264 } else {
5265 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5266 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5267 CostKind, OpdInfo: OpInfo, I);
5268 }
5269
5270 bool Reverse = ConsecutiveStride < 0;
5271 if (Reverse)
5272 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5273 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5274 return Cost;
5275}
5276
5277InstructionCost
5278LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5279 ElementCount VF) {
5280 assert(Legal->isUniformMemOp(*I, VF));
5281
5282 Type *ValTy = getLoadStoreType(I);
5283 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5284 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5285 const Align Alignment = getLoadStoreAlignment(I);
5286 unsigned AS = getLoadStoreAddressSpace(I);
5287 if (isa<LoadInst>(Val: I)) {
5288 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5289 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5290 CostKind) +
5291 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5292 SrcTy: VectorTy, Mask: {}, CostKind);
5293 }
5294 StoreInst *SI = cast<StoreInst>(Val: I);
5295
5296 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5297 // TODO: We have existing tests that request the cost of extracting element
5298 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5299 // the actual generated code, which involves extracting the last element of
5300 // a scalable vector where the lane to extract is unknown at compile time.
5301 InstructionCost Cost =
5302 TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5303 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, CostKind);
5304 if (!IsLoopInvariantStoreValue)
5305 Cost += TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
5306 Val: VectorTy, CostKind, Index: 0);
5307 return Cost;
5308}
5309
5310InstructionCost
5311LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5312 ElementCount VF) {
5313 Type *ValTy = getLoadStoreType(I);
5314 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5315 const Align Alignment = getLoadStoreAlignment(I);
5316 Value *Ptr = getLoadStorePointerOperand(V: I);
5317 Type *PtrTy = Ptr->getType();
5318
5319 if (!Legal->isUniform(V: Ptr, VF))
5320 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
5321
5322 unsigned IID = I->getOpcode() == Instruction::Load
5323 ? Intrinsic::masked_gather
5324 : Intrinsic::masked_scatter;
5325 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5326 TTI.getMemIntrinsicInstrCost(
5327 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Ptr, isMaskRequired(I),
5328 Alignment, I),
5329 CostKind);
5330}
5331
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // Costs the whole interleave group containing \p I as a single wide memory
  // operation plus the shuffles needed to (de)interleave its members.
  const auto *Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Fail to get an interleaved access group.");

  // The group is costed using the opcode/type of its insert position.
  Instruction *InsertPos = Group->getInsertPos();
  Type *ValTy = getLoadStoreType(I: InsertPos);
  auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
  unsigned AS = getLoadStoreAddressSpace(I: InsertPos);

  // The wide access spans VF elements for each of the group's member slots.
  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(Index: IF))
      Indices.push_back(Elt: IF);

  // Calculate the cost of the whole interleaved group.
  // Gaps must be masked when the wide access may touch memory past the
  // original scalar accesses (no scalar epilogue allowed), or for stores
  // with missing members.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(Val: I) && !Group->isFull());
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
      Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    // Each present member additionally needs its lanes reversed.
    Cost += Group->getNumMembers() *
            TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
                               SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
  }
  return Cost;
}
5370
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                    ElementCount VF,
                                                    Type *Ty) const {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Val: Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  // reduce(mul(ext(A), ext(B))) or
  // reduce(mul(A, B)) or
  // reduce(ext(A)) or
  // reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  Instruction *RetI = I;
  // Step over a single-user extend to reach the mul/add it feeds.
  if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
    if (!RetI->hasOneUser())
      return std::nullopt;
    RetI = RetI->user_back();
  }

  // Step over a single-use mul whose user is the reduction add.
  if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
  if (!LastChain)
    return std::nullopt;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(Val: ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);

  const RecurrenceDescriptor &RdxDesc =
      Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));

  // Base cost: the plain (non-fused) reduction of VectorTy.
  InstructionCost BaseCost;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
    Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
    BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
                                          FMF: RdxDesc.getFastMathFlags(), CostKind);
  } else {
    BaseCost = TTI.getArithmeticReductionCost(
        Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
  }

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RK == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
                           ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
                           : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));

  VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
      match(V: RedOp,
            P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
      match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
      !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Val: Op0);
    auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
    auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);

    InstructionCost RedCost = TTI.getMulAccReductionCost(
        IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        CostKind);

    // Prefer the fused mul-acc reduction only if it beats the component sum.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
             !TheLoop->isLoopInvariant(V: RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
    auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
    InstructionCost RedCost = TTI.getExtendedReductionCost(
        Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        FMF: RdxDesc.getFastMathFlags(), CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
             match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
    if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Val: Op0);
      Type *Op0Ty = Op0->getOperand(i: 0)->getType();
      Type *Op1Ty = Op1->getOperand(i: 0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);

      // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
          CostKind);
      // The smaller operand needs one more extend up to the largest type.
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
            Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
            CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
      // Matched reduce.add(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned: true, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No profitable fused pattern was found: the root reduction instruction
  // gets the base cost, other pattern members use normal costing.
  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
5555
5556InstructionCost
5557LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5558 ElementCount VF) {
5559 // Calculate scalar cost only. Vectorization cost should be ready at this
5560 // moment.
5561 if (VF.isScalar()) {
5562 Type *ValTy = getLoadStoreType(I);
5563 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5564 const Align Alignment = getLoadStoreAlignment(I);
5565 unsigned AS = getLoadStoreAddressSpace(I);
5566
5567 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5568 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5569 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5570 OpdInfo: OpInfo, I);
5571 }
5572 return getWideningCost(I, VF);
5573}
5574
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
                                                     ElementCount VF) const {
  // Estimates the insert/extract overhead of executing \p I as VF scalar
  // operations inside a vectorized loop: packing the scalar results into a
  // vector and unpacking vector operands into scalars.

  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  if (VF.isScalar())
    return 0;

  InstructionCost Cost = 0;
  Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
  if (!RetTy->isVoidTy() &&
      (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {

    TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None;
    if (isa<LoadInst>(Val: I))
      VIC = TTI::VectorInstrContext::Load;
    else if (isa<StoreInst>(Val: I))
      VIC = TTI::VectorInstrContext::Store;

    // Cost of inserting each scalar result into the vectorized return value.
    for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
      Cost += TTI.getScalarizationOverhead(
          Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
          /*Insert=*/true, /*Extract=*/false, CostKind,
          /*ForPoisonSrc=*/true, VL: {}, VIC);
    }
  }

  // Some targets keep addresses scalar.
  if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
    return Cost;

  // Some targets support efficient element stores.
  if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
    return Cost;

  // Collect operands to consider.
  CallInst *CI = dyn_cast<CallInst>(Val: I);
  Instruction::op_range Ops = CI ? CI->args() : I->operands();

  // Skip operands that do not require extraction/scalarization and do not incur
  // any overhead.
  SmallVector<Type *> Tys;
  for (auto *V : filterExtractingOperands(Ops, VF))
    Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));

  TTI::VectorInstrContext OperandVIC = isa<StoreInst>(Val: I)
                                           ? TTI::VectorInstrContext::Store
                                           : TTI::VectorInstrContext::None;
  return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC: OperandVIC);
}
5629
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // For every memory instruction in the loop, decide how it will be widened
  // at this VF (widen, widen-reverse, interleave, gather/scatter, scalarize)
  // by comparing the costs of the legal options, and record the decision.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't necessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF) ?
          getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
                                 : InstructionCost::getInvalid();

        // Choose better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        const auto *Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      // Gather/scatter and scalarization costs are per member, so scale by
      // the number of accesses in the group for a fair comparison.
      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (const auto *Group = getInterleavedAccessGroup(Instr: &I)) {
        if (Decision == CM_Scalarize) {
          for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
            if (auto *I = Group->getMember(Index: Idx)) {
              setWideningDecision(I, VF, W: Decision,
                                  Cost: getMemInstScalarizationCost(I, VF));
            }
          }
        } else {
          setWideningDecision(Grp: Group, VF, W: Decision, Cost);
        }
      } else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // (Transitive closure over in-loop operands, stopping at PHIs.)
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if (TheLoop->contains(Inst: InstOp) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
    // If there are direct memory op users of the newly scalarized load,
    // their cost may have changed because there's no scalarization
    // overhead for the operand. Update it.
    for (User *U : LI->users()) {
      if (!isa<LoadInst, StoreInst>(Val: U))
        continue;
      if (getWideningDecision(I: cast<Instruction>(Val: U), VF) != CM_Scalarize)
        continue;
      setWideningDecision(
          I: cast<Instruction>(Val: U), VF, W: CM_Scalarize,
          Cost: getMemInstScalarizationCost(I: cast<Instruction>(Val: U), VF));
    }
  };
  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (!isPredicatedInst(I) &&
          (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
           (!Legal->isUniformMemOp(I&: *I, VF) && Decision == CM_Scalarize))) {
        // Scalarize a widened load of address or update the cost of a scalar
        // load of an address.
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
        UpdateMemOpUserCost(cast<LoadInst>(Val: I));
      } else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize all members of this interleaved group when any member
        // is used as an address. The address-used load skips scalarization
        // overhead, other members include it.
        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
          if (Instruction *Member = Group->getMember(Index: Idx)) {
            InstructionCost Cost =
                AddrDefs.contains(Ptr: Member)
                    ? (VF.getKnownMinValue() *
                       getMemoryInstructionCost(I: Member,
                                                VF: ElementCount::getFixed(MinVal: 1)))
                    : getMemInstScalarizationCost(I: Member, VF);
            setWideningDecision(I: Member, VF, W: CM_Scalarize, Cost);
            UpdateMemOpUserCost(cast<LoadInst>(Val: Member));
          }
        }
      }
    } else {
      // Cannot scalarize fixed-order recurrence phis at the moment.
      if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
        continue;

      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
    }
  }
}
5855
5856void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
5857 assert(!VF.isScalar() &&
5858 "Trying to set a vectorization decision for a scalar VF");
5859
5860 auto ForcedScalar = ForcedScalars.find(Val: VF);
5861 for (BasicBlock *BB : TheLoop->blocks()) {
5862 // For each instruction in the old loop.
5863 for (Instruction &I : *BB) {
5864 CallInst *CI = dyn_cast<CallInst>(Val: &I);
5865
5866 if (!CI)
5867 continue;
5868
5869 InstructionCost ScalarCost = InstructionCost::getInvalid();
5870 InstructionCost VectorCost = InstructionCost::getInvalid();
5871 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
5872 Function *ScalarFunc = CI->getCalledFunction();
5873 Type *ScalarRetTy = CI->getType();
5874 SmallVector<Type *, 4> Tys, ScalarTys;
5875 for (auto &ArgOp : CI->args())
5876 ScalarTys.push_back(Elt: ArgOp->getType());
5877
5878 // Estimate cost of scalarized vector call. The source operands are
5879 // assumed to be vectors, so we need to extract individual elements from
5880 // there, execute VF scalar calls, and then gather the result into the
5881 // vector return value.
5882 if (VF.isFixed()) {
5883 InstructionCost ScalarCallCost =
5884 TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);
5885
5886 // Compute costs of unpacking argument values for the scalar calls and
5887 // packing the return values to a vector.
5888 InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);
5889 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
5890 } else {
5891 // There is no point attempting to calculate the scalar cost for a
5892 // scalable VF as we know it will be Invalid.
5893 assert(!getScalarizationOverhead(CI, VF).isValid() &&
5894 "Unexpected valid cost for scalarizing scalable vectors");
5895 ScalarCost = InstructionCost::getInvalid();
5896 }
5897
5898 // Honor ForcedScalars and UniformAfterVectorization decisions.
5899 // TODO: For calls, it might still be more profitable to widen. Use
5900 // VPlan-based cost model to compare different options.
5901 if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
5902 ForcedScalar->second.contains(Ptr: CI)) ||
5903 isUniformAfterVectorization(I: CI, VF))) {
5904 setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
5905 IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
5906 Cost: ScalarCost);
5907 continue;
5908 }
5909
5910 bool MaskRequired = isMaskRequired(I: CI);
5911 // Compute corresponding vector type for return value and arguments.
5912 Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
5913 for (Type *ScalarTy : ScalarTys)
5914 Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));
5915
5916 // An in-loop reduction using an fmuladd intrinsic is a special case;
5917 // we don't want the normal cost for that intrinsic.
5918 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
5919 if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
5920 setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
5921 IID: getVectorIntrinsicIDForCall(CI, TLI),
5922 MaskPos: std::nullopt, Cost: *RedCost);
5923 continue;
5924 }
5925
5926 // Find the cost of vectorizing the call, if we can find a suitable
5927 // vector variant of the function.
5928 VFInfo FuncInfo;
5929 Function *VecFunc = nullptr;
5930 // Search through any available variants for one we can use at this VF.
5931 for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
5932 // Must match requested VF.
5933 if (Info.Shape.VF != VF)
5934 continue;
5935
5936 // Must take a mask argument if one is required
5937 if (MaskRequired && !Info.isMasked())
5938 continue;
5939
5940 // Check that all parameter kinds are supported
5941 bool ParamsOk = true;
5942 for (VFParameter Param : Info.Shape.Parameters) {
5943 switch (Param.ParamKind) {
5944 case VFParamKind::Vector:
5945 break;
5946 case VFParamKind::OMP_Uniform: {
5947 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
5948 // Make sure the scalar parameter in the loop is invariant.
5949 if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
5950 L: TheLoop))
5951 ParamsOk = false;
5952 break;
5953 }
5954 case VFParamKind::OMP_Linear: {
5955 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
5956 // Find the stride for the scalar parameter in this loop and see if
5957 // it matches the stride for the variant.
5958 // TODO: do we need to figure out the cost of an extract to get the
5959 // first lane? Or do we hope that it will be folded away?
5960 ScalarEvolution *SE = PSE.getSE();
5961 if (!match(S: SE->getSCEV(V: ScalarParam),
5962 P: m_scev_AffineAddRec(
5963 Op0: m_SCEV(), Op1: m_scev_SpecificSInt(V: Param.LinearStepOrPos),
5964 L: m_SpecificLoop(L: TheLoop))))
5965 ParamsOk = false;
5966 break;
5967 }
5968 case VFParamKind::GlobalPredicate:
5969 break;
5970 default:
5971 ParamsOk = false;
5972 break;
5973 }
5974 }
5975
5976 if (!ParamsOk)
5977 continue;
5978
5979 // Found a suitable candidate, stop here.
5980 VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
5981 FuncInfo = Info;
5982 break;
5983 }
5984
5985 if (TLI && VecFunc && !CI->isNoBuiltin())
5986 VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);
5987
5988 // Find the cost of an intrinsic; some targets may have instructions that
5989 // perform the operation without needing an actual call.
5990 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
5991 if (IID != Intrinsic::not_intrinsic)
5992 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
5993
5994 InstructionCost Cost = ScalarCost;
5995 InstWidening Decision = CM_Scalarize;
5996
5997 if (VectorCost.isValid() && VectorCost <= Cost) {
5998 Cost = VectorCost;
5999 Decision = CM_VectorCall;
6000 }
6001
6002 if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
6003 Cost = IntrinsicCost;
6004 Decision = CM_IntrinsicCall;
6005 }
6006
6007 setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
6008 MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
6009 }
6010 }
6011}
6012
6013bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6014 if (!Legal->isInvariant(V: Op))
6015 return false;
6016 // Consider Op invariant, if it or its operands aren't predicated
6017 // instruction in the loop. In that case, it is not trivially hoistable.
6018 auto *OpI = dyn_cast<Instruction>(Val: Op);
6019 return !OpI || !TheLoop->contains(Inst: OpI) ||
6020 (!isPredicatedInst(I: OpI) &&
6021 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6022 all_of(Range: OpI->operands(),
6023 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6024}
6025
// Estimate the cost of executing instruction \p I when vectorizing with
// factor \p VF. The function dispatches on the opcode and queries TTI with
// the vectorized type (or the scalar type for instructions that remain
// uniform or are forced/decided scalar). Returns an invalid cost when the
// instruction cannot be vectorized at \p VF.
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  // Their cost is the scalar cost replicated once per lane.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
             VF.getKnownMinValue();
  }

  // Cost narrowed instructions at their minimal bitwidth, as computed by the
  // MinBWs analysis.
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto *SE = PSE.getSE();

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    [[maybe_unused]] auto HasSingleCopyAfterVectorization =
        [this](Instruction *I, ElementCount VF) -> bool {
      if (VF.isScalar())
        return true;

      auto Scalarized = InstsToScalarize.find(Key: VF);
      assert(Scalarized != InstsToScalarize.end() &&
             "VF not yet analyzed for scalarization profitability");
      return !Scalarized->second.count(Key: I) &&
             llvm::all_of(Range: I->users(), P: [&](User *U) {
               auto *UI = cast<Instruction>(Val: U);
               return !Scalarized->second.count(Key: UI);
             });
    };

    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);

  // If the target reports zero parts for the vector type, it cannot be
  // legalized, so the instruction cannot be costed at this VF.
  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(Tp: VectorTy))
    return InstructionCost::getInvalid();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::UncondBr:
  case Instruction::CondBr: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced by
    // a single branch controlling the loop, so there is no extra overhead from
    // scalarization.
    bool ScalarPredicatedBB = false;
    CondBrInst *BI = dyn_cast<CondBrInst>(Val: I);
    if (VF.isVector() && BI &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (TTI.getScalarizationOverhead(
                  Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
                  /*Insert*/ false, /*Extract*/ true, CostKind) +
              (TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind) *
               VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::UncondBr, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    // A vectorized switch is costed as one vector compare per case.
    if (VF.isScalar())
      return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
    auto *Switch = cast<SwitchInst>(Val: I);
    return Switch->getNumCases() *
           TTI.getCmpSelInstrCost(
               Opcode: Instruction::ICmp,
               ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
               CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
               VecPred: CmpInst::ICMP_EQ, CostKind);
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Splice mask: take the last element of the previous vector followed by
      // the first VF-1 elements of the current one.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                DstTy: cast<VectorType>(Val: VectorTy),
                                SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(Val: U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(Key: HeaderUser);
        if (Iter != ReductionVars.end() &&
            RecurrenceDescriptor::isAnyOfRecurrenceKind(
                Kind: Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(C&: Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
                 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }

    // When tail folding with EVL, if the phi is part of an out of loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
        Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
      IntrinsicCostAttributes ICA(
          Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
          {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Predicated div/rem is either scalarized or executed with a safe
    // divisor; pick whichever variant the cost model prefers.
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      InstructionCost MulCost = TTI::TCC_Free;
      ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(C&: I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
          PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
         (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
          PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
        PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
        isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
      Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        shouldConsiderInvariant(Op: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    // A loop-invariant condition stays scalar after vectorization.
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And,
          Ty: VectorTy, CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: {Op0, Op1}, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
                                  Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();

    if (canTruncateToMinimalBitwidth(I, VF)) {
      [[maybe_unused]] Instruction *Op0AsInstruction =
          dyn_cast<Instruction>(Val: I->getOperand(i: 0));
      assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
              MinBWs[I] == MinBWs[Op0AsInstruction]) &&
             "if both the operand and the compare are marked for "
             "truncation, they must have the same bitwidth");
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
    }

    VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(
        Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
        VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    // Memory instructions use the widening decision taken earlier; a
    // CM_Scalarize decision is costed at width 1.
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return isPredicatedInst(I) ? TTI::CastContextHint::Masked
                                   : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;

    if (canTruncateToMinimalBitwidth(I, VF)) {
      // If the result type is <= the source type, there will be no extend
      // after truncating the users to the minimal required bitwidth.
      if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
          (I->getOpcode() == Instruction::ZExt ||
           I->getOpcode() == Instruction::SExt))
        return 0;
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    // Use the generic per-user TTI cost query.
    return TTI.getInstructionCost(U: I, CostKind);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: RetTy, CostKind);
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
6463
// Populate ValuesToIgnore (ignored in both scalar and vector cost modelling)
// and VecValuesToIgnore (ignored only in vector versions) with ephemeral
// values, trivially dead instructions, branches around dead blocks, and
// casts introduced during reduction/induction detection.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);

  // Worklists seeded below; both grow while being processed.
  SmallVector<Value *, 4> DeadInterleavePointerOps;
  SmallVector<Value *, 4> DeadOps;

  // If a scalar epilogue is required, users outside the loop won't use
  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
  // that is the case.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
  };

  // Walk blocks in reverse RPO and instructions bottom-up, so users are
  // visited before the values they use.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
    for (Instruction &I : reverse(C&: *BB)) {
      if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
        continue;

      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps to seed worklist.
      if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
          all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
            return VecValuesToIgnore.contains(Ptr: U) ||
                   ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
          }))
        DeadOps.push_back(Elt: &I);

      // For interleave groups, we only create a pointer for the start of the
      // interleave group. Queue up addresses of group members except the insert
      // position for further processing.
      if (isAccessInterleaved(Instr: &I)) {
        auto *Group = getInterleavedAccessGroup(Instr: &I);
        if (Group->getInsertPos() == &I)
          continue;
        Value *PointerOp = getLoadStorePointerOperand(V: &I);
        DeadInterleavePointerOps.push_back(Elt: PointerOp);
      }

      // Queue branches for analysis. They are dead, if their successors only
      // contain dead instructions.
      if (isa<CondBrInst>(Val: &I))
        DeadOps.push_back(Elt: &I);
    }

  // Mark ops feeding interleave group members as free, if they are only used
  // by other dead computations.
  // Note: the worklist grows inside the loop, so size() is re-read each
  // iteration on purpose.
  for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
    if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
          Instruction *UI = cast<Instruction>(Val: U);
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 (!isAccessInterleaved(Instr: UI) ||
                  getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
        }))
      continue;
    VecValuesToIgnore.insert(Ptr: Op);
    append_range(C&: DeadInterleavePointerOps, R: Op->operands());
  }

  // Mark ops that would be trivially dead and are only used by ignored
  // instructions as free.
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if the block contains only dead instructions. Such blocks will
  // be removed by VPlan-to-VPlan transforms and won't be considered by the
  // VPlan-based cost model, so skip them in the legacy cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(Range&: *BB, P: [this](Instruction &I) {
      return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
             isa<UncondBrInst>(Val: &I);
    });
  };
  // DeadOps also grows while being processed (operands of dead ops are
  // appended), so size() is re-read each iteration.
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<CondBrInst>(Val: Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(i: 0);
      BasicBlock *ElseBB = Br->getSuccessor(i: 1);
      // Don't considers branches leaving the loop for simplification.
      if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      // The branch is dead when both targets are empty, or when one target is
      // empty and simply falls through to the other (diamond/triangle shapes
      // without phis in the join block).
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Ptr: Br);
        // The branch condition itself may be dead now; re-queue it.
        DeadOps.push_back(Elt: Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Inst: Op) ||
        (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
        any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
        }))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Range: Op->users(),
               P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
      ValuesToIgnore.insert(Ptr: Op);

    VecValuesToIgnore.insert(Ptr: Op);
    append_range(C&: DeadOps, R: Op->operands());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    VecValuesToIgnore.insert_range(R: IndDes.getCastInsts());
  }
}
6599
6600void LoopVectorizationCostModel::collectInLoopReductions() {
6601 // Avoid duplicating work finding in-loop reductions.
6602 if (!InLoopReductions.empty())
6603 return;
6604
6605 for (const auto &Reduction : Legal->getReductionVars()) {
6606 PHINode *Phi = Reduction.first;
6607 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6608
6609 // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
6610 // separately and should not be considered for in-loop reductions.
6611 if (RdxDesc.hasUsesOutsideReductionChain())
6612 continue;
6613
6614 // We don't collect reductions that are type promoted (yet).
6615 if (RdxDesc.getRecurrenceType() != Phi->getType())
6616 continue;
6617
6618 // In-loop AnyOf and FindIV reductions are not yet supported.
6619 RecurKind Kind = RdxDesc.getRecurrenceKind();
6620 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
6621 RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) ||
6622 RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
6623 continue;
6624
6625 // If the target would prefer this reduction to happen "in-loop", then we
6626 // want to record it as such.
6627 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6628 !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
6629 continue;
6630
6631 // Check that we can correctly put the reductions into the loop, by
6632 // finding the chain of operations that leads from the phi to the loop
6633 // exit value.
6634 SmallVector<Instruction *, 4> ReductionOperations =
6635 RdxDesc.getReductionOpChain(Phi, L: TheLoop);
6636 bool InLoop = !ReductionOperations.empty();
6637
6638 if (InLoop) {
6639 InLoopReductions.insert(Ptr: Phi);
6640 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6641 Instruction *LastChain = Phi;
6642 for (auto *I : ReductionOperations) {
6643 InLoopReductionImmediateChains[I] = LastChain;
6644 LastChain = I;
6645 }
6646 }
6647 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6648 << " reduction for phi: " << *Phi << "\n");
6649 }
6650}
6651
6652// This function will select a scalable VF if the target supports scalable
6653// vectors and a fixed one otherwise.
6654// TODO: we could return a pair of values that specify the max VF and
6655// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6656// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6657// doesn't have a cost model that can choose which plan to execute if
6658// more than one is generated.
6659static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6660 LoopVectorizationCostModel &CM) {
6661 unsigned WidestType;
6662 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6663
6664 TargetTransformInfo::RegisterKind RegKind =
6665 TTI.enableScalableVectorization()
6666 ? TargetTransformInfo::RGK_ScalableVector
6667 : TargetTransformInfo::RGK_FixedWidthVector;
6668
6669 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6670 unsigned N = RegSize.getKnownMinValue() / WidestType;
6671 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6672}
6673
// Plan vectorization on the VPlan-native path (outer loops only). Returns the
// selected VF, or VectorizationFactor::Disabled() when no VPlan could be
// built, when stress-testing VPlan construction, or for inner loops.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // A scalable user VF cannot be honored when the target lacks scalable
      // vector support (and support isn't forced); report and bail out.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target",
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(MinVF: VF, MaxVF: VF);

    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    // No cost model is available on this path yet; report zero costs.
    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
6728
// Entry point for planning vectorization of an innermost loop: computes the
// maximum feasible fixed/scalable VFs, collects per-VF cost-model decisions,
// and builds VPlans for all candidate VFs (or only the user-provided VF if it
// is valid and profitable).
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  // A user VF is checked against the maximum safe VF of the matching
  // (fixed vs. scalable) kind.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
      reportVectorizationInfo(
          Msg: "UserVF ignored because it may be larger than the maximal safe VF",
          ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      // Fall through to the normal candidate selection below.
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
                              ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates: all powers of two up to the
  // fixed and scalable maxima, respectively.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}
6799
6800InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6801 ElementCount VF) const {
6802 InstructionCost Cost = CM.getInstructionCost(I: UI, VF);
6803 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6804 return InstructionCost(ForceTargetInstructionCost);
6805 return Cost;
6806}
6807
6808bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6809 ElementCount VF) const {
6810 return CM.isUniformAfterVectorization(I, VF);
6811}
6812
6813bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6814 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6815 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6816 SkipCostComputation.contains(Ptr: UI);
6817}
6818
6819uint64_t VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
6820 return CM.getPredBlockCostDivisor(CostKind, BB);
6821}
6822
/// Pre-compute, with the legacy cost model, the costs of instructions whose
/// VPlan-based recipe costs would not match the legacy model: induction phis
/// and increments, exit conditions, branches, and forced-scalar /
/// profitable-to-scalarize instructions. Each covered instruction is added to
/// \p CostCtx.SkipCostComputation so the VPlan-based computation skips it,
/// and the accumulated legacy cost is returned.
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
    // Transitively collect single-use, in-loop operands feeding the increment.
    // IVInsts grows while being iterated, acting as a worklist.
    SmallVector<Instruction *> IVInsts = {IVInc};
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Val: Op);
        if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(Elt: OpI);
      }
    }
    IVInsts.push_back(Elt: IV);
    // Also pre-compute IV truncates that can be folded into the widened
    // induction.
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(Val: U);
      if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
        continue;
      IVInsts.push_back(Elt: CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
                                           InstsToIgnore&: CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(Ptr: IVInst);
    }
  }

  /// Compute the cost of all exiting conditions of the loop using the legacy
  /// cost model. This is to match the legacy behavior, which adds the cost of
  /// all exit conditions. Note that this over-estimates the cost, as there will
  /// be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  // NOTE(review): only conditional-branch terminators are considered here;
  // exiting blocks with other terminators are skipped — confirm this matches
  // the legacy model's treatment of such exits.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<CondBrInst>(Val: EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
      ExitInstrs.insert(X: CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  // ExitInstrs grows while being iterated, acting as a worklist.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(Inst: CondI) ||
        !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Val: Op);
      // Only follow operands whose users all feed exit conditions; otherwise
      // the operand is needed elsewhere and must be costed normally.
      if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
          any_of(Range: OpI->users(), P: [&ExitInstrs](User *U) {
            return !ExitInstrs.contains(key: cast<Instruction>(Val: U));
          }))
        continue;
      ExitInstrs.insert(X: OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Don't apply special costs when instruction cost is forced to make sure the
  // forced cost is used for each recipe.
  if (ForceTargetInstructionCost.getNumOccurrences())
    return Cost;

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  // The scalarization cost was already computed by the legacy model; just
  // account for it here.
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}
6968
// Compute the total cost of \p Plan at \p VF: the legacy pre-computed costs
// plus the VPlan-based recipe costs, plus spill costs when register pressure
// is considered for this VF (in which case \p RU must be non-null).
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
                                               VPRegisterUsage *RU) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, Ctx&: CostCtx);

  // Add the cost of spills due to excess register usage
  if (CM.shouldConsiderRegPressureForVF(VF))
    Cost += RU->spillCost(Ctx&: CostCtx, OverrideMaxNumRegs: ForceTargetNumVectorRegs);

#ifndef NDEBUG
  // Debug-only reporting of the estimated per-lane cost.
  unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}
6994
#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplification that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  using namespace VPlanPatternMatch;
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  // Check if a select for a safe divisor was hoisted to the pre-header. If so,
  // the select doesn't need to be considered for the vector loop cost; go with
  // the more accurate VPlan-based cost model.
  for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
    auto *VPI = dyn_cast<VPInstruction>(&R);
    if (!VPI || VPI->getOpcode() != Instruction::Select)
      continue;

    if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
      switch (WR->getOpcode()) {
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
    }
  }

  // Walk all recipes in the vector loop region, recording the underlying
  // instructions the VPlan still covers and bailing out (returning true) on
  // any known source of divergence from the legacy cost model.
  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      // All members of an interleave group are covered by its single recipe.
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost it whilst the legacy will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        if (none_of(FOR->users(),
                    match_fn(m_VPInstruction<
                             VPInstruction::FirstOrderRecurrenceSplice>())))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reductions and
      // comparing against the legacy cost isn't desirable.
      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
        if (VPR->isPartialReduction())
          return true;

      // The VPlan-based cost model can analyze if recipes are scalar
      // recursively, but the legacy cost model cannot.
      if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
        auto *AddrI = dyn_cast<Instruction>(
            getLoadStorePointerOperand(&WidenMemR->getIngredient()));
        if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
                         CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
          return true;

        if (WidenMemR->isReverse()) {
          // If the stored value of a reverse store is invariant, LICM will
          // hoist the reverse operation to the preheader. In this case, the
          // result of the VPlan-based cost model will diverge from that of
          // the legacy model.
          if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;

          if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;
        }
      }

      // The legacy cost model costs non-header phis with a scalar VF as a phi,
      // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
      if (isa<VPBlendRecipe>(&R) &&
          vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
        return true;

      // The legacy cost model won't calculate the cost of the LogicalAnd which
      // will be replaced with vp_merge.
      if (match(&R, m_Intrinsic<Intrinsic::vp_merge>()))
        return true;

      /// If a VPlan transform folded a recipe to one producing a single-scalar,
      /// but the original instruction wasn't uniform-after-vectorization in the
      /// legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        CmpPredicate Pred;
        if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
            cast<VPRecipeWithIRFlags>(R).getPredicate() !=
                cast<CmpInst>(UI)->getPredicate())
          return true;

        // Recipes with underlying instructions being moved out of the loop
        // region by LICM may cause discrepancies between the legacy cost model
        // and the VPlan-based cost model.
        if (!VPBB->getEnclosingLoopRegion())
          return true;

        SeenInstrs.insert(UI);
      }
    }
  }

  // If a reverse recipe has been sunk to the middle block (e.g., for a load
  // whose result is only used as a live-out), VPlan avoids the per-iteration
  // reverse shuffle cost that the legacy model accounts for.
  if (any_of(*Plan.getMiddleBlock(), [](const VPRecipeBase &R) {
        return match(&R, m_VPInstruction<VPInstruction::Reverse>());
      }))
    return true;

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may not
      // be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
#endif
7150
// Pick the best vectorization factor across all built VPlans using the
// VPlan-based cost model, comparing every candidate VF against the scalar
// loop's cost. In assert builds, cross-checks the decision against the legacy
// cost model unless the plan contains known simplifications the legacy model
// cannot account for.
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                    << (CM.CostKind == TTI::TCK_RecipThroughput
                            ? "Reciprocal Throughput\n"
                        : CM.CostKind == TTI::TCK_Latency
                            ? "Instruction Latency\n"
                        : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                        : CM.CostKind == TTI::TCK_SizeAndLatency
                            ? "Code Size and Latency\n"
                            : "Unknown\n"));

  ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  // Evaluate every vector VF of every plan against the current best.
  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is computed per-plan, but only when at least one of its
    // VFs needs register-pressure consideration.
    SmallVector<VPRegisterUsage, 8> RUs;
    bool ConsiderRegPressure = any_of(Range&: VFs, P: [this](ElementCount VF) {
      return CM.shouldConsiderRegPressureForVF(VF);
    });
    if (ConsiderRegPressure)
      RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      InstructionCost Cost =
          cost(Plan&: *P, VF, RU: ConsiderRegPressure ? &RUs[I] : nullptr);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

      if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
        BestFactor = CurrentFactor;

      // If profitable add it to ProfitableVF list.
      if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
        ProfitableVFs.push_back(Elt: CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost-model.
  // This is now only used to verify the decisions by the new VPlan-based
  // cost-model and will be retired once the VPlan-based cost-model is
  // stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause a
  // different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, CM.PSE,
                        OrigLoop);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // Verify that the VPlan-based and legacy cost models agree, except for
  // * VPlans with early exits,
  // * VPlans with additional VPlan simplifications,
  // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
  //   vp_scatter/vp_gather).
  // The legacy cost model doesn't properly model costs for such loops.
  bool UsesEVLGatherScatter =
      any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
                 BestPlan.getVectorLoopRegion()->getEntry())),
             [](VPBasicBlock *VPBB) {
               return any_of(*VPBB, [](VPRecipeBase &R) {
                 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
                        !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
               });
             });
  assert(
      (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
       !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
       planContainsAdditionalSimplifications(
           getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
       planContainsAdditionalSimplifications(
           getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
      " VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}
7276
// Execute \p BestVPlan with \p BestVF and \p BestUF: run the final VPlan
// transform pipeline (unrolling, materialization, simplification, region
// dissolution), create the vector-loop skeleton, generate code, and update
// loop metadata/profile info. Returns the SCEV->Value map produced by SCEV
// expansion (used by epilogue vectorization).
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
    ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
    InnerLoopVectorizer &ILV, DominatorTree *DT,
    EpilogueVectorizationKind EpilogueVecKind) {
  assert(BestVPlan.hasVF(BestVF) &&
         "Trying to execute plan with unsupported VF");
  assert(BestVPlan.hasUF(BestUF) &&
         "Trying to execute plan with unsupported UF");
  if (BestVPlan.hasEarlyExit())
    ++LoopsEarlyExitVectorized;
  // TODO: Move to VPlan transform stage once the transition to the VPlan-based
  // cost model is complete for better cost estimates.
  RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
  RUN_VPLAN_PASS(VPlanTransforms::materializePacksAndUnpacks, BestVPlan);
  RUN_VPLAN_PASS(VPlanTransforms::materializeBroadcasts, BestVPlan);
  RUN_VPLAN_PASS(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
  // Propagate branch weights from the original latch to the middle-block
  // terminator, if present.
  bool HasBranchWeights =
      hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
  if (HasBranchWeights) {
    std::optional<unsigned> VScale = CM.getVScaleForTuning();
    RUN_VPLAN_PASS(VPlanTransforms::addBranchWeightToMiddleTerminator,
                   BestVPlan, BestVF, VScale);
  }

  // Retrieving VectorPH now when it's easier while VPlan still has Regions.
  VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());

  VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
  VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
  if (EpilogueVecKind == EpilogueVectorizationKind::None)
    VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan);
  if (BestVPlan.getEntry()->getSingleSuccessor() ==
      BestVPlan.getScalarPreheader()) {
    // TODO: The vector loop would be dead, should not even try to vectorize.
    ORE->emit(RemarkBuilder: [&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
                                        OrigLoop->getStartLoc(),
                                        OrigLoop->getHeader())
             << "Created vector loop never executes due to insufficient trip "
                "count.";
    });
    return DenseMap<const SCEV *, Value *>();
  }

  VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);

  VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan);
  // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
  VPlanTransforms::convertEVLExitCond(Plan&: BestVPlan);
  // Regions are dissolved after optimizing for VF and UF, which completely
  // removes unneeded loop regions first.
  VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
  // Expand BranchOnTwoConds after dissolution, when latch has direct access to
  // its successors.
  VPlanTransforms::expandBranchOnTwoConds(Plan&: BestVPlan);
  // Convert loops with variable-length stepping after regions are dissolved.
  VPlanTransforms::convertToVariableLengthStep(Plan&: BestVPlan);
  // Remove dead back-edges for single-iteration loops with BranchOnCond(true).
  // Only process loop latches to avoid removing edges from the middle block,
  // which may be needed for epilogue vectorization.
  VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan, /*OnlyLatches=*/true);
  VPlanTransforms::materializeBackedgeTakenCount(Plan&: BestVPlan, VectorPH);
  VPlanTransforms::materializeVectorTripCount(
      Plan&: BestVPlan, VectorPHVPBB: VectorPH, TailByMasking: CM.foldTailByMasking(),
      RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: BestVF.isVector()), Step: &BestVPlan.getVFxUF());
  VPlanTransforms::materializeFactors(Plan&: BestVPlan, VectorPH, VF: BestVF);
  VPlanTransforms::cse(Plan&: BestVPlan);
  VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
  VPlanTransforms::simplifyKnownEVL(Plan&: BestVPlan, VF: BestVF, PSE);

  // 0. Generate SCEV-dependent code in the entry, including TripCount, before
  // making any changes to the CFG.
  DenseMap<const SCEV *, Value *> ExpandedSCEVs =
      VPlanTransforms::expandSCEVs(Plan&: BestVPlan, SE&: *PSE.getSE());

  // Perform the actual loop transformation.
  VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
                         OrigLoop->getParentLoop(),
                         Legal->getWidestInductionType());

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
#endif

  // 1. Set up the skeleton for vectorization, including vector pre-header and
  // middle block. The vector loop is created during VPlan execution.
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  replaceVPBBWithIRVPBB(VPBB: BestVPlan.getScalarPreheader(),
                        IRBB: State.CFG.PrevBB->getSingleSuccessor(), Plan: &BestVPlan);
  VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);

  assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");

  // After vectorization, the exit blocks of the original loop will have
  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
  // looked through single-entry phis.
  ScalarEvolution &SE = *PSE.getSE();
  for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
    if (!Exit->hasPredecessors())
      continue;
    for (VPRecipeBase &PhiR : Exit->phis())
      SE.forgetLcssaPhiWithNewPredecessor(L: OrigLoop,
                                          V: &cast<VPIRPhi>(Val&: PhiR).getIRPhi());
  }
  // Forget the original loop and block dispositions.
  SE.forgetLoop(L: OrigLoop);
  SE.forgetBlockAndLoopDispositions();

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // Retrieve loop information before executing the plan, which may remove the
  // original loop, if it becomes unreachable.
  MDNode *LID = OrigLoop->getLoopID();
  unsigned OrigLoopInvocationWeight = 0;
  std::optional<unsigned> OrigAverageTripCount =
      getLoopEstimatedTripCount(L: OrigLoop, EstimatedLoopInvocationWeight: &OrigLoopInvocationWeight);

  BestVPlan.execute(State: &State);

  // 2.6. Maintain Loop Hints
  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
  // Add metadata to disable runtime unrolling a scalar loop when there
  // are no runtime checks about strides and memory. A scalar loop that is
  // rarely used is not worth unrolling.
  bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
  updateLoopMetadataAndProfileInfo(
      VectorLoop: HeaderVPBB ? LI->getLoopFor(BB: State.CFG.VPBB2IRBB.lookup(Val: HeaderVPBB))
                 : nullptr,
      HeaderVPBB, Plan: BestVPlan,
      VectorizingEpilogue: EpilogueVecKind == EpilogueVectorizationKind::Epilogue, OrigLoopID: LID,
      OrigAverageTripCount, OrigLoopInvocationWeight,
      EstimatedVFxUF: estimateElementCount(VF: BestVF * BestUF, VScale: CM.getVScaleForTuning()),
      DisableRuntimeUnroll);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  // predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();

  return ExpandedSCEVs;
}
7429
7430//===--------------------------------------------------------------------===//
7431// EpilogueVectorizerMainLoop
7432//===--------------------------------------------------------------------===//
7433
7434void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7435 LLVM_DEBUG({
7436 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7437 << "Main Loop VF:" << EPI.MainLoopVF
7438 << ", Main Loop UF:" << EPI.MainLoopUF
7439 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7440 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7441 });
7442}
7443
7444void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7445 DEBUG_WITH_TYPE(VerboseDebug, {
7446 dbgs() << "intermediate fn:\n"
7447 << *OrigLoop->getHeader()->getParent() << "\n";
7448 });
7449}
7450
7451//===--------------------------------------------------------------------===//
7452// EpilogueVectorizerEpilogueLoop
7453//===--------------------------------------------------------------------===//
7454
/// This function creates a new scalar preheader, using the previous one as
/// entry block to the epilogue VPlan. The minimum iteration check is being
/// represented in VPlan.
BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
  // Split off a fresh scalar preheader; the previous one becomes the
  // iteration-check block guarding entry to the epilogue vector loop.
  BasicBlock *NewScalarPH = createScalarPreheader(Prefix: "vec.epilog.");
  BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
  OriginalScalarPH->setName("vec.epilog.iter.check");
  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: OriginalScalarPH);
  VPBasicBlock *OldEntry = Plan.getEntry();
  // Migrate all movable recipes from the old entry block into the new one.
  for (auto &R : make_early_inc_range(Range&: *OldEntry)) {
    // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable
    // by definition.
    if (isa<VPIRInstruction>(Val: &R))
      continue;
    R.moveBefore(BB&: *NewEntry, I: NewEntry->end());
  }

  // Re-point all edges of OldEntry at NewEntry and make it the plan's entry.
  VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
  Plan.setEntry(NewEntry);
  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.

  return OriginalScalarPH;
}
7478
7479void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7480 LLVM_DEBUG({
7481 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7482 << "Epilogue Loop VF:" << EPI.EpilogueVF
7483 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7484 });
7485}
7486
7487void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7488 DEBUG_WITH_TYPE(VerboseDebug, {
7489 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7490 });
7491}
7492
/// Build a widened recipe for the memory operation \p VPI (a load or store)
/// if the cost model decided to widen it for some VF in \p Range; returns
/// nullptr otherwise, clamping \p Range to VFs with a uniform decision.
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
                                                VFRange &Range) {
  assert((VPI->getOpcode() == Instruction::Load ||
          VPI->getOpcode() == Instruction::Store) &&
         "Must be called with either a load or store");
  Instruction *I = VPI->getUnderlyingInstr();

  // Widen when the cost model's decision is to interleave or widen; do not
  // widen when the access is scalar after vectorization, profitably
  // scalarized, or explicitly marked for scalarization.
  auto WillWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
    return nullptr;

  // If a mask is not required, drop it - use unmasked version for safe loads.
  // TODO: Determine if mask is needed in VPlan.
  VPValue *Mask = CM.isMaskRequired(I) ? VPI->getMask() : nullptr;

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive. Query the decision at the start of the (already
  // clamped) range.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // Loads carry the pointer as operand 0, stores as operand 1 (the stored
  // value comes first).
  VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(N: 0)
                                                       : VPI->getOperand(N: 1);
  if (Consecutive) {
    // For consecutive accesses, materialize an explicit vector pointer
    // recipe (end pointer for reverse accesses).
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop: drop the GEP no-wrap flags in this case.
      // Otherwise preserve existing flags without no-unsigned-wrap, as we will
      // emit negative indices.
      GEPNoWrapFlags Flags =
          CM.foldTailByMasking() || !GEP
              ? GEPNoWrapFlags::none()
              : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
      VectorPtr = new VPVectorEndPointerRecipe(
          Ptr, &Plan.getVF(), getLoadStoreType(I),
          /*Stride*/ -1, Flags, VPI->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            VPI->getDebugLoc());
    }
    Builder.insert(R: VectorPtr);
    Ptr = VectorPtr;
  }

  if (VPI->getOpcode() == Instruction::Load) {
    auto *Load = cast<LoadInst>(Val: I);
    auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                        *VPI, Load->getDebugLoc());
    // A reverse access loads forward and then reverses the loaded vector.
    if (Reverse) {
      Builder.insert(R: LoadR);
      return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
                               LoadR->getDebugLoc());
    }
    return LoadR;
  }

  // For a reverse store, reverse the value to be stored first.
  StoreInst *Store = cast<StoreInst>(Val: I);
  VPValue *StoredVal = VPI->getOperand(N: 0);
  if (Reverse)
    StoredVal = Builder.createNaryOp(Opcode: VPInstruction::Reverse, Operands: StoredVal,
                                     DL: Store->getDebugLoc());
  return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
                                Reverse, *VPI, Store->getDebugLoc());
}
7576
/// Try to replace the truncate \p VPI of an induction variable with a
/// truncated widened-induction recipe; returns nullptr if the truncate is
/// not optimizable for \p Range (clamping \p Range accordingly).
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
                                                VFRange &Range) {
  auto *I = cast<TruncInst>(Val: VPI->getUnderlyingInstr());
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether this truncation of an induction variable can be
  // optimized for the VFs in Range.
  if (!LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: bind_front(Fn: &LoopVectorizationCostModel::isOptimizableIVTruncate, BindArgs&: CM,
                     BindArgs&: I),
          Range))
    return nullptr;

  // Reuse the start value, phi and induction descriptor of the existing
  // widened-induction recipe defining the truncate's operand.
  auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
      Val: VPI->getOperand(N: 0)->getDefiningRecipe());
  PHINode *Phi = WidenIV->getPHINode();
  VPIRValue *Start = WidenIV->getStartValue();
  const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();

  // Wrap flags from the original induction do not apply to the truncated type,
  // so do not propagate them.
  VPIRFlags Flags = VPIRFlags::WrapFlagsTy(false, false);
  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());
  return new VPWidenIntOrFpInductionRecipe(
      Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
}
7608
/// Try to widen the call \p VPI either as a vector intrinsic or as a call to
/// a vector library variant; returns nullptr when the call cannot be widened
/// for \p Range (e.g. it must be scalarized with predication), clamping
/// \p Range accordingly.
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
                                                   VFRange &Range) {
  CallInst *CI = cast<CallInst>(Val: VPI->getUnderlyingInstr());
  // Calls that must be scalarized with predication are not widened here.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  // These intrinsics are not widened as calls; returning nullptr lets the
  // caller fall back to replication handling.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Only the call's actual arguments (not any mask operand) are copied here.
  SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
                                VPI->op_begin() + CI->arg_size());

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
                                      VPI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The call needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the call instruction, but the only
      //      available vector variant at this VF requires a mask, so we
      //      synthesize an all-true mask.
      VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();

      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    Ops.push_back(Elt: VPI->getOperand(N: VPI->getNumOperandsWithoutMask() - 1));
    return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  return nullptr;
}
7695
7696bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7697 assert((!isa<UncondBrInst, CondBrInst, PHINode, LoadInst, StoreInst>(I)) &&
7698 "Instruction should have been handled earlier");
7699 // Instruction should be widened, unless it is scalar after vectorization,
7700 // scalarization is profitable or it is predicated.
7701 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7702 return CM.isScalarAfterVectorization(I, VF) ||
7703 CM.isProfitableToScalarize(I, VF) ||
7704 CM.isScalarWithPredication(I, VF);
7705 };
7706 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7707 Range);
7708}
7709
7710VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
7711 auto *I = VPI->getUnderlyingInstr();
7712 switch (VPI->getOpcode()) {
7713 default:
7714 return nullptr;
7715 case Instruction::SDiv:
7716 case Instruction::UDiv:
7717 case Instruction::SRem:
7718 case Instruction::URem: {
7719 // If not provably safe, use a select to form a safe divisor before widening the
7720 // div/rem operation itself. Otherwise fall through to general handling below.
7721 if (CM.isPredicatedInst(I)) {
7722 SmallVector<VPValue *> Ops(VPI->operandsWithoutMask());
7723 VPValue *Mask = VPI->getMask();
7724 VPValue *One = Plan.getConstantInt(Ty: I->getType(), Val: 1u);
7725 auto *SafeRHS =
7726 Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: VPI->getDebugLoc());
7727 Ops[1] = SafeRHS;
7728 return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
7729 }
7730 [[fallthrough]];
7731 }
7732 case Instruction::Add:
7733 case Instruction::And:
7734 case Instruction::AShr:
7735 case Instruction::FAdd:
7736 case Instruction::FCmp:
7737 case Instruction::FDiv:
7738 case Instruction::FMul:
7739 case Instruction::FNeg:
7740 case Instruction::FRem:
7741 case Instruction::FSub:
7742 case Instruction::ICmp:
7743 case Instruction::LShr:
7744 case Instruction::Mul:
7745 case Instruction::Or:
7746 case Instruction::Select:
7747 case Instruction::Shl:
7748 case Instruction::Sub:
7749 case Instruction::Xor:
7750 case Instruction::Freeze:
7751 return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
7752 VPI->getDebugLoc());
7753 case Instruction::ExtractValue: {
7754 SmallVector<VPValue *> NewOps(VPI->operandsWithoutMask());
7755 auto *EVI = cast<ExtractValueInst>(Val: I);
7756 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7757 unsigned Idx = EVI->getIndices()[0];
7758 NewOps.push_back(Elt: Plan.getConstantInt(BitWidth: 32, Val: Idx));
7759 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
7760 }
7761 };
7762}
7763
7764VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7765 VPInstruction *VPI) {
7766 // FIXME: Support other operations.
7767 unsigned Opcode = HI->Update->getOpcode();
7768 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7769 "Histogram update operation must be an Add or Sub");
7770
7771 SmallVector<VPValue *, 3> HGramOps;
7772 // Bucket address.
7773 HGramOps.push_back(Elt: VPI->getOperand(N: 1));
7774 // Increment value.
7775 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
7776
7777 // In case of predicated execution (due to tail-folding, or conditional
7778 // execution, or both), pass the relevant mask.
7779 if (CM.isMaskRequired(I: HI->Store))
7780 HGramOps.push_back(Elt: VPI->getMask());
7781
7782 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7783}
7784
/// Build a VPReplicateRecipe for \p VPI, executing its underlying instruction
/// per lane (or once, if uniform), optionally under a mask when predication
/// is required.
VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
                                                      VFRange &Range) {
  auto *I = VPI->getUnderlyingInstr();
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = VPI->getMask();
  }

  // Note that there is some custom logic to mark some intrinsics as uniform
  // manually above for scalable vectors, which this assert needs to account for
  // as well.
  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
         "Should not predicate a uniform recipe");
  auto *Recipe =
      new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
                            BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
  return Recipe;
}
7848
/// Dispatch to the appropriate widening strategy for the non-phi recipe \p R,
/// returning the widened recipe or nullptr if \p R should be replicated
/// instead (the caller then falls back to handleReplication).
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
                                              VFRange &Range) {
  assert(!R->isPhi() && "phis must be handled earlier");
  // First, check for specific widening recipes that deal with optimizing
  // truncates, calls and memory operations.

  VPRecipeBase *Recipe;
  auto *VPI = cast<VPInstruction>(Val: R);
  if (VPI->getOpcode() == Instruction::Trunc &&
      (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::Call)
    return tryToWidenCall(VPI, Range);

  // Stores matching a histogram pattern get a dedicated recipe.
  Instruction *Instr = R->getUnderlyingInstr();
  if (VPI->getOpcode() == Instruction::Store)
    if (auto HistInfo = Legal->getHistogramInfo(I: cast<StoreInst>(Val: Instr)))
      return tryToWidenHistogram(HI: *HistInfo, VPI);

  if (VPI->getOpcode() == Instruction::Load ||
      VPI->getOpcode() == Instruction::Store)
    return tryToWidenMemory(VPI, Range);

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::GetElementPtr)
    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Val: Instr),
                                VPI->operandsWithoutMask(), *VPI,
                                VPI->getDebugLoc());

  if (Instruction::isCast(Opcode: VPI->getOpcode())) {
    auto *CI = cast<CastInst>(Val: Instr);
    auto *CastR = cast<VPInstructionWithType>(Val: VPI);
    return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(N: 0),
                                 CastR->getResultType(), CI, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  // Fall back to generic arithmetic/logic widening.
  return tryToWiden(VPI);
}
7897
/// Build VPlan candidates covering VFs from \p MinVF up to \p MaxVF: a common
/// base plan (VPlan0) is constructed once, then duplicated, specialized and
/// optimized per VF sub-range.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

  // Create initial base VPlan0, to serve as common starting point for all
  // candidates built later for specific VF ranges.
  auto VPlan0 = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE, LVer: &LVer);

  // Create recipes for header phis.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *VPlan0, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: Legal->getReductionVars(), FixedOrderRecurrences: Legal->getFixedOrderRecurrences(),
      InLoopReductions: CM.getInLoopReductions(), AllowReordering: Hints.allowReordering());

  VPlanTransforms::simplifyRecipes(Plan&: *VPlan0);
  // If we're vectorizing a loop with an uncountable exit, make sure that the
  // recipes are safe to handle.
  // TODO: Remove this once we can properly check the VPlan itself for both
  //       the presence of an uncountable exit and the presence of stores in
  //       the loop inside handleEarlyExits itself.
  UncountableExitStyle EEStyle = UncountableExitStyle::NoUncountableExit;
  if (Legal->hasUncountableEarlyExit())
    EEStyle = Legal->hasUncountableExitWithSideEffects()
                  ? UncountableExitStyle::MaskedHandleExitInScalarLoop
                  : UncountableExitStyle::ReadOnly;

  // Bail out of vectorization entirely if the early exits cannot be handled.
  if (!VPlanTransforms::handleEarlyExits(Plan&: *VPlan0, Style: EEStyle, TheLoop: OrigLoop, PSE, DT&: *DT,
                                         AC: Legal->getAssumptionCache()))
    return;
  VPlanTransforms::addMiddleCheck(Plan&: *VPlan0, TailFolded: CM.foldTailByMasking());
  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::createLoopRegions, *VPlan0);
  if (CM.foldTailByMasking())
    RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *VPlan0);
  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize,
                           *VPlan0);

  // Build one candidate per VF sub-range, duplicating VPlan0 each time so the
  // base plan stays unmodified.
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
      // Now optimize the initial VPlan.
      VPlanTransforms::hoistPredicatedLoads(Plan&: *Plan, PSE, L: OrigLoop);
      VPlanTransforms::sinkPredicatedStores(Plan&: *Plan, PSE, L: OrigLoop);
      RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
                     CM.getMinimalBitwidths());
      RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
      // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
      if (CM.foldTailWithEVL()) {
        RUN_VPLAN_PASS(VPlanTransforms::addExplicitVectorLength, *Plan,
                       CM.getMaxSafeElements());
        RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
      }

      // Narrowing interleave groups may yield an additional candidate plan.
      if (auto P = VPlanTransforms::narrowInterleaveGroups(Plan&: *Plan, TTI))
        VPlans.push_back(Elt: std::move(P));

      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(Elt: std::move(Plan));
    }
    // SubRange.End is the first VF not covered by the plan just built (or
    // attempted); continue from there.
    VF = SubRange.End;
  }
}
7976
7977VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
7978 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
7979
7980 using namespace llvm::VPlanPatternMatch;
7981 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7982
7983 // ---------------------------------------------------------------------------
7984 // Build initial VPlan: Scan the body of the loop in a topological order to
7985 // visit each basic block after having visited its predecessor basic blocks.
7986 // ---------------------------------------------------------------------------
7987
7988 bool RequiresScalarEpilogueCheck =
7989 LoopVectorizationPlanner::getDecisionAndClampRange(
7990 Predicate: [this](ElementCount VF) {
7991 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
7992 },
7993 Range);
7994 // Update the branch in the middle block if a scalar epilogue is required.
7995 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
7996 if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
7997 auto *BranchOnCond = cast<VPInstruction>(Val: MiddleVPBB->getTerminator());
7998 assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
7999 "second successor must be scalar preheader");
8000 BranchOnCond->setOperand(I: 0, New: Plan->getFalse());
8001 }
8002
  // Don't use getDecisionAndClampRange here, because we don't know the UF, so
  // it is better for this function to be conservative rather than to split
  // the range up into different VPlans.
8006 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8007 bool IVUpdateMayOverflow = false;
8008 for (ElementCount VF : Range)
8009 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8010
8011 TailFoldingStyle Style = CM.getTailFoldingStyle();
8012 // Use NUW for the induction increment if we proved that it won't overflow in
8013 // the vector loop or when not folding the tail. In the later case, we know
8014 // that the canonical induction increment will not overflow as the vector trip
8015 // count is >= increment and a multiple of the increment.
8016 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8017 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8018 if (!HasNUW) {
8019 auto *IVInc =
8020 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(N: 0);
8021 assert(match(IVInc,
8022 m_VPInstruction<Instruction::Add>(
8023 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8024 "Did not find the canonical IV increment");
8025 cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
8026 }
8027
8028 // ---------------------------------------------------------------------------
8029 // Pre-construction: record ingredients whose recipes we'll need to further
8030 // process after constructing the initial VPlan.
8031 // ---------------------------------------------------------------------------
8032
8033 // For each interleave group which is relevant for this (possibly trimmed)
8034 // Range, add it to the set of groups to be later applied to the VPlan and add
8035 // placeholders for its members' Recipes which we'll be replacing with a
8036 // single VPInterleaveRecipe.
8037 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8038 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8039 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8040 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8041 LoopVectorizationCostModel::CM_Interleave);
8042 // For scalable vectors, the interleave factors must be <= 8 since we
8043 // require the (de)interleaveN intrinsics instead of shufflevectors.
8044 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8045 "Unsupported interleave factor for scalable vectors");
8046 return Result;
8047 };
8048 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
8049 continue;
8050 InterleaveGroups.insert(Ptr: IG);
8051 }
8052
8053 // ---------------------------------------------------------------------------
8054 // Construct wide recipes and apply predication for original scalar
8055 // VPInstructions in the loop.
8056 // ---------------------------------------------------------------------------
8057 VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);
8058
8059 // Scan the body of the loop in a topological order to visit each basic block
8060 // after having visited its predecessor basic blocks.
8061 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8062 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8063 HeaderVPBB);
8064
8065 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8066
8067 // Collect blocks that need predication for in-loop reduction recipes.
8068 DenseSet<BasicBlock *> BlocksNeedingPredication;
8069 for (BasicBlock *BB : OrigLoop->blocks())
8070 if (CM.blockNeedsPredicationForAnyReason(BB))
8071 BlocksNeedingPredication.insert(V: BB);
8072
8073 VPlanTransforms::createInLoopReductionRecipes(Plan&: *Plan, BlocksNeedingPredication,
8074 MinVF: Range.Start);
8075
8076 // Now process all other blocks and instructions.
8077 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
8078 // Convert input VPInstructions to widened recipes.
8079 for (VPRecipeBase &R : make_early_inc_range(
8080 Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end()))) {
8081 // Skip recipes that do not need transforming.
8082 if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(Val: &R))
8083 continue;
8084 auto *VPI = cast<VPInstruction>(Val: &R);
8085 if (!VPI->getUnderlyingValue())
8086 continue;
8087
8088 // TODO: Gradually replace uses of underlying instruction by analyses on
8089 // VPlan. Migrate code relying on the underlying instruction from VPlan0
8090 // to construct recipes below to not use the underlying instruction.
8091 Instruction *Instr = cast<Instruction>(Val: VPI->getUnderlyingValue());
8092 Builder.setInsertPoint(VPI);
8093
8094 // The stores with invariant address inside the loop will be deleted, and
8095 // in the exit block, a uniform store recipe will be created for the final
8096 // invariant store of the reduction.
8097 StoreInst *SI;
8098 if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
8099 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
8100 // Only create recipe for the final invariant store of the reduction.
8101 if (Legal->isInvariantStoreOfReduction(SI)) {
8102 auto *Recipe = new VPReplicateRecipe(
8103 SI, VPI->operandsWithoutMask(), true /* IsUniform */,
8104 nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
8105 Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
8106 }
8107 R.eraseFromParent();
8108 continue;
8109 }
8110
8111 VPRecipeBase *Recipe =
8112 RecipeBuilder.tryToCreateWidenNonPhiRecipe(R: VPI, Range);
8113 if (!Recipe)
8114 Recipe =
8115 RecipeBuilder.handleReplication(VPI: cast<VPInstruction>(Val: VPI), Range);
8116
8117 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8118 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
8119 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8120 // moved to the phi section in the header.
8121 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8122 } else {
8123 Builder.insert(R: Recipe);
8124 }
8125 if (Recipe->getNumDefinedValues() == 1) {
8126 VPI->replaceAllUsesWith(New: Recipe->getVPSingleValue());
8127 } else {
8128 assert(Recipe->getNumDefinedValues() == 0 &&
8129 "Unexpected multidef recipe");
8130 }
8131 R.eraseFromParent();
8132 }
8133 }
8134
8135 assert(isa<VPRegionBlock>(LoopRegion) &&
8136 !LoopRegion->getEntryBasicBlock()->empty() &&
8137 "entry block must be set to a VPRegionBlock having a non-empty entry "
8138 "VPBasicBlock");
8139
8140 // TODO: We can't call runPass on these transforms yet, due to verifier
8141 // failures.
8142 VPlanTransforms::addExitUsersForFirstOrderRecurrences(Plan&: *Plan, Range);
8143
8144 // ---------------------------------------------------------------------------
8145 // Transform initial VPlan: Apply previously taken decisions, in order, to
8146 // bring the VPlan to its final state.
8147 // ---------------------------------------------------------------------------
8148
8149 addReductionResultComputation(Plan, RecipeBuilder, MinVF: Range.Start);
8150
8151 // Optimize FindIV reductions to use sentinel-based approach when possible.
8152 RUN_VPLAN_PASS(VPlanTransforms::optimizeFindIVReductions, *Plan, PSE,
8153 *OrigLoop);
8154 VPlanTransforms::optimizeInductionLiveOutUsers(Plan&: *Plan, PSE,
8155 FoldTail: CM.foldTailByMasking());
8156
8157 // Apply mandatory transformation to handle reductions with multiple in-loop
8158 // uses if possible, bail out otherwise.
8159 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMultiUseReductions, *Plan, ORE,
8160 OrigLoop))
8161 return nullptr;
8162 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8163 // NaNs if possible, bail out otherwise.
8164 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMaxMinNumReductions, *Plan))
8165 return nullptr;
8166
8167 // Create whole-vector selects for find-last recurrences.
8168 if (!RUN_VPLAN_PASS(VPlanTransforms::handleFindLastReductions, *Plan))
8169 return nullptr;
8170
8171 // Create partial reduction recipes for scaled reductions and transform
8172 // recipes to abstract recipes if it is legal and beneficial and clamp the
8173 // range for better cost estimation.
8174 // TODO: Enable following transform when the EVL-version of extended-reduction
8175 // and mulacc-reduction are implemented.
8176 if (!CM.foldTailWithEVL()) {
8177 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
8178 OrigLoop);
8179 RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
8180 Range);
8181 RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
8182 Range);
8183 }
8184
8185 for (ElementCount VF : Range)
8186 Plan->addVF(VF);
8187 Plan->setName("Initial VPlan");
8188
8189 // Interleave memory: for each Interleave Group we marked earlier as relevant
8190 // for this VPlan, replace the Recipes widening its memory instructions with a
8191 // single VPInterleaveRecipe at its insertion point.
8192 RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
8193 InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
8194
8195 // Replace VPValues for known constant strides.
8196 RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
8197 Legal->getLAI()->getSymbolicStrides());
8198
8199 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8200 return Legal->blockNeedsPredication(BB);
8201 };
8202 RUN_VPLAN_PASS(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
8203 BlockNeedsPredication);
8204
8205 // Sink users of fixed-order recurrence past the recipe defining the previous
8206 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8207 if (!RUN_VPLAN_PASS(VPlanTransforms::adjustFixedOrderRecurrences, *Plan,
8208 Builder))
8209 return nullptr;
8210
8211 if (useActiveLaneMask(Style)) {
8212 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8213 // TailFoldingStyle is visible there.
8214 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8215 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow);
8216 }
8217
8218 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8219 return Plan;
8220}
8221
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Build the initial plan (VPlan0) directly from the loop, using the widest
  // induction type and the debug location of the primary induction (if any).
  auto Plan = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE);

  // Create header phi recipes for the inductions only; reductions,
  // fixed-order recurrences and in-loop reductions are passed as empty
  // containers on the VPlan-native path.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *Plan, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: MapVector<PHINode *, RecurrenceDescriptor>(),
      FixedOrderRecurrences: SmallPtrSet<const PHINode *, 1>(), InLoopReductions: SmallPtrSet<PHINode *, 1>(),
      /*AllowReordering=*/false);
  // Early exits must be handled, but only the countable-exit case is legal
  // here (asserted below).
  [[maybe_unused]] bool CanHandleExits = VPlanTransforms::handleEarlyExits(
      Plan&: *Plan, Style: UncountableExitStyle::NoUncountableExit, TheLoop: OrigLoop, PSE, DT&: *DT,
      AC: Legal->getAssumptionCache());
  assert(CanHandleExits &&
         "early-exits are not supported in VPlan-native path");
  VPlanTransforms::addMiddleCheck(Plan&: *Plan, /*TailFolded*/ false);

  VPlanTransforms::createLoopRegions(Plan&: *Plan);

  // Record every candidate VF in the range with this plan.
  for (ElementCount VF : Range)
    Plan->addVF(VF);

  // Bail out (no plan) if some VPInstruction cannot be converted to a widened
  // recipe.
  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(Plan&: *Plan, TLI: *TLI))
    return nullptr;

  // Optimize induction live-out users to use precomputed end values.
  VPlanTransforms::optimizeInductionLiveOutUsers(Plan&: *Plan, PSE,
                                                 /*FoldTail=*/false);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
8261
/// Add ComputeReductionResult/ComputeAnyOfResult computations to the middle
/// block for each reduction phi in the vector loop region of \p Plan, update
/// users of the exiting reduction value outside the vector region, and apply
/// the remaining per-reduction adjustments: selects for tail folding, the
/// AnyOf rewrite to a boolean reduction phi, narrowing to a smaller
/// reduction type, and ReductionStartVector start values.
void LoopVectorizationPlanner::addReductionResultComputation(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
  using namespace VPlanPatternMatch;
  VPTypeAnalysis TypeInfo(*Plan);
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
  SmallVector<VPRecipeBase *> ToDelete;
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  // Default insertion point: just before the second-to-last recipe of the
  // latch block (presumably ahead of the latch's final compare-and-branch
  // recipes — TODO confirm).
  Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  // Process each reduction phi in the header of the vector loop region.
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    // TODO: Remove check for constant incoming value once removeDeadRecipes is
    // used on VPlan0.
    if (!PhiR || isa<VPIRValue>(Val: PhiR->getOperand(N: 1)))
      continue;

    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
    Type *PhiTy = TypeInfo.inferScalarType(V: PhiR);
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
    auto *RR = dyn_cast<VPReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe());
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        (!RR || !RR->isPartialReduction())) {
      VPValue *Cond = vputils::findHeaderMask(Plan&: *Plan);
      NewExitingVPV =
          Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", Flags: *PhiR);
      // Only the reduction-result computations are redirected to the select;
      // other users keep the unmasked exiting value.
      OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
        using namespace VPlanPatternMatch;
        return match(
            U: &U, P: m_CombineOr(
                     L: m_VPInstruction<VPInstruction::ComputeAnyOfResult>(),
                     R: m_VPInstruction<VPInstruction::ComputeReductionResult>()));
      });

      if (CM.usePredicatedReductionSelect(RecurrenceKind))
        PhiR->setOperand(I: 1, New: NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    // Emit the result computation into the middle block; the guard restores
    // the previous insertion point at the end of this loop iteration.
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
    // For AnyOf reductions, find the select among PhiR's users. This is used
    // both to find NewVal for ComputeAnyOfResult and to adjust the reduction.
    VPRecipeBase *AnyOfSelect = nullptr;
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      AnyOfSelect = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
        return match(U, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()));
      }));
    }
    if (AnyOfSelect) {
      VPValue *Start = PhiR->getStartValue();
      // NewVal is the non-phi operand of the select.
      VPValue *NewVal = AnyOfSelect->getOperand(N: 1) == PhiR
                            ? AnyOfSelect->getOperand(N: 2)
                            : AnyOfSelect->getOperand(N: 1);
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                               Operands: {Start, NewVal, NewExitingVPV}, DL: ExitDL);
    } else {
      VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
                      PhiR->getFastMathFlags());
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                               Operands: {NewExitingVPV}, Flags, DL: ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      VPWidenCastRecipe *Trunc;
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      VPWidenCastRecipe *Extnd;
      // The trunc/extend pair is emitted directly after the recipe defining
      // the exiting value, inside its own insert-point scope.
      {
        VPBuilder::InsertPointGuard Guard(Builder);
        Builder.setInsertPoint(
            TheBB: NewExitingVPV->getDefiningRecipe()->getParent(),
            IP: std::next(x: NewExitingVPV->getDefiningRecipe()->getIterator()));
        Trunc =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op: NewExitingVPV, ResultTy: RdxTy);
        Extnd = Builder.createWidenCast(Opcode: ExtendOpc, Op: Trunc, ResultTy: PhiTy);
      }
      if (PhiR->getOperand(N: 1) == NewExitingVPV)
        PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result. Operand 0 provides the values to be reduced.
      FinalReductionResult->setOperand(I: 0, New: Trunc);
      FinalReductionResult =
          Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
    }

    // Update all users outside the vector region. Also replace redundant
    // extracts.
    for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
      // Skip users inside the vector region (Parent->getParent() non-null).
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      // Skip FindIV reduction chain recipes (ComputeReductionResult, icmp).
      if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind) &&
          match(U, P: m_CombineOr(
                       L: m_VPInstruction<VPInstruction::ComputeReductionResult>(),
                       R: m_VPInstruction<Instruction::ICmp>())))
        continue;
      U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);

      // Look through ExtractLastPart.
      if (match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
        U = cast<VPInstruction>(Val: U)->getSingleUser();

      // Extracts of the reduction result are redundant: forward the result
      // directly to their users.
      if (match(U, P: m_CombineOr(L: m_ExtractLane(Op0: m_VPValue(), Op1: m_VPValue()),
                                R: m_ExtractLastLane(Op0: m_VPValue()))))
        cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
    if (AnyOfSelect) {
      VPValue *Cmp = AnyOfSelect->getOperand(N: 0);
      // If the compare is checking the reduction PHI node, adjust it to check
      // the start value.
      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
        CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
      Builder.setInsertPoint(AnyOfSelect);

      // If the true value of the select is the reduction phi, the new value is
      // selected if the negated condition is true in any iteration.
      if (AnyOfSelect->getOperand(N: 1) == PhiR)
        Cmp = Builder.createNot(Operand: Cmp);
      VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
      AnyOfSelect->getVPSingleValue()->replaceAllUsesWith(New: Or);
      // Delete AnyOfSelect now that it has invalid types.
      ToDelete.push_back(Elt: AnyOfSelect);

      // Convert the reduction phi to operate on bools.
      PhiR->setOperand(I: 0, New: Plan->getFalse());
      continue;
    }

    // For the remaining plain reductions (not AnyOf/FindIV/MinMax/FindLast),
    // materialize the start value as a ReductionStartVector in the preheader,
    // seeded with the recurrence's identity element.
    RecurKind RK = PhiR->getRecurrenceKind();
    if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RK))) {
      VPBuilder PHBuilder(Plan->getVectorPreheader());
      VPValue *Iden = Plan->getOrAddLiveIn(
          V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: PhiR->getFastMathFlags()));
      auto *ScaleFactorVPV = Plan->getConstantInt(BitWidth: 32, Val: 1);
      VPValue *StartV = PHBuilder.createNaryOp(
          Opcode: VPInstruction::ReductionStartVector,
          Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV}, Flags: *PhiR);
      PhiR->setOperand(I: 0, New: StartV);
    }
  }
  // Deletion is deferred to avoid invalidating the phi iteration above.
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();

  RUN_VPLAN_PASS(VPlanTransforms::clearReductionWrapFlags, *Plan);
}
8451
8452void LoopVectorizationPlanner::attachRuntimeChecks(
8453 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8454 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8455 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(N: 0)) {
8456 assert((!CM.OptForSize ||
8457 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8458 "Cannot SCEV check stride or overflow when optimizing for size");
8459 VPlanTransforms::attachCheckBlock(Plan, Cond: SCEVCheckCond, CheckBlock: SCEVCheckBlock,
8460 AddBranchWeights: HasBranchWeights);
8461 }
8462 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8463 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(N: 0)) {
8464 // VPlan-native path does not do any analysis for runtime checks
8465 // currently.
8466 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8467 "Runtime checks are not supported for outer loops yet");
8468
8469 if (CM.OptForSize) {
8470 assert(
8471 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8472 "Cannot emit memory checks when optimizing for size, unless forced "
8473 "to vectorize.");
8474 ORE->emit(RemarkBuilder: [&]() {
8475 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8476 OrigLoop->getStartLoc(),
8477 OrigLoop->getHeader())
8478 << "Code-size may be reduced by not forcing "
8479 "vectorization, or by source-code modifications "
8480 "eliminating the need for runtime checks "
8481 "(e.g., adding 'restrict').";
8482 });
8483 }
8484 VPlanTransforms::attachCheckBlock(Plan, Cond: MemCheckCond, CheckBlock: MemCheckBlock,
8485 AddBranchWeights: HasBranchWeights);
8486 }
8487}
8488
8489void LoopVectorizationPlanner::addMinimumIterationCheck(
8490 VPlan &Plan, ElementCount VF, unsigned UF,
8491 ElementCount MinProfitableTripCount) const {
8492 const uint32_t *BranchWeights =
8493 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())
8494 ? &MinItersBypassWeights[0]
8495 : nullptr;
8496 VPlanTransforms::addMinimumIterationCheck(
8497 Plan, VF, UF, MinProfitableTripCount,
8498 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()), TailFolded: CM.foldTailByMasking(),
8499 OrigLoop, MinItersBypassWeights: BranchWeights,
8500 DL: OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
8501}
8502
8503// Determine how to lower the scalar epilogue, which depends on 1) optimising
8504// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8505// predication, and 4) a TTI hook that analyses whether the loop is suitable
8506// for predication.
8507static ScalarEpilogueLowering getScalarEpilogueLowering(
8508 Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
8509 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8510 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
8511 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8512 // don't look at hints or options, and don't request a scalar epilogue.
8513 if (F->hasOptSize() ||
8514 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8515 return CM_ScalarEpilogueNotAllowedOptSize;
8516
8517 // 2) If set, obey the directives
8518 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8519 switch (PreferPredicateOverEpilogue) {
8520 case PreferPredicateTy::ScalarEpilogue:
8521 return CM_ScalarEpilogueAllowed;
8522 case PreferPredicateTy::PredicateElseScalarEpilogue:
8523 return CM_ScalarEpilogueNotNeededUsePredicate;
8524 case PreferPredicateTy::PredicateOrDontVectorize:
8525 return CM_ScalarEpilogueNotAllowedUsePredicate;
8526 };
8527 }
8528
8529 // 3) If set, obey the hints
8530 switch (Hints.getPredicate()) {
8531 case LoopVectorizeHints::FK_Enabled:
8532 return CM_ScalarEpilogueNotNeededUsePredicate;
8533 case LoopVectorizeHints::FK_Disabled:
8534 return CM_ScalarEpilogueAllowed;
8535 };
8536
8537 // 4) if the TTI hook indicates this is profitable, request predication.
8538 TailFoldingInfo TFI(TLI, &LVL, IAI);
8539 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
8540 return CM_ScalarEpilogueNotNeededUsePredicate;
8541
8542 return CM_ScalarEpilogueAllowed;
8543}
8544
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE,
    std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
    LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {

  // A computable trip count is a hard requirement on this path.
  if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL&: *LVL, IAI: &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                GetBFI, F, &Hints, IAI, OptForSize);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);

  // Code generation happens in a nested scope — presumably so Checks/LB are
  // destroyed before the remark and verification below; confirm before
  // relying on this.
  {
    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
    // Outer-loop vectorization always uses an unroll factor of 1 here.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                           Checks, BestPlan);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, /*UF=*/1,
                                 MinProfitableTripCount: VF.MinProfitableTripCount);
    bool HasBranchWeights =
        hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
    LVP.attachRuntimeChecks(Plan&: BestPlan, RTChecks&: Checks, HasBranchWeights);

    LVP.executePlan(BestVF: VF.Width, /*UF=*/BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT);
  }

  // Report the successful vectorization (interleave count 1).
  reportVectorization(ORE, TheLoop: L, VF, IC: 1);

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
8612
8613// Emit a remark if there are stores to floats that required a floating point
8614// extension. If the vectorized loop was generated with floating point there
8615// will be a performance penalty from the conversion overhead and the change in
8616// the vector width.
8617static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
8618 SmallVector<Instruction *, 4> Worklist;
8619 for (BasicBlock *BB : L->getBlocks()) {
8620 for (Instruction &Inst : *BB) {
8621 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
8622 if (S->getValueOperand()->getType()->isFloatTy())
8623 Worklist.push_back(Elt: S);
8624 }
8625 }
8626 }
8627
8628 // Traverse the floating point stores upwards searching, for floating point
8629 // conversions.
8630 SmallPtrSet<const Instruction *, 4> Visited;
8631 SmallPtrSet<const Instruction *, 4> EmittedRemark;
8632 while (!Worklist.empty()) {
8633 auto *I = Worklist.pop_back_val();
8634 if (!L->contains(Inst: I))
8635 continue;
8636 if (!Visited.insert(Ptr: I).second)
8637 continue;
8638
8639 // Emit a remark if the floating point store required a floating
8640 // point conversion.
8641 // TODO: More work could be done to identify the root cause such as a
8642 // constant or a function return type and point the user to it.
8643 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
8644 ORE->emit(RemarkBuilder: [&]() {
8645 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
8646 I->getDebugLoc(), L->getHeader())
8647 << "floating point conversion changes vector width. "
8648 << "Mixed floating point precision requires an up/down "
8649 << "cast that will negatively impact performance.";
8650 });
8651
8652 for (Use &Op : I->operands())
8653 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
8654 Worklist.push_back(Elt: OpI);
8655 }
8656}
8657
8658/// For loops with uncountable early exits, find the cost of doing work when
8659/// exiting the loop early, such as calculating the final exit values of
8660/// variables used outside the loop.
8661/// TODO: This is currently overly pessimistic because the loop may not take
8662/// the early exit, but better to keep this conservative for now. In future,
8663/// it might be possible to relax this by using branch probabilities.
8664static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
8665 VPlan &Plan, ElementCount VF) {
8666 InstructionCost Cost = 0;
8667 for (auto *ExitVPBB : Plan.getExitBlocks()) {
8668 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
8669 // If the predecessor is not the middle.block, then it must be the
8670 // vector.early.exit block, which may contain work to calculate the exit
8671 // values of variables used outside the loop.
8672 if (PredVPBB != Plan.getMiddleBlock()) {
8673 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
8674 << PredVPBB->getName() << ":\n");
8675 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
8676 }
8677 }
8678 }
8679 return Cost;
8680}
8681
/// This function determines whether or not it's still profitable to vectorize
/// the loop given the extra work we have to do outside of the loop:
/// 1. Perform the runtime checks before entering the loop to ensure it's safe
///    to vectorize.
/// 2. In the case of loops with uncountable early exits, we may have to do
///    extra work when exiting the loop early, such as calculating the final
///    exit values of variables used outside the loop.
/// 3. The middle block.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
                                        VectorizationFactor &VF, Loop *L,
                                        PredicatedScalarEvolution &PSE,
                                        VPCostContext &CostCtx, VPlan &Plan,
                                        ScalarEpilogueLowering SEL,
                                        std::optional<unsigned> VScale) {
  // An invalid runtime-check cost means the checks cannot be costed: give up.
  InstructionCost RtC = Checks.getCost();
  if (!RtC.isValid())
    return false;

  // When interleaving only scalar and vector cost will be equal, which in turn
  // would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    // TODO: Should we rename VectorizeMemoryCheckThreshold?
    if (RtC > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  InstructionCost TotalCost = RtC;
  // Add on the cost of any work required in the vector early exit block, if
  // one exists.
  TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
  TotalCost += Plan.getMiddleBlock()->cost(VF: VF.Width, Ctx&: CostCtx);

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  // The total cost of the scalar loop is
  //   ScalarC * TC
  // where
  // * TC is the actual trip count of the loop.
  // * ScalarC is the cost of a single scalar iteration.
  //
  // The total cost of the vector loop is
  //   TotalCost + VecC * (TC / VF) + EpiC
  // where
  // * TotalCost is the sum of the costs cost of
  //   - the generated runtime checks, i.e. RtC
  //   - performing any additional work in the vector.early.exit block for
  //     loops with uncountable early exits.
  //   - the middle block, if ExpectedTC <= VF.Width.
  // * VecC is the cost of a single vector iteration.
  // * TC is the actual trip count of the loop
  // * VF is the vectorization factor
  // * EpiCost is the cost of the generated epilogue, including the cost
  //   of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = estimateElementCount(VF: VF.Width, VScale);
  // Div == 0 happens when the per-VF vector cost equals ScalarC * VF; in that
  // case this bound imposes no minimum trip count (and avoids a divide by 0).
  uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
  uint64_t MinTC1 =
      Div == 0 ? 0 : divideCeil(Numerator: TotalCost.getValue() * IntVF, Denominator: Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  // Here X = 10, i.e. the runtime checks are bounded to ~10% of the scalar
  // loop cost.
  uint64_t MinTC2 = divideCeil(Numerator: RtC.getValue() * 10, Denominator: ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(Value: MinTC, Align: IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
    if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}
8797
// Pass construction: interleaving (resp. vectorization) is restricted to
// explicitly requested loops either via the corresponding pass option or
// when interleaving/vectorization is disabled globally via
// EnableLoopInterleaving/EnableLoopVectorization.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
8803
/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
/// vectorization: freeze potentially undef/poison FindIV reduction start
/// values (in both \p MainPlan and \p EpiPlan), make sure the scalar
/// preheader of \p MainPlan has a resume phi for the canonical IV as its
/// first phi, and return ResumeForEpilogue instructions for the scalar-header
/// phis so they can be used as resume values when vectorizing the epilogue.
static SmallVector<VPInstruction *>
preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
  using namespace VPlanPatternMatch;
  // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
  // introduce multiple uses of undef/poison. If the reduction start value may
  // be undef or poison it needs to be frozen and the frozen start has to be
  // used when computing the reduction result. We also need to use the frozen
  // value in the resume phi generated by the main vector loop, as this is also
  // used to compute the reduction result after the epilogue vector loop.
  auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
                                             bool UpdateResumePhis) {
    VPBuilder Builder(Plan.getEntry());
    // Scan the middle block for FindIV reduction results whose start value
    // cannot be proven free of undef/poison.
    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
      auto *VPI = dyn_cast<VPInstruction>(Val: &R);
      if (!VPI)
        continue;
      VPValue *OrigStart;
      if (!matchFindIVResult(VPI, ReducedIV: m_VPValue(), Start: m_VPValue(V&: OrigStart)))
        continue;
      if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
        continue;
      VPInstruction *Freeze =
          Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, DL: {}, Name: "fr");
      // Replace the start operand (assumed to live at operand index 2 of the
      // FindIV result matched above — TODO confirm) with the frozen value.
      VPI->setOperand(I: 2, New: Freeze);
      // For the main plan, also route the frozen value into any resume phis,
      // so the scalar/epilogue loops observe the same frozen start.
      if (UpdateResumePhis)
        OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
          return Freeze != &U && isa<VPPhi>(Val: &U);
        });
    }
  };
  AddFreezeForFindLastIVReductions(MainPlan, true);
  AddFreezeForFindLastIVReductions(EpiPlan, false);

  // Extract the vector trip count from the BranchOnCount terminating the main
  // vector loop region.
  VPValue *VectorTC = nullptr;
  auto *Term =
      MainPlan.getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  [[maybe_unused]] bool MatchedTC =
      match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue(V&: VectorTC)));
  assert(MatchedTC && "must match vector trip count");

  // If there is a suitable resume value for the canonical induction in the
  // scalar (which will become vector) epilogue loop, use it and move it to the
  // beginning of the scalar preheader. Otherwise create it below.
  VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
  auto ResumePhiIter =
      find_if(Range: MainScalarPH->phis(), P: [VectorTC](VPRecipeBase &R) {
        return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Ops: m_Specific(VPV: VectorTC),
                                                            Ops: m_ZeroInt()));
      });
  VPPhi *ResumePhi = nullptr;
  if (ResumePhiIter == MainScalarPH->phis().end()) {
    using namespace llvm::VPlanPatternMatch;
    // The phi created below hard-codes 0 as the second incoming value, which
    // is only correct if the canonical IV starts at 0.
    assert(
        match(MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue(),
              m_ZeroInt()) &&
        "canonical IV must start at 0");
    Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(V: VectorTC);
    VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
    ResumePhi = ScalarPHBuilder.createScalarPhi(
        IncomingValues: {VectorTC, MainPlan.getZero(Ty)}, DL: {}, Name: "vec.epilog.resume.val");
  } else {
    ResumePhi = cast<VPPhi>(Val: &*ResumePhiIter);
    ResumePhi->setName("vec.epilog.resume.val");
    // Move the resume phi to the front: the epilogue preparation expects it
    // to be the first phi in the loop preheader.
    if (&MainScalarPH->front() != ResumePhi)
      ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->begin());
  }

  // Create a ResumeForEpilogue for the canonical IV resume as the
  // first non-phi, to keep it alive for the epilogue.
  VPBuilder ResumeBuilder(MainScalarPH);
  ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue, Operands: ResumePhi);

  // Create ResumeForEpilogue instructions for the resume phis of the
  // VPIRPhis in the scalar header of the main plan and return them so they can
  // be used as resume values when vectorizing the epilogue.
  return to_vector(
      Range: map_range(C: MainPlan.getScalarHeader()->phis(), F: [&](VPRecipeBase &R) {
        assert(isa<VPIRPhi>(R) &&
               "only VPIRPhis expected in the scalar header");
        return ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue,
                                          Operands: R.getOperand(N: 0));
      }));
}
8889
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
/// reductions require creating new instructions to compute the resume values.
/// They are collected in a vector and returned. They must be moved to the
/// preheader of the vector epilogue loop, after they are created by the
/// execution of \p Plan.
static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
    VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
    EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
    ScalarEvolution &SE) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
  Header->setName("vec.epilog.vector.body");

  VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
  // When vectorizing the epilogue loop, the canonical induction needs to start
  // at the resume value from the main vector loop. Find the resume value
  // created during execution of the main VPlan. It must be the first phi in the
  // loop preheader. Add this resume value as an offset to the canonical IV of
  // the epilogue loop.
  using namespace llvm::PatternMatch;
  PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
  // The resume phi merges 0 (coming from checks that bypass the main loop)
  // with the main loop's vector trip count; record the latter in EPI.
  for (Value *Inc : EPResumeVal->incoming_values()) {
    if (match(V: Inc, P: m_SpecificInt(V: 0)))
      continue;
    assert(!EPI.VectorTripCount &&
           "Must only have a single non-zero incoming value");
    EPI.VectorTripCount = Inc;
  }
  // If we didn't find a non-zero vector trip count, all incoming values
  // must be zero, which also means the vector trip count is zero. Pick the
  // first zero as vector trip count.
  // TODO: We should not choose VF * UF so the main vector loop is known to
  // be dead.
  if (!EPI.VectorTripCount) {
    assert(EPResumeVal->getNumIncomingValues() > 0 &&
           all_of(EPResumeVal->incoming_values(),
                  [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
           "all incoming values must be 0");
    EPI.VectorTripCount = EPResumeVal->getOperand(i_nocapture: 0);
  }
  VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
  assert(all_of(IV->users(),
                [](const VPUser *U) {
                  return isa<VPScalarIVStepsRecipe>(U) ||
                         isa<VPDerivedIVRecipe>(U) ||
                         cast<VPRecipeBase>(U)->isScalarCast() ||
                         cast<VPInstruction>(U)->getOpcode() ==
                             Instruction::Add;
                }) &&
        "the canonical IV should only be used by its increment or "
        "ScalarIVSteps when resetting the start value");
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  // Offset the canonical IV by the main loop's resume value...
  VPInstruction *Add = Builder.createAdd(LHS: IV, RHS: VPV);
  // Replace all users of the canonical IV and its increment with the offset
  // version, except for the Add itself and the canonical IV increment.
  auto *Increment = cast<VPInstruction>(Val: IV->getBackedgeValue());
  IV->replaceUsesWithIf(New: Add, ShouldReplace: [Add, Increment](VPUser &U, unsigned) {
    return &U != Add && &U != Increment;
  });
  // ...and do the same for the IV increment.
  VPInstruction *OffsetIVInc =
      VPBuilder::getToInsertAfter(R: Increment).createAdd(LHS: Increment, RHS: VPV);
  Increment->replaceUsesWithIf(New: OffsetIVInc,
                               ShouldReplace: [IV](VPUser &U, unsigned) { return &U != IV; });
  // The blanket replacement above also rewrote OffsetIVInc's own operand;
  // restore it so the offset increment still consumes the plain increment.
  OffsetIVInc->setOperand(I: 0, New: Increment);

  // Maps original start values to their frozen counterparts created in the
  // main plan, so Freeze VPInstructions in the epilogue can re-use them below.
  DenseMap<Value *, Value *> ToFrozen;
  // Instructions created here in IR that must later be moved to the epilogue
  // vector preheader (returned to the caller).
  SmallVector<Instruction *> InstsToMove;
  // Ensure that the start values for all header phi recipes are updated before
  // vectorizing the epilogue loop. Skip the canonical IV, which has been
  // handled above.
  for (VPRecipeBase &R : drop_begin(RangeOrContainer: Header->phis())) {
    Value *ResumeV = nullptr;
    // TODO: Move setting of resume values to prepareToExecute.
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
      // Find the reduction result by searching users of the phi or its backedge
      // value.
      auto IsReductionResult = [](VPRecipeBase *R) {
        auto *VPI = dyn_cast<VPInstruction>(Val: R);
        if (!VPI)
          return false;
        return VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
               VPI->getOpcode() == VPInstruction::ComputeReductionResult;
      };
      auto *RdxResult = cast<VPInstruction>(
          Val: vputils::findRecipe(Start: ReductionPhi->getBackedgeValue(), Pred: IsReductionResult));
      assert(RdxResult && "expected to find reduction result");

      // The resume value is the main loop's reduction result, flowing into
      // the scalar loop's phi via the (current) loop preheader.
      ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
                    ->getIncomingValueForBlock(BB: L->getLoopPreheader());

      // Check for FindIV pattern by looking for icmp user of RdxResult.
      // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
      using namespace VPlanPatternMatch;
      VPValue *SentinelVPV = nullptr;
      bool IsFindIV = any_of(Range: RdxResult->users(), P: [&](VPUser *U) {
        return match(U, P: VPlanPatternMatch::m_SpecificICmp(
                            MatchPred: ICmpInst::ICMP_NE, Op0: m_Specific(VPV: RdxResult),
                            Op1: m_VPValue(V&: SentinelVPV)));
      });

      if (RdxResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
        Value *StartV = RdxResult->getOperand(N: 0)->getLiveInIRValue();
        // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
        // start value; compare the final value from the main vector loop
        // to the start value.
        BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
        ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else if (IsFindIV) {
        assert(SentinelVPV && "expected to find icmp using RdxResult");

        // Get the frozen start value from the main loop.
        Value *FrozenStartV = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
            BB: EPI.MainLoopIterationCountCheck);
        // Remember the original->frozen mapping so Freeze instructions in the
        // epilogue plan entry can be replaced with the same frozen value.
        if (auto *FreezeI = dyn_cast<FreezeInst>(Val: FrozenStartV))
          ToFrozen[FreezeI->getOperand(i_nocapture: 0)] = FrozenStartV;

        // Adjust resume: select(icmp eq ResumeV, FrozenStartV), Sentinel,
        // ResumeV
        BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
        Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: FrozenStartV);
        if (auto *I = dyn_cast<Instruction>(Val: Cmp))
          InstsToMove.push_back(Elt: I);
        ResumeV =
            Builder.CreateSelect(C: Cmp, True: SentinelVPV->getLiveInIRValue(), False: ResumeV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else {
        VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
        // NOTE(review): PhiR re-derives ReductionPhi from &R.
        auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
        if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
          assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
                 "unexpected start value");
          // Partial sub-reductions always start at 0 and account for the
          // reduction start value in a final subtraction. Update it to use the
          // resume value from the main vector loop.
          if (PhiR->getVFScaleFactor() > 1 &&
              PhiR->getRecurrenceKind() == RecurKind::Sub) {
            auto *Sub = cast<VPInstruction>(Val: RdxResult->getSingleUser());
            assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode");
            assert(isa<VPIRValue>(Sub->getOperand(0)) &&
                   "Expected operand to match the original start value of the "
                   "reduction");
            assert(VPlanPatternMatch::match(VPI->getOperand(0),
                                            VPlanPatternMatch::m_ZeroInt()) &&
                   "Expected start value for partial sub-reduction to start at "
                   "zero");
            Sub->setOperand(I: 0, New: StartVal);
          } else
            VPI->setOperand(I: 0, New: StartVal);
          // Start value is consumed via the ReductionStartVector; the phi's
          // start value itself must not be overwritten below.
          continue;
        }
      }
    } else {
      // Retrieve the induction resume values for wide inductions from
      // their original phi nodes in the scalar loop.
      PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
      // Hook up to the PHINode generated by a ResumePhi recipe of main
      // loop VPlan, which feeds the scalar loop.
      ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
    }
    assert(ResumeV && "Must have a resume value");
    VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
    cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
  }

  // For some VPValues in the epilogue plan we must re-use the generated IR
  // values from the main plan. Replace them with live-in VPValues.
  // TODO: This is a workaround needed for epilogue vectorization and it
  // should be removed once induction resume value creation is done
  // directly in VPlan.
  for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
    // Re-use frozen values from the main plan for Freeze VPInstructions in the
    // epilogue plan. This ensures all users use the same frozen value.
    auto *VPI = dyn_cast<VPInstruction>(Val: &R);
    if (VPI && VPI->getOpcode() == Instruction::Freeze) {
      VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
          V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
      continue;
    }

    // Re-use the trip count and steps expanded for the main loop, as
    // skeleton creation needs it as a value that dominates both the scalar
    // and vector epilogue loops
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpandR)
      continue;
    VPValue *ExpandedVal =
        Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
    ExpandR->replaceAllUsesWith(New: ExpandedVal);
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(NewTripCount: ExpandedVal);
    ExpandR->eraseFromParent();
  }

  // Estimate the per-iteration element counts of main and epilogue loops to
  // size the minimum-iteration check guarding the epilogue vector loop.
  auto VScale = CM.getVScaleForTuning();
  unsigned MainLoopStep =
      estimateElementCount(VF: EPI.MainLoopVF * EPI.MainLoopUF, VScale);
  unsigned EpilogueLoopStep =
      estimateElementCount(VF: EPI.EpilogueVF * EPI.EpilogueUF, VScale);
  VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
      Plan, VectorTripCount: EPI.VectorTripCount,
      RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()), EpilogueVF: EPI.EpilogueVF,
      EpilogueUF: EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);

  return InstsToMove;
}
9101
/// Fix up resume phis to account for the additional bypass block \p
/// BypassBlock after epilogue vectorization: give every phi in the scalar
/// preheader of \p L an incoming value for each predecessor that is missing
/// one (re-using the value incoming from \p BypassBlock), and make the
/// epilogue resume phis take the main loop's resume values (\p ResumeValues)
/// for the edge from \p BypassBlock. \p ResumeValues correspond 1:1 with the
/// phis in the scalar header of \p BestEpiPlan.
static void
fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
                                VPlan &BestEpiPlan,
                                ArrayRef<VPInstruction *> ResumeValues) {
  // Fix resume values from the additional bypass block.
  BasicBlock *PH = L->getLoopPreheader();
  for (auto *Pred : predecessors(BB: PH)) {
    for (PHINode &Phi : PH->phis()) {
      // Skip predecessors for which the phi already has an incoming value.
      if (Phi.getBasicBlockIndex(BB: Pred) != -1)
        continue;
      Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
    }
  }
  auto *ScalarPH = cast<VPIRBasicBlock>(Val: BestEpiPlan.getScalarPreheader());
  if (ScalarPH->hasPredecessors()) {
    // Fix resume values for inductions and reductions from the additional
    // bypass block using the incoming values from the main loop's resume phis.
    // ResumeValues correspond 1:1 with the scalar loop header phis.
    for (auto [ResumeV, HeaderPhi] :
         zip(t&: ResumeValues, u: BestEpiPlan.getScalarHeader()->phis())) {
      auto *HeaderPhiR = cast<VPIRPhi>(Val: &HeaderPhi);
      auto *EpiResumePhi =
          cast<PHINode>(Val: HeaderPhiR->getIRPhi().getIncomingValueForBlock(BB: PH));
      // Nothing to update if this resume phi has no edge from the bypass
      // block.
      if (EpiResumePhi->getBasicBlockIndex(BB: BypassBlock) == -1)
        continue;
      auto *MainResumePhi = cast<PHINode>(Val: ResumeV->getUnderlyingValue());
      EpiResumePhi->setIncomingValueForBlock(
          BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
    }
  }
}
9133
/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
/// loop, after both plans have executed, updating branches from the iteration
/// and runtime checks of the main loop, as well as updating various phis. \p
/// InstsToMove contains instructions that need to be moved to the preheader of
/// the epilogue vector loop. The dominator tree \p DT is kept up to date for
/// every CFG edge that is redirected.
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
                                      EpilogueLoopVectorizationInfo &EPI,
                                      DominatorTree *DT,
                                      GeneratedRTChecks &Checks,
                                      ArrayRef<Instruction *> InstsToMove,
                                      ArrayRef<VPInstruction *> ResumeValues) {
  BasicBlock *VecEpilogueIterationCountCheck =
      cast<VPIRBasicBlock>(Val: EpiPlan.getEntry())->getIRBasicBlock();

  // The epilogue vector preheader is the false successor of the epilogue
  // iteration count check's conditional branch.
  BasicBlock *VecEpiloguePreHeader =
      cast<CondBrInst>(Val: VecEpilogueIterationCountCheck->getTerminator())
          ->getSuccessor(i: 1);
  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  // The main loop's count check now branches directly to the epilogue vector
  // preheader, skipping the epilogue iteration count check.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: VecEpiloguePreHeader);

  DTU.applyUpdates(Updates: {{DominatorTree::Delete, EPI.MainLoopIterationCountCheck,
                     VecEpilogueIterationCountCheck},
                    {DominatorTree::Insert, EPI.MainLoopIterationCountCheck,
                     VecEpiloguePreHeader}});

  // The epilogue iteration count check (and any runtime check blocks below)
  // instead branch to the scalar preheader when the epilogue is skipped.
  BasicBlock *ScalarPH =
      cast<VPIRBasicBlock>(Val: EpiPlan.getScalarPreheader())->getIRBasicBlock();
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: ScalarPH);
  DTU.applyUpdates(
      Updates: {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck,
        VecEpilogueIterationCountCheck},
       {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}});

  // Adjust the terminators of runtime check blocks and phis using them.
  BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
  BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
  if (SCEVCheckBlock) {
    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(Updates: {{DominatorTree::Delete, SCEVCheckBlock,
                       VecEpilogueIterationCountCheck},
                      {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
  }
  if (MemCheckBlock) {
    MemCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(
        Updates: {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
         {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
  }

  // The vec.epilog.iter.check block may contain Phi nodes from inductions
  // or reductions which merge control-flow from the latch block and the
  // middle block. Update the incoming values here and move the Phi into the
  // preheader.
  SmallVector<PHINode *, 4> PhisInBlock(
      llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(InsertPos: VecEpiloguePreHeader->getFirstNonPHIIt());
    // The moved phis now receive their values via the count check block
    // rather than its single predecessor.
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the
    // incoming value and also those from other check blocks. This is needed
    // for reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (SCEVCheckBlock)
      Phi->removeIncomingValue(BB: SCEVCheckBlock);
    if (MemCheckBlock)
      Phi->removeIncomingValue(BB: MemCheckBlock);
  }

  // Place the resume-computation instructions created while preparing the
  // epilogue plan right after the phis in the epilogue vector preheader.
  auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
  for (auto *I : InstsToMove)
    I->moveBefore(InsertPos: IP);

  // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
  // after executing the main loop. We need to update the resume values of
  // inductions and reductions during epilogue vectorization.
  fixScalarResumeValuesFromBypass(BypassBlock: VecEpilogueIterationCountCheck, L, BestEpiPlan&: EpiPlan,
                                  ResumeValues);

  // Remove dead phis that were moved to the epilogue preheader but are unused
  // (e.g., resume phis for inductions not widened in the epilogue vector loop).
  for (PHINode &Phi : make_early_inc_range(Range: VecEpiloguePreHeader->phis()))
    if (Phi.use_empty())
      Phi.eraseFromParent();
}
9235
9236bool LoopVectorizePass::processLoop(Loop *L) {
9237 assert((EnableVPlanNativePath || L->isInnermost()) &&
9238 "VPlan-native path is not enabled. Only process inner loops.");
9239
9240 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9241 << L->getHeader()->getParent()->getName() << "' from "
9242 << L->getLocStr() << "\n");
9243
9244 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9245
9246 LLVM_DEBUG(
9247 dbgs() << "LV: Loop hints:"
9248 << " force="
9249 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9250 ? "disabled"
9251 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9252 ? "enabled"
9253 : "?"))
9254 << " width=" << Hints.getWidth()
9255 << " interleave=" << Hints.getInterleave() << "\n");
9256
9257 // Function containing loop
9258 Function *F = L->getHeader()->getParent();
9259
9260 // Looking at the diagnostic output is the only way to determine if a loop
9261 // was vectorized (other than looking at the IR or machine code), so it
9262 // is important to generate an optimization remark for each loop. Most of
9263 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9264 // generated as OptimizationRemark and OptimizationRemarkMissed are
9265 // less verbose reporting vectorized loops and unvectorized loops that may
9266 // benefit from vectorization, respectively.
9267
9268 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9269 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9270 return false;
9271 }
9272
9273 PredicatedScalarEvolution PSE(*SE, *L);
9274
9275 // Query this against the original loop and save it here because the profile
9276 // of the original loop header may change as the transformation happens.
9277 bool OptForSize = llvm::shouldOptimizeForSize(
9278 BB: L->getHeader(), PSI,
9279 BFI: PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
9280 QueryType: PGSOQueryType::IRPass);
9281
9282 // Check if it is legal to vectorize the loop.
9283 LoopVectorizationRequirements Requirements;
9284 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9285 &Requirements, &Hints, DB, AC,
9286 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
9287 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9288 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9289 Hints.emitRemarkWithHints();
9290 return false;
9291 }
9292
9293 if (LVL.hasUncountableEarlyExit()) {
9294 if (!EnableEarlyExitVectorization) {
9295 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
9296 "early exit is not enabled",
9297 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
9298 return false;
9299 }
9300 }
9301
9302 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9303 // here. They may require CFG and instruction level transformations before
9304 // even evaluating whether vectorization is profitable. Since we cannot modify
9305 // the incoming IR, we need to build VPlan upfront in the vectorization
9306 // pipeline.
9307 if (!L->isInnermost())
9308 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9309 ORE, GetBFI, OptForSize, Hints,
9310 Requirements);
9311
9312 assert(L->isInnermost() && "Inner loop expected.");
9313
9314 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9315 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9316
9317 // If an override option has been passed in for interleaved accesses, use it.
9318 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9319 UseInterleaved = EnableInterleavedMemAccesses;
9320
9321 // Analyze interleaved memory accesses.
9322 if (UseInterleaved)
9323 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9324
9325 if (LVL.hasUncountableEarlyExit()) {
9326 BasicBlock *LoopLatch = L->getLoopLatch();
9327 if (IAI.requiresScalarEpilogue() ||
9328 any_of(Range: LVL.getCountableExitingBlocks(),
9329 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9330 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
9331 "requiring a scalar epilogue is unsupported",
9332 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
9333 return false;
9334 }
9335 }
9336
9337 // Check the function attributes and profiles to find out if this function
9338 // should be optimized for size.
9339 ScalarEpilogueLowering SEL =
9340 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, IAI: &IAI);
9341
9342 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9343 // count by optimizing for size, to minimize overheads.
9344 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9345 if (ExpectedTC && ExpectedTC->isFixed() &&
9346 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9347 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9348 << "This loop is worth vectorizing only if no scalar "
9349 << "iteration overheads are incurred.");
9350 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9351 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9352 else {
9353 LLVM_DEBUG(dbgs() << "\n");
9354 // Predicate tail-folded loops are efficient even when the loop
9355 // iteration count is low. However, setting the epilogue policy to
9356 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9357 // with runtime checks. It's more effective to let
9358 // `isOutsideLoopWorkProfitable` determine if vectorization is
9359 // beneficial for the loop.
9360 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9361 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9362 }
9363 }
9364
9365 // Check the function attributes to see if implicit floats or vectors are
9366 // allowed.
9367 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
9368 reportVectorizationFailure(
9369 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
9370 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
9371 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
9372 Hints.emitRemarkWithHints();
9373 return false;
9374 }
9375
9376 // Check if the target supports potentially unsafe FP vectorization.
9377 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9378 // for the target we're vectorizing for, to make sure none of the
9379 // additional fp-math flags can help.
9380 if (Hints.isPotentiallyUnsafe() &&
9381 TTI->isFPVectorizationPotentiallyUnsafe()) {
9382 reportVectorizationFailure(
9383 DebugMsg: "Potentially unsafe FP op prevents vectorization",
9384 OREMsg: "loop not vectorized due to unsafe FP support.",
9385 ORETag: "UnsafeFP", ORE, TheLoop: L);
9386 Hints.emitRemarkWithHints();
9387 return false;
9388 }
9389
9390 bool AllowOrderedReductions;
9391 // If the flag is set, use that instead and override the TTI behaviour.
9392 if (ForceOrderedReductions.getNumOccurrences() > 0)
9393 AllowOrderedReductions = ForceOrderedReductions;
9394 else
9395 AllowOrderedReductions = TTI->enableOrderedReductions();
9396 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
9397 ORE->emit(RemarkBuilder: [&]() {
9398 auto *ExactFPMathInst = Requirements.getExactFPInst();
9399 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9400 ExactFPMathInst->getDebugLoc(),
9401 ExactFPMathInst->getParent())
9402 << "loop not vectorized: cannot prove it is safe to reorder "
9403 "floating-point operations";
9404 });
9405 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9406 "reorder floating-point operations\n");
9407 Hints.emitRemarkWithHints();
9408 return false;
9409 }
9410
9411 // Use the cost model.
9412 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9413 GetBFI, F, &Hints, IAI, OptForSize);
9414 // Use the planner for vectorization.
9415 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9416 ORE);
9417
9418 // Get user vectorization factor and interleave count.
9419 ElementCount UserVF = Hints.getWidth();
9420 unsigned UserIC = Hints.getInterleave();
9421 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9422 UserIC = 1;
9423
9424 // Plan how to best vectorize.
9425 LVP.plan(UserVF, UserIC);
9426 VectorizationFactor VF = LVP.computeBestVF();
9427 unsigned IC = 1;
9428
9429 if (ORE->allowExtraAnalysis(LV_NAME))
9430 LVP.emitInvalidCostRemarks(ORE);
9431
9432 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
9433 if (LVP.hasPlanWithVF(VF: VF.Width)) {
9434 // Select the interleave count.
9435 IC = LVP.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
9436
9437 unsigned SelectedIC = std::max(a: IC, b: UserIC);
9438 // Optimistically generate runtime checks if they are needed. Drop them if
9439 // they turn out to not be profitable.
9440 if (VF.Width.isVector() || SelectedIC > 1) {
9441 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC,
9442 ORE&: *ORE);
9443
9444 // Bail out early if either the SCEV or memory runtime checks are known to
9445 // fail. In that case, the vector loop would never execute.
9446 using namespace llvm::PatternMatch;
9447 if (Checks.getSCEVChecks().first &&
9448 match(V: Checks.getSCEVChecks().first, P: m_One()))
9449 return false;
9450 if (Checks.getMemRuntimeChecks().first &&
9451 match(V: Checks.getMemRuntimeChecks().first, P: m_One()))
9452 return false;
9453 }
9454
9455 // Check if it is profitable to vectorize with runtime checks.
9456 bool ForceVectorization =
9457 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9458 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF: VF.Width), CM,
9459 CM.CostKind, CM.PSE, L);
9460 if (!ForceVectorization &&
9461 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9462 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
9463 VScale: CM.getVScaleForTuning())) {
9464 ORE->emit(RemarkBuilder: [&]() {
9465 return OptimizationRemarkAnalysisAliasing(
9466 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9467 L->getHeader())
9468 << "loop not vectorized: cannot prove it is safe to reorder "
9469 "memory operations";
9470 });
9471 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9472 Hints.emitRemarkWithHints();
9473 return false;
9474 }
9475 }
9476
9477 // Identify the diagnostic messages that should be produced.
9478 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9479 bool VectorizeLoop = true, InterleaveLoop = true;
9480 if (VF.Width.isScalar()) {
9481 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9482 VecDiagMsg = {
9483 "VectorizationNotBeneficial",
9484 "the cost-model indicates that vectorization is not beneficial"};
9485 VectorizeLoop = false;
9486 }
9487
9488 if (UserIC == 1 && Hints.getInterleave() > 1) {
9489 assert(!LVL.isSafeForAnyVectorWidth() &&
9490 "UserIC should only be ignored due to unsafe dependencies");
9491 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9492 IntDiagMsg = {"InterleavingUnsafe",
9493 "Ignoring user-specified interleave count due to possibly "
9494 "unsafe dependencies in the loop."};
9495 InterleaveLoop = false;
9496 } else if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
9497 // Tell the user interleaving was avoided up-front, despite being explicitly
9498 // requested.
9499 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9500 "interleaving should be avoided up front\n");
9501 IntDiagMsg = {"InterleavingAvoided",
9502 "Ignoring UserIC, because interleaving was avoided up front"};
9503 InterleaveLoop = false;
9504 } else if (IC == 1 && UserIC <= 1) {
9505 // Tell the user interleaving is not beneficial.
9506 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9507 IntDiagMsg = {
9508 "InterleavingNotBeneficial",
9509 "the cost-model indicates that interleaving is not beneficial"};
9510 InterleaveLoop = false;
9511 if (UserIC == 1) {
9512 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9513 IntDiagMsg.second +=
9514 " and is explicitly disabled or interleave count is set to 1";
9515 }
9516 } else if (IC > 1 && UserIC == 1) {
9517 // Tell the user interleaving is beneficial, but it explicitly disabled.
9518 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
9519 "disabled.\n");
9520 IntDiagMsg = {"InterleavingBeneficialButDisabled",
9521 "the cost-model indicates that interleaving is beneficial "
9522 "but is explicitly disabled or interleave count is set to 1"};
9523 InterleaveLoop = false;
9524 }
9525
9526 // If there is a histogram in the loop, do not just interleave without
9527 // vectorizing. The order of operations will be incorrect without the
9528 // histogram intrinsics, which are only used for recipes with VF > 1.
9529 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
9530 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9531 << "to histogram operations.\n");
9532 IntDiagMsg = {
9533 "HistogramPreventsScalarInterleaving",
9534 "Unable to interleave without vectorization due to constraints on "
9535 "the order of histogram operations"};
9536 InterleaveLoop = false;
9537 }
9538
9539 // Override IC if user provided an interleave count.
9540 IC = UserIC > 0 ? UserIC : IC;
9541
9542 // Emit diagnostic messages, if any.
9543 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9544 if (!VectorizeLoop && !InterleaveLoop) {
9545 // Do not vectorize or interleaving the loop.
9546 ORE->emit(RemarkBuilder: [&]() {
9547 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9548 L->getStartLoc(), L->getHeader())
9549 << VecDiagMsg.second;
9550 });
9551 ORE->emit(RemarkBuilder: [&]() {
9552 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9553 L->getStartLoc(), L->getHeader())
9554 << IntDiagMsg.second;
9555 });
9556 return false;
9557 }
9558
9559 if (!VectorizeLoop && InterleaveLoop) {
9560 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9561 ORE->emit(RemarkBuilder: [&]() {
9562 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9563 L->getStartLoc(), L->getHeader())
9564 << VecDiagMsg.second;
9565 });
9566 } else if (VectorizeLoop && !InterleaveLoop) {
9567 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9568 << ") in " << L->getLocStr() << '\n');
9569 ORE->emit(RemarkBuilder: [&]() {
9570 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9571 L->getStartLoc(), L->getHeader())
9572 << IntDiagMsg.second;
9573 });
9574 } else if (VectorizeLoop && InterleaveLoop) {
9575 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9576 << ") in " << L->getLocStr() << '\n');
9577 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9578 }
9579
9580 // Report the vectorization decision.
9581 if (VF.Width.isScalar()) {
9582 using namespace ore;
9583 assert(IC > 1);
9584 ORE->emit(RemarkBuilder: [&]() {
9585 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9586 L->getHeader())
9587 << "interleaved loop (interleaved count: "
9588 << NV("InterleaveCount", IC) << ")";
9589 });
9590 } else {
9591 // Report the vectorization decision.
9592 reportVectorization(ORE, TheLoop: L, VF, IC);
9593 }
9594 if (ORE->allowExtraAnalysis(LV_NAME))
9595 checkMixedPrecision(L, ORE);
9596
9597 // If we decided that it is *legal* to interleave or vectorize the loop, then
9598 // do it.
9599
9600 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9601 // Consider vectorizing the epilogue too if it's profitable.
9602 VectorizationFactor EpilogueVF =
9603 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
9604 bool HasBranchWeights =
9605 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
9606 if (EpilogueVF.Width.isVector()) {
9607 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
9608
9609 // The first pass vectorizes the main loop and creates a scalar epilogue
9610 // to be vectorized by executing the plan (potentially with a different
9611 // factor) again shortly afterwards.
9612 VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width);
9613 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
9614 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
9615 SmallVector<VPInstruction *> ResumeValues =
9616 preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan);
9617 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
9618 BestEpiPlan);
9619
9620 // Add minimum iteration check for the epilogue plan, followed by runtime
9621 // checks for the main plan.
9622 LVP.addMinimumIterationCheck(Plan&: *BestMainPlan, VF: EPI.EpilogueVF, UF: EPI.EpilogueUF,
9623 MinProfitableTripCount: ElementCount::getFixed(MinVal: 0));
9624 LVP.attachRuntimeChecks(Plan&: *BestMainPlan, RTChecks&: Checks, HasBranchWeights);
9625 VPlanTransforms::addIterationCountCheckBlock(
9626 Plan&: *BestMainPlan, VF: EPI.MainLoopVF, UF: EPI.MainLoopUF,
9627 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: EPI.MainLoopVF.isVector()), OrigLoop: L,
9628 MinItersBypassWeights: HasBranchWeights ? MinItersBypassWeights : nullptr,
9629 DL: L->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
9630
9631 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9632 Checks, *BestMainPlan);
9633 auto ExpandedSCEVs = LVP.executePlan(
9634 BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF, BestVPlan&: *BestMainPlan, ILV&: MainILV, DT,
9635 EpilogueVecKind: LoopVectorizationPlanner::EpilogueVectorizationKind::MainLoop);
9636 ++LoopsVectorized;
9637
9638 // Derive EPI fields from VPlan-generated IR.
9639 BasicBlock *EntryBB =
9640 cast<VPIRBasicBlock>(Val: BestMainPlan->getEntry())->getIRBasicBlock();
9641 EntryBB->setName("iter.check");
9642 EPI.EpilogueIterationCountCheck = EntryBB;
9643 // The check chain is: Entry -> [SCEV] -> [Mem] -> MainCheck -> VecPH.
9644 // MainCheck is the non-bypass successor of the last runtime check block
9645 // (or Entry if there are no runtime checks).
9646 BasicBlock *LastCheck = EntryBB;
9647 if (BasicBlock *MemBB = Checks.getMemRuntimeChecks().second)
9648 LastCheck = MemBB;
9649 else if (BasicBlock *SCEVBB = Checks.getSCEVChecks().second)
9650 LastCheck = SCEVBB;
9651 BasicBlock *ScalarPH = L->getLoopPreheader();
9652 auto *BI = cast<CondBrInst>(Val: LastCheck->getTerminator());
9653 EPI.MainLoopIterationCountCheck =
9654 BI->getSuccessor(i: BI->getSuccessor(i: 0) == ScalarPH);
9655
9656 // Second pass vectorizes the epilogue and adjusts the control flow
9657 // edges from the first pass.
9658 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9659 Checks, BestEpiPlan);
9660 SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
9661 Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI, CM, SE&: *PSE.getSE());
9662 LVP.attachRuntimeChecks(Plan&: BestEpiPlan, RTChecks&: Checks, HasBranchWeights);
9663 LVP.executePlan(
9664 BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, DT,
9665 EpilogueVecKind: LoopVectorizationPlanner::EpilogueVectorizationKind::Epilogue);
9666 connectEpilogueVectorLoop(EpiPlan&: BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
9667 ResumeValues);
9668 ++LoopsEpilogueVectorized;
9669 } else {
9670 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
9671 BestPlan);
9672 // TODO: Move to general VPlan pipeline once epilogue loops are also
9673 // supported.
9674 RUN_VPLAN_PASS(VPlanTransforms::materializeConstantVectorTripCount,
9675 BestPlan, VF.Width, IC, PSE);
9676 LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, UF: IC,
9677 MinProfitableTripCount: VF.MinProfitableTripCount);
9678 LVP.attachRuntimeChecks(Plan&: BestPlan, RTChecks&: Checks, HasBranchWeights);
9679
9680 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT);
9681 ++LoopsVectorized;
9682 }
9683
9684 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
9685 "DT not preserved correctly");
9686 assert(!verifyFunction(*F, &dbgs()));
9687
9688 return true;
9689}
9690
9691LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
9692
9693 // Don't attempt if
9694 // 1. the target claims to have no vector registers, and
9695 // 2. interleaving won't help ILP.
9696 //
9697 // The second condition is necessary because, even if the target has no
9698 // vector registers, loop vectorization may still enable scalar
9699 // interleaving.
9700 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
9701 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
9702 return LoopVectorizeResult(false, false);
9703
9704 bool Changed = false, CFGChanged = false;
9705
9706 // The vectorizer requires loops to be in simplified form.
9707 // Since simplification may add new inner loops, it has to run before the
9708 // legality and profitability checks. This means running the loop vectorizer
9709 // will simplify all loops, regardless of whether anything end up being
9710 // vectorized.
9711 for (const auto &L : *LI)
9712 Changed |= CFGChanged |=
9713 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
9714
9715 // Build up a worklist of inner-loops to vectorize. This is necessary as
9716 // the act of vectorizing or partially unrolling a loop creates new loops
9717 // and can invalidate iterators across the loops.
9718 SmallVector<Loop *, 8> Worklist;
9719
9720 for (Loop *L : *LI)
9721 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
9722
9723 LoopsAnalyzed += Worklist.size();
9724
9725 // Now walk the identified inner loops.
9726 while (!Worklist.empty()) {
9727 Loop *L = Worklist.pop_back_val();
9728
9729 // For the inner loops we actually process, form LCSSA to simplify the
9730 // transform.
9731 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
9732
9733 Changed |= CFGChanged |= processLoop(L);
9734
9735 if (Changed) {
9736 LAIs->clear();
9737
9738#ifndef NDEBUG
9739 if (VerifySCEV)
9740 SE->verify();
9741#endif
9742 }
9743 }
9744
9745 // Process each loop nest in the function.
9746 return LoopVectorizeResult(Changed, CFGChanged);
9747}
9748
9749PreservedAnalyses LoopVectorizePass::run(Function &F,
9750 FunctionAnalysisManager &AM) {
9751 LI = &AM.getResult<LoopAnalysis>(IR&: F);
9752 // There are no loops in the function. Return before computing other
9753 // expensive analyses.
9754 if (LI->empty())
9755 return PreservedAnalyses::all();
9756 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
9757 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
9758 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
9759 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
9760 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
9761 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
9762 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
9763 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
9764 AA = &AM.getResult<AAManager>(IR&: F);
9765
9766 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
9767 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
9768 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
9769 return AM.getResult<BlockFrequencyAnalysis>(IR&: F);
9770 };
9771 LoopVectorizeResult Result = runImpl(F);
9772 if (!Result.MadeAnyChange)
9773 return PreservedAnalyses::all();
9774 PreservedAnalyses PA;
9775
9776 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
9777 for (auto &BB : F)
9778 RemoveRedundantDbgInstrs(BB: &BB);
9779 }
9780
9781 PA.preserve<LoopAnalysis>();
9782 PA.preserve<DominatorTreeAnalysis>();
9783 PA.preserve<ScalarEvolutionAnalysis>();
9784 PA.preserve<LoopAccessAnalysis>();
9785
9786 if (Result.MadeCFGChange) {
9787 // Making CFG changes likely means a loop got vectorized. Indicate that
9788 // extra simplification passes should be run.
9789 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
9790 // be run if runtime checks have been added.
9791 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
9792 PA.preserve<ShouldRunExtraVectorPasses>();
9793 } else {
9794 PA.preserveSet<CFGAnalyses>();
9795 }
9796 return PA;
9797}
9798
9799void LoopVectorizePass::printPipeline(
9800 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
9801 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
9802 OS, MapClassName2PassName);
9803
9804 OS << '<';
9805 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
9806 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
9807 OS << '>';
9808}
9809