1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
// There is a development effort going on to migrate the loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanHCFGBuilder.h"
62#include "VPlanPatternMatch.h"
63#include "VPlanTransforms.h"
64#include "VPlanVerifier.h"
65#include "llvm/ADT/APInt.h"
66#include "llvm/ADT/ArrayRef.h"
67#include "llvm/ADT/DenseMap.h"
68#include "llvm/ADT/DenseMapInfo.h"
69#include "llvm/ADT/Hashing.h"
70#include "llvm/ADT/MapVector.h"
71#include "llvm/ADT/STLExtras.h"
72#include "llvm/ADT/SmallPtrSet.h"
73#include "llvm/ADT/SmallSet.h"
74#include "llvm/ADT/SmallVector.h"
75#include "llvm/ADT/Statistic.h"
76#include "llvm/ADT/StringRef.h"
77#include "llvm/ADT/Twine.h"
78#include "llvm/ADT/iterator_range.h"
79#include "llvm/Analysis/AssumptionCache.h"
80#include "llvm/Analysis/BasicAliasAnalysis.h"
81#include "llvm/Analysis/BlockFrequencyInfo.h"
82#include "llvm/Analysis/CFG.h"
83#include "llvm/Analysis/CodeMetrics.h"
84#include "llvm/Analysis/DemandedBits.h"
85#include "llvm/Analysis/GlobalsModRef.h"
86#include "llvm/Analysis/LoopAccessAnalysis.h"
87#include "llvm/Analysis/LoopAnalysisManager.h"
88#include "llvm/Analysis/LoopInfo.h"
89#include "llvm/Analysis/LoopIterator.h"
90#include "llvm/Analysis/OptimizationRemarkEmitter.h"
91#include "llvm/Analysis/ProfileSummaryInfo.h"
92#include "llvm/Analysis/ScalarEvolution.h"
93#include "llvm/Analysis/ScalarEvolutionExpressions.h"
94#include "llvm/Analysis/TargetLibraryInfo.h"
95#include "llvm/Analysis/TargetTransformInfo.h"
96#include "llvm/Analysis/ValueTracking.h"
97#include "llvm/Analysis/VectorUtils.h"
98#include "llvm/IR/Attributes.h"
99#include "llvm/IR/BasicBlock.h"
100#include "llvm/IR/CFG.h"
101#include "llvm/IR/Constant.h"
102#include "llvm/IR/Constants.h"
103#include "llvm/IR/DataLayout.h"
104#include "llvm/IR/DebugInfo.h"
105#include "llvm/IR/DebugInfoMetadata.h"
106#include "llvm/IR/DebugLoc.h"
107#include "llvm/IR/DerivedTypes.h"
108#include "llvm/IR/DiagnosticInfo.h"
109#include "llvm/IR/Dominators.h"
110#include "llvm/IR/Function.h"
111#include "llvm/IR/IRBuilder.h"
112#include "llvm/IR/InstrTypes.h"
113#include "llvm/IR/Instruction.h"
114#include "llvm/IR/Instructions.h"
115#include "llvm/IR/IntrinsicInst.h"
116#include "llvm/IR/Intrinsics.h"
117#include "llvm/IR/MDBuilder.h"
118#include "llvm/IR/Metadata.h"
119#include "llvm/IR/Module.h"
120#include "llvm/IR/Operator.h"
121#include "llvm/IR/PatternMatch.h"
122#include "llvm/IR/ProfDataUtils.h"
123#include "llvm/IR/Type.h"
124#include "llvm/IR/Use.h"
125#include "llvm/IR/User.h"
126#include "llvm/IR/Value.h"
127#include "llvm/IR/ValueHandle.h"
128#include "llvm/IR/VectorBuilder.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
131#include "llvm/Support/CommandLine.h"
132#include "llvm/Support/Compiler.h"
133#include "llvm/Support/Debug.h"
134#include "llvm/Support/ErrorHandling.h"
135#include "llvm/Support/InstructionCost.h"
136#include "llvm/Support/MathExtras.h"
137#include "llvm/Support/raw_ostream.h"
138#include "llvm/Transforms/Utils/BasicBlockUtils.h"
139#include "llvm/Transforms/Utils/InjectTLIMappings.h"
140#include "llvm/Transforms/Utils/LoopSimplify.h"
141#include "llvm/Transforms/Utils/LoopUtils.h"
142#include "llvm/Transforms/Utils/LoopVersioning.h"
143#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
144#include "llvm/Transforms/Utils/SizeOpts.h"
145#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
146#include <algorithm>
147#include <cassert>
148#include <cmath>
149#include <cstdint>
150#include <functional>
151#include <iterator>
152#include <limits>
153#include <map>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160
161#define LV_NAME "loop-vectorize"
162#define DEBUG_TYPE LV_NAME
163
164#ifndef NDEBUG
165const char VerboseDebug[] = DEBUG_TYPE "-verbose";
166#endif
167
168/// @{
169/// Metadata attribute names
170const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
171const char LLVMLoopVectorizeFollowupVectorized[] =
172 "llvm.loop.vectorize.followup_vectorized";
173const char LLVMLoopVectorizeFollowupEpilogue[] =
174 "llvm.loop.vectorize.followup_epilogue";
175/// @}
176
177STATISTIC(LoopsVectorized, "Number of loops vectorized");
178STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
179STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180
181static cl::opt<bool> EnableEpilogueVectorization(
182 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
183 cl::desc("Enable vectorization of epilogue loops."));
184
185static cl::opt<unsigned> EpilogueVectorizationForceVF(
186 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
187 cl::desc("When epilogue vectorization is enabled, and a value greater than "
188 "1 is specified, forces the given VF for all applicable epilogue "
189 "loops."));
190
191static cl::opt<unsigned> EpilogueVectorizationMinVF(
192 "epilogue-vectorization-minimum-VF", cl::init(Val: 16), cl::Hidden,
193 cl::desc("Only loops with vectorization factor equal to or larger than "
194 "the specified value are considered for epilogue vectorization."));
195
196/// Loops with a known constant trip count below this number are vectorized only
197/// if no scalar iteration overheads are incurred.
198static cl::opt<unsigned> TinyTripCountVectorThreshold(
199 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
200 cl::desc("Loops with a constant trip count that is smaller than this "
201 "value are vectorized only if no scalar iteration overheads "
202 "are incurred."));
203
204static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
205 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
206 cl::desc("The maximum allowed number of runtime memory checks"));
207
208static cl::opt<bool> UseLegacyCostModel(
209 "vectorize-use-legacy-cost-model", cl::init(Val: true), cl::Hidden,
210 cl::desc("Use the legacy cost model instead of the VPlan-based cost model. "
211 "This option will be removed in the future."));
212
213// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
214// that predication is preferred, and this lists all options. I.e., the
215// vectorizer will try to fold the tail-loop (epilogue) into the vector body
216// and predicate the instructions accordingly. If tail-folding fails, there are
217// different fallback strategies depending on these values:
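// For example (illustrative invocation, not from this file), running
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue
// asks for tail folding first and falls back to a scalar epilogue if folding
// is not possible.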
218namespace PreferPredicateTy {
219 enum Option {
220 ScalarEpilogue = 0,
221 PredicateElseScalarEpilogue,
222 PredicateOrDontVectorize
223 };
224} // namespace PreferPredicateTy
225
226static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
227 "prefer-predicate-over-epilogue",
228 cl::init(Val: PreferPredicateTy::ScalarEpilogue),
229 cl::Hidden,
230 cl::desc("Tail-folding and predication preferences over creating a scalar "
231 "epilogue loop."),
232 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
233 "scalar-epilogue",
234 "Don't tail-predicate loops, create scalar epilogue"),
235 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
236 "predicate-else-scalar-epilogue",
237 "prefer tail-folding, create scalar epilogue if tail "
238 "folding fails."),
239 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
240 "predicate-dont-vectorize",
241 "prefers tail-folding, don't attempt vectorization if "
242 "tail-folding fails.")));
243
244static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
245 "force-tail-folding-style", cl::desc("Force the tail folding style"),
246 cl::init(Val: TailFoldingStyle::None),
247 cl::values(
248 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
249 clEnumValN(
250 TailFoldingStyle::Data, "data",
251 "Create lane mask for data only, using active.lane.mask intrinsic"),
252 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
253 "data-without-lane-mask",
254 "Create lane mask with compare/stepvector"),
255 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
256 "Create lane mask using active.lane.mask intrinsic, and use "
257 "it for both data and control flow"),
258 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
259 "data-and-control-without-rt-check",
260 "Similar to data-and-control, but remove the runtime check"),
261 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
262 "Use predicated EVL instructions for tail folding. If EVL "
263 "is unsupported, fallback to data-without-lane-mask.")));
264
265static cl::opt<bool> MaximizeBandwidth(
266 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
267 cl::desc("Maximize bandwidth when selecting vectorization factor which "
268 "will be determined by the smallest type in loop."));
269
270static cl::opt<bool> EnableInterleavedMemAccesses(
271 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
272 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
273
274/// An interleave-group may need masking if it resides in a block that needs
275/// predication, or in order to mask away gaps.
276static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
277 "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
278 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
279
280static cl::opt<unsigned> ForceTargetNumScalarRegs(
281 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
282 cl::desc("A flag that overrides the target's number of scalar registers."));
283
284static cl::opt<unsigned> ForceTargetNumVectorRegs(
285 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
286 cl::desc("A flag that overrides the target's number of vector registers."));
287
288static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
289 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
290 cl::desc("A flag that overrides the target's max interleave factor for "
291 "scalar loops."));
292
293static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
294 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
295 cl::desc("A flag that overrides the target's max interleave factor for "
296 "vectorized loops."));
297
298cl::opt<unsigned> ForceTargetInstructionCost(
299 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
300 cl::desc("A flag that overrides the target's expected cost for "
301 "an instruction to a single constant value. Mostly "
302 "useful for getting consistent testing."));
303
304static cl::opt<bool> ForceTargetSupportsScalableVectors(
305 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
306 cl::desc(
307 "Pretend that scalable vectors are supported, even if the target does "
308 "not support them. This flag should only be used for testing."));
309
310static cl::opt<unsigned> SmallLoopCost(
311 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
312 cl::desc(
313 "The cost of a loop that is considered 'small' by the interleaver."));
314
315static cl::opt<bool> LoopVectorizeWithBlockFrequency(
316 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
317 cl::desc("Enable the use of the block frequency analysis to access PGO "
318 "heuristics minimizing code growth in cold regions and being more "
319 "aggressive in hot regions."));
320
321// Runtime interleave loops for load/store throughput.
322static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
323 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
324 cl::desc(
325 "Enable runtime interleaving until load/store ports are saturated"));
326
327/// The number of stores in a loop that are allowed to need predication.
328static cl::opt<unsigned> NumberOfStoresToPredicate(
329 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
330 cl::desc("Max number of stores to be predicated behind an if."));
331
332static cl::opt<bool> EnableIndVarRegisterHeur(
333 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
334 cl::desc("Count the induction variable only once when interleaving"));
335
336static cl::opt<bool> EnableCondStoresVectorization(
337 "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
338 cl::desc("Enable if predication of stores during vectorization."));
339
340static cl::opt<unsigned> MaxNestedScalarReductionIC(
341 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
342 cl::desc("The maximum interleave count to use when interleaving a scalar "
343 "reduction in a nested loop."));
344
345static cl::opt<bool>
346 PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
347 cl::Hidden,
348 cl::desc("Prefer in-loop vector reductions, "
349 "overriding the targets preference."));
350
351static cl::opt<bool> ForceOrderedReductions(
352 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
353 cl::desc("Enable the vectorisation of loops with in-order (strict) "
354 "FP reductions"));
355
356static cl::opt<bool> PreferPredicatedReductionSelect(
357 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
358 cl::desc(
359 "Prefer predicating a reduction operation over an after loop select."));
360
361namespace llvm {
362cl::opt<bool> EnableVPlanNativePath(
363 "enable-vplan-native-path", cl::Hidden,
364 cl::desc("Enable VPlan-native vectorization path with "
365 "support for outer loop vectorization."));
366}
367
368// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
370// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
371// verification of the H-CFGs built.
372static cl::opt<bool> VPlanBuildStressTest(
373 "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
374 cl::desc(
375 "Build VPlan for every supported loop nest in the function and bail "
376 "out right after the build (stress test the VPlan H-CFG construction "
377 "in the VPlan-native vectorization path)."));
378
379cl::opt<bool> llvm::EnableLoopInterleaving(
380 "interleave-loops", cl::init(Val: true), cl::Hidden,
381 cl::desc("Enable loop interleaving in Loop vectorization passes"));
382cl::opt<bool> llvm::EnableLoopVectorization(
383 "vectorize-loops", cl::init(Val: true), cl::Hidden,
384 cl::desc("Run the Loop vectorization passes"));
385
386static cl::opt<bool> PrintVPlansInDotFormat(
387 "vplan-print-in-dot-format", cl::Hidden,
388 cl::desc("Use dot format instead of plain text when dumping VPlans"));
389
390static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
391 "force-widen-divrem-via-safe-divisor", cl::Hidden,
392 cl::desc(
393 "Override cost based safe divisor widening for div/rem instructions"));
394
395static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
396 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
397 cl::Hidden,
398 cl::desc("Try wider VFs if they enable the use of vector variants"));
399
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
401// variables not overflowing do not hold. See `emitSCEVChecks`.
402static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because pointers overlap. See
404// `emitMemRuntimeChecks`.
405static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
// Likelihood of bypassing the vectorized loop because there are zero trips left
407// after prolog. See `emitIterationCountCheck`.
408static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
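// In all three cases the weights encode a roughly 1-in-128 chance of taking
// the bypass edge, keeping the vectorized loop on the statically likely path.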
409
410/// A helper function that returns true if the given type is irregular. The
411/// type is irregular if its allocated size doesn't equal the store size of an
412/// element of the corresponding vector type.
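/// For example, under a typical data layout x86_fp80 has a type size of 80 bits
/// but an alloc size of 96 or 128 bits, so it is irregular; i32 (32/32) is not.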
413static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
414 // Determine if an array of N elements of type Ty is "bitcast compatible"
415 // with a <N x Ty> vector.
416 // This is only true if there is no padding between the array elements.
417 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
418}
419
420/// Returns "best known" trip count for the specified loop \p L as defined by
421/// the following procedure:
422/// 1) Returns exact trip count if it is known.
423/// 2) Returns expected trip count according to profile data if any.
424/// 3) Returns upper bound estimate if it is known.
425/// 4) Returns std::nullopt if all of the above failed.
426static std::optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE,
427 Loop *L) {
428 // Check if exact trip count is known.
429 if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
430 return ExpectedTC;
431
432 // Check if there is an expected trip count available from profile data.
433 if (LoopVectorizeWithBlockFrequency)
434 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
435 return *EstimatedTC;
436
437 // Check if upper bound estimate is known.
438 if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
439 return ExpectedTC;
440
441 return std::nullopt;
442}
443
444namespace {
445// Forward declare GeneratedRTChecks.
446class GeneratedRTChecks;
447
448using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
449} // namespace
450
451namespace llvm {
452
453AnalysisKey ShouldRunExtraVectorPasses::Key;
454
455/// InnerLoopVectorizer vectorizes loops which contain only one basic
456/// block to a specified vectorization factor (VF).
457/// This class performs the widening of scalars into vectors, or multiple
458/// scalars. This class also implements the following features:
459/// * It inserts an epilogue loop for handling loops that don't have iteration
460/// counts that are known to be a multiple of the vectorization factor.
461/// * It handles the code generation for reduction variables.
462/// * Scalarization (implementation using scalars) of un-vectorizable
463/// instructions.
464/// InnerLoopVectorizer does not perform any vectorization-legality
465/// checks, and relies on the caller to check for the different legality
466/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
469class InnerLoopVectorizer {
470public:
471 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
472 LoopInfo *LI, DominatorTree *DT,
473 const TargetLibraryInfo *TLI,
474 const TargetTransformInfo *TTI, AssumptionCache *AC,
475 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
476 ElementCount MinProfitableTripCount,
477 unsigned UnrollFactor, LoopVectorizationLegality *LVL,
478 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
479 ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
480 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
481 AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
482 Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
483 PSI(PSI), RTChecks(RTChecks) {
484 // Query this against the original loop and save it here because the profile
485 // of the original loop header may change as the transformation happens.
486 OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
487 BB: OrigLoop->getHeader(), PSI, BFI, QueryType: PGSOQueryType::IRPass);
488
489 if (MinProfitableTripCount.isZero())
490 this->MinProfitableTripCount = VecWidth;
491 else
492 this->MinProfitableTripCount = MinProfitableTripCount;
493 }
494
495 virtual ~InnerLoopVectorizer() = default;
496
497 /// Create a new empty loop that will contain vectorized instructions later
498 /// on, while the old loop will be used as the scalar remainder. Control flow
499 /// is generated around the vectorized (and scalar epilogue) loops consisting
500 /// of various checks and bypasses. Return the pre-header block of the new
501 /// loop and the start value for the canonical induction, if it is != 0. The
502 /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
504 /// complex control flow around the loops. \p ExpandedSCEVs is used to
505 /// look up SCEV expansions for expressions needed during skeleton creation.
506 virtual std::pair<BasicBlock *, Value *>
507 createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
508
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
510 void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
511
512 // Return true if any runtime check is added.
513 bool areSafetyChecksAdded() { return AddedSafetyChecks; }
514
515 /// A helper function to scalarize a single Instruction in the innermost loop.
516 /// Generates a sequence of scalar instances for each lane between \p MinLane
517 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
518 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
519 /// Instr's operands.
520 void scalarizeInstruction(const Instruction *Instr,
521 VPReplicateRecipe *RepRecipe,
522 const VPIteration &Instance,
523 VPTransformState &State);
524
525 /// Fix the non-induction PHIs in \p Plan.
526 void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
527
528 /// Create a new phi node for the induction variable \p OrigPhi to resume
529 /// iteration count in the scalar epilogue, from where the vectorized loop
530 /// left off. \p Step is the SCEV-expanded induction step to use. In cases
531 /// where the loop skeleton is more complicated (i.e., epilogue vectorization)
532 /// and the resume values can come from an additional bypass block, the \p
533 /// AdditionalBypass pair provides information about the bypass block and the
534 /// end value on the edge from bypass to this loop.
535 PHINode *createInductionResumeValue(
536 PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
537 ArrayRef<BasicBlock *> BypassBlocks,
538 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
539
540 /// Returns the original loop trip count.
541 Value *getTripCount() const { return TripCount; }
542
543 /// Used to set the trip count after ILV's construction and after the
544 /// preheader block has been executed. Note that this always holds the trip
545 /// count of the original loop for both main loop and epilogue vectorization.
546 void setTripCount(Value *TC) { TripCount = TC; }
547
548protected:
549 friend class LoopVectorizationPlanner;
550
551 /// A small list of PHINodes.
552 using PhiVector = SmallVector<PHINode *, 4>;
553
554 /// A type for scalarized values in the new loop. Each value from the
555 /// original loop, when scalarized, is represented by UF x VF scalar values
556 /// in the new unrolled loop, where UF is the unroll factor and VF is the
557 /// vectorization factor.
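  /// For example, with UF = 2 and VF = 4, each scalarized value expands to
  /// 2 x 4 = 8 scalar values in the unrolled vector loop.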
558 using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
559
560 /// Set up the values of the IVs correctly when exiting the vector loop.
561 void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
562 Value *VectorTripCount, Value *EndValue,
563 BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
564 VPlan &Plan, VPTransformState &State);
565
566 /// Iteratively sink the scalarized operands of a predicated instruction into
567 /// the block that was created for it.
568 void sinkScalarOperands(Instruction *PredInst);
569
570 /// Returns (and creates if needed) the trip count of the widened loop.
571 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
572
573 /// Emit a bypass check to see if the vector trip count is zero, including if
574 /// it overflows.
575 void emitIterationCountCheck(BasicBlock *Bypass);
576
577 /// Emit a bypass check to see if all of the SCEV assumptions we've
578 /// had to make are correct. Returns the block containing the checks or
579 /// nullptr if no checks have been added.
580 BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
581
582 /// Emit bypass checks to check any memory assumptions we may have made.
583 /// Returns the block containing the checks or nullptr if no checks have been
584 /// added.
585 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
586
587 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
588 /// vector loop preheader, middle block and scalar preheader.
589 void createVectorLoopSkeleton(StringRef Prefix);
590
591 /// Create new phi nodes for the induction variables to resume iteration count
592 /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g., epilogue
594 /// vectorization) and the resume values can come from an additional bypass
595 /// block, the \p AdditionalBypass pair provides information about the bypass
596 /// block and the end value on the edge from bypass to this loop.
597 void createInductionResumeValues(
598 const SCEV2ValueTy &ExpandedSCEVs,
599 std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
600
601 /// Complete the loop skeleton by adding debug MDs, creating appropriate
602 /// conditional branches in the middle block, preparing the builder and
603 /// running the verifier. Return the preheader of the completed vector loop.
604 BasicBlock *completeLoopSkeleton();
605
606 /// Allow subclasses to override and print debug traces before/after vplan
607 /// execution, when trace information is requested.
608 virtual void printDebugTracesAtStart(){};
609 virtual void printDebugTracesAtEnd(){};
610
611 /// The original loop.
612 Loop *OrigLoop;
613
614 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
615 /// dynamic knowledge to simplify SCEV expressions and converts them to a
616 /// more usable form.
617 PredicatedScalarEvolution &PSE;
618
619 /// Loop Info.
620 LoopInfo *LI;
621
622 /// Dominator Tree.
623 DominatorTree *DT;
624
625 /// Target Library Info.
626 const TargetLibraryInfo *TLI;
627
628 /// Target Transform Info.
629 const TargetTransformInfo *TTI;
630
631 /// Assumption Cache.
632 AssumptionCache *AC;
633
634 /// Interface to emit optimization remarks.
635 OptimizationRemarkEmitter *ORE;
636
637 /// The vectorization SIMD factor to use. Each vector will have this many
638 /// vector elements.
639 ElementCount VF;
640
641 ElementCount MinProfitableTripCount;
642
643 /// The vectorization unroll factor to use. Each scalar is vectorized to this
644 /// many different vector instructions.
645 unsigned UF;
646
647 /// The builder that we use
648 IRBuilder<> Builder;
649
650 // --- Vectorization state ---
651
652 /// The vector-loop preheader.
653 BasicBlock *LoopVectorPreHeader;
654
655 /// The scalar-loop preheader.
656 BasicBlock *LoopScalarPreHeader;
657
658 /// Middle Block between the vector and the scalar.
659 BasicBlock *LoopMiddleBlock;
660
661 /// The unique ExitBlock of the scalar loop if one exists. Note that
662 /// there can be multiple exiting edges reaching this block.
663 BasicBlock *LoopExitBlock;
664
665 /// The scalar loop body.
666 BasicBlock *LoopScalarBody;
667
668 /// A list of all bypass blocks. The first block is the entry of the loop.
669 SmallVector<BasicBlock *, 4> LoopBypassBlocks;
670
671 /// Store instructions that were predicated.
672 SmallVector<Instruction *, 4> PredicatedInstructions;
673
674 /// Trip count of the original loop.
675 Value *TripCount = nullptr;
676
677 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
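  /// For example, with TripCount = 17, VF = 4 and UF = 2 this is
  /// 17 - 17 % 8 = 16, i.e. two vector iterations, with one iteration left for
  /// the scalar epilogue.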
678 Value *VectorTripCount = nullptr;
679
680 /// The legality analysis.
681 LoopVectorizationLegality *Legal;
682
  /// The profitability analysis.
684 LoopVectorizationCostModel *Cost;
685
686 // Record whether runtime checks are added.
687 bool AddedSafetyChecks = false;
688
689 // Holds the end values for each induction variable. We save the end values
690 // so we can later fix-up the external users of the induction variables.
691 DenseMap<PHINode *, Value *> IVEndValues;
692
693 /// BFI and PSI are used to check for profile guided size optimizations.
694 BlockFrequencyInfo *BFI;
695 ProfileSummaryInfo *PSI;
696
697 // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
699 bool OptForSizeBasedOnProfile;
700
701 /// Structure to hold information about generated runtime checks, responsible
702 /// for cleaning the checks, if vectorization turns out unprofitable.
703 GeneratedRTChecks &RTChecks;
704
705 // Holds the resume values for reductions in the loops, used to set the
706 // correct start value of reduction PHIs when vectorizing the epilogue.
707 SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
708 ReductionResumeValues;
709};
710
711class InnerLoopUnroller : public InnerLoopVectorizer {
712public:
713 InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
714 LoopInfo *LI, DominatorTree *DT,
715 const TargetLibraryInfo *TLI,
716 const TargetTransformInfo *TTI, AssumptionCache *AC,
717 OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
718 LoopVectorizationLegality *LVL,
719 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
720 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
721 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
722 ElementCount::getFixed(MinVal: 1),
723 ElementCount::getFixed(MinVal: 1), UnrollFactor, LVL, CM,
724 BFI, PSI, Check) {}
725};
726
727/// Encapsulate information regarding vectorization of a loop and its epilogue.
728/// This information is meant to be updated and used across two stages of
729/// epilogue vectorization.
730struct EpilogueLoopVectorizationInfo {
731 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
732 unsigned MainLoopUF = 0;
733 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
734 unsigned EpilogueUF = 0;
735 BasicBlock *MainLoopIterationCountCheck = nullptr;
736 BasicBlock *EpilogueIterationCountCheck = nullptr;
737 BasicBlock *SCEVSafetyCheck = nullptr;
738 BasicBlock *MemSafetyCheck = nullptr;
739 Value *TripCount = nullptr;
740 Value *VectorTripCount = nullptr;
741
742 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
743 ElementCount EVF, unsigned EUF)
744 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
745 assert(EUF == 1 &&
746 "A high UF for the epilogue loop is likely not beneficial.");
747 }
748};
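// Illustratively, a loop might be vectorized with a main-loop VF of 16 and an
// epilogue VF of 8; EpilogueUF is expected to be 1, as asserted above.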
749
750/// An extension of the inner loop vectorizer that creates a skeleton for a
751/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
754/// from the first step and vectorize the epilogue. This is achieved by
755/// deriving two concrete strategy classes from this base class and invoking
756/// them in succession from the loop vectorizer planner.
757class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
758public:
759 InnerLoopAndEpilogueVectorizer(
760 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
761 DominatorTree *DT, const TargetLibraryInfo *TLI,
762 const TargetTransformInfo *TTI, AssumptionCache *AC,
763 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
764 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
765 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
766 GeneratedRTChecks &Checks)
767 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
768 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
769 CM, BFI, PSI, Checks),
770 EPI(EPI) {}
771
772 // Override this function to handle the more complex control flow around the
773 // three loops.
774 std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton(
775 const SCEV2ValueTy &ExpandedSCEVs) final {
776 return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
777 }
778
779 /// The interface for creating a vectorized skeleton using one of two
780 /// different strategies, each corresponding to one execution of the vplan
781 /// as described above.
782 virtual std::pair<BasicBlock *, Value *>
783 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
784
785 /// Holds and updates state information required to vectorize the main loop
786 /// and its epilogue in two separate passes. This setup helps us avoid
787 /// regenerating and recomputing runtime safety checks. It also helps us to
788 /// shorten the iteration-count-check path length for the cases where the
789 /// iteration count of the loop is so small that the main vector loop is
790 /// completely skipped.
791 EpilogueLoopVectorizationInfo &EPI;
792};
793
794/// A specialized derived class of inner loop vectorizer that performs
795/// vectorization of *main* loops in the process of vectorizing loops and their
796/// epilogues.
797class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
798public:
799 EpilogueVectorizerMainLoop(
800 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
801 DominatorTree *DT, const TargetLibraryInfo *TLI,
802 const TargetTransformInfo *TTI, AssumptionCache *AC,
803 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
804 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
805 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
806 GeneratedRTChecks &Check)
807 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
808 EPI, LVL, CM, BFI, PSI, Check) {}
809 /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
811 std::pair<BasicBlock *, Value *>
812 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
813
814protected:
815 /// Emits an iteration count bypass check once for the main loop (when \p
816 /// ForEpilogue is false) and once for the epilogue loop (when \p
817 /// ForEpilogue is true).
818 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
819 void printDebugTracesAtStart() override;
820 void printDebugTracesAtEnd() override;
821};
822
823// A specialized derived class of inner loop vectorizer that performs
824// vectorization of *epilogue* loops in the process of vectorizing loops and
825// their epilogues.
826class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
827public:
828 EpilogueVectorizerEpilogueLoop(
829 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
830 DominatorTree *DT, const TargetLibraryInfo *TLI,
831 const TargetTransformInfo *TTI, AssumptionCache *AC,
832 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
833 LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
834 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
835 GeneratedRTChecks &Checks)
836 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
837 EPI, LVL, CM, BFI, PSI, Checks) {
838 TripCount = EPI.TripCount;
839 }
840 /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
842 std::pair<BasicBlock *, Value *>
843 createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
844
845protected:
846 /// Emits an iteration count bypass check after the main vector loop has
847 /// finished to see if there are any iterations left to execute by either
848 /// the vector epilogue or the scalar epilogue.
849 BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
850 BasicBlock *Bypass,
851 BasicBlock *Insert);
852 void printDebugTracesAtStart() override;
853 void printDebugTracesAtEnd() override;
854};
855} // end namespace llvm
856
/// Look for a meaningful debug location on the instruction or its
858/// operands.
859static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
860 if (!I)
861 return DebugLoc();
862
863 DebugLoc Empty;
864 if (I->getDebugLoc() != Empty)
865 return I->getDebugLoc();
866
867 for (Use &Op : I->operands()) {
868 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
869 if (OpInst->getDebugLoc() != Empty)
870 return OpInst->getDebugLoc();
871 }
872
873 return I->getDebugLoc();
874}
875
876/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
877/// is passed, the message relates to that particular instruction.
878#ifndef NDEBUG
879static void debugVectorizationMessage(const StringRef Prefix,
880 const StringRef DebugMsg,
881 Instruction *I) {
882 dbgs() << "LV: " << Prefix << DebugMsg;
883 if (I != nullptr)
884 dbgs() << " " << *I;
885 else
886 dbgs() << '.';
887 dbgs() << '\n';
888}
889#endif
890
891/// Create an analysis remark that explains why vectorization failed
892///
893/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
894/// RemarkName is the identifier for the remark. If \p I is passed it is an
895/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
896/// the location of the remark. \return the remark object that can be
897/// streamed to.
898static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
899 StringRef RemarkName, Loop *TheLoop, Instruction *I) {
900 Value *CodeRegion = TheLoop->getHeader();
901 DebugLoc DL = TheLoop->getStartLoc();
902
903 if (I) {
904 CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
906 // using the loop's.
907 if (I->getDebugLoc())
908 DL = I->getDebugLoc();
909 }
910
911 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
912}
913
914namespace llvm {
915
916/// Return a value for Step multiplied by VF.
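/// For example, for VF = <vscale x 4> and Step = 8 this emits vscale * 32;
/// for a fixed VF = 4 and Step = 8 it is simply the constant 32.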
917Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
918 int64_t Step) {
919 assert(Ty->isIntegerTy() && "Expected an integer step");
920 return B.CreateElementCount(DstType: Ty, EC: VF.multiplyCoefficientBy(RHS: Step));
921}
922
923/// Return the runtime value for VF.
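/// For a fixed VF such as 4 this is the constant 4; for a scalable VF such as
/// <vscale x 4> it materializes as vscale * 4.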
924Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
925 return B.CreateElementCount(DstType: Ty, EC: VF);
926}
927
928const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE,
929 Loop *OrigLoop) {
930 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
931 assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && "Invalid loop count");
932
933 ScalarEvolution &SE = *PSE.getSE();
934 return SE.getTripCountFromExitCount(ExitCount: BackedgeTakenCount, EvalTy: IdxTy, L: OrigLoop);
935}
936
937void reportVectorizationFailure(const StringRef DebugMsg,
938 const StringRef OREMsg, const StringRef ORETag,
939 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
940 Instruction *I) {
941 LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
942 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
943 ORE->emit(
944 OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
945 << "loop not vectorized: " << OREMsg);
946}
947
948/// Reports an informative message: print \p Msg for debugging purposes as well
949/// as an optimization remark. Uses either \p I as location of the remark, or
950/// otherwise \p TheLoop.
951static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
952 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
953 Instruction *I = nullptr) {
954 LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
955 LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
956 ORE->emit(
957 OptDiag&: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
958 << Msg);
959}
960
961/// Report successful vectorization of the loop. In case an outer loop is
962/// vectorized, prepend "outer" to the vectorization remark.
963static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
964 VectorizationFactor VF, unsigned IC) {
965 LLVM_DEBUG(debugVectorizationMessage(
966 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
967 nullptr));
968 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
969 ORE->emit(RemarkBuilder: [&]() {
970 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
971 TheLoop->getHeader())
972 << "vectorized " << LoopType << "loop (vectorization width: "
973 << ore::NV("VectorizationFactor", VF.Width)
974 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
975 });
976}
977
978} // end namespace llvm
979
980namespace llvm {
981
982// Loop vectorization cost-model hints how the scalar epilogue loop should be
983// lowered.
984enum ScalarEpilogueLowering {
985
986 // The default: allowing scalar epilogues.
987 CM_ScalarEpilogueAllowed,
988
989 // Vectorization with OptForSize: don't allow epilogues.
990 CM_ScalarEpilogueNotAllowedOptSize,
991
  // A special case of vectorization with OptForSize: loops with a very small
993 // trip count are considered for vectorization under OptForSize, thereby
994 // making sure the cost of their loop body is dominant, free of runtime
995 // guards and scalar iteration overheads.
996 CM_ScalarEpilogueNotAllowedLowTripLoop,
997
998 // Loop hint predicate indicating an epilogue is undesired.
999 CM_ScalarEpilogueNotNeededUsePredicate,
1000
1001 // Directive indicating we must either tail fold or not vectorize
1002 CM_ScalarEpilogueNotAllowedUsePredicate
1003};
1004
1005using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1006
1007/// LoopVectorizationCostModel - estimates the expected speedups due to
1008/// vectorization.
/// In many cases vectorization is not profitable. This can happen for
1010/// a number of reasons. In this class we mainly attempt to predict the
1011/// expected speedup/slowdowns due to the supported instruction set. We use the
1012/// TargetTransformInfo to query the different backends for the cost of
1013/// different operations.
1014class LoopVectorizationCostModel {
1015public:
1016 LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1017 PredicatedScalarEvolution &PSE, LoopInfo *LI,
1018 LoopVectorizationLegality *Legal,
1019 const TargetTransformInfo &TTI,
1020 const TargetLibraryInfo *TLI, DemandedBits *DB,
1021 AssumptionCache *AC,
1022 OptimizationRemarkEmitter *ORE, const Function *F,
1023 const LoopVectorizeHints *Hints,
1024 InterleavedAccessInfo &IAI)
1025 : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1026 TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1027 Hints(Hints), InterleaveInfo(IAI) {}
1028
1029 /// \return An upper bound for the vectorization factors (both fixed and
1030 /// scalable). If the factors are 0, vectorization and interleaving should be
1031 /// avoided up front.
1032 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1033
1034 /// \return True if runtime checks are required for vectorization, and false
1035 /// otherwise.
1036 bool runtimeChecksRequired();
1037
1038 /// Setup cost-based decisions for user vectorization factor.
1039 /// \return true if the UserVF is a feasible VF to be chosen.
1040 bool selectUserVectorizationFactor(ElementCount UserVF) {
1041 collectUniformsAndScalars(VF: UserVF);
1042 collectInstsToScalarize(VF: UserVF);
1043 return expectedCost(VF: UserVF).isValid();
1044 }
1045
1046 /// \return The size (in bits) of the smallest and widest types in the code
1047 /// that needs to be vectorized. We ignore values that remain scalar such as
1048 /// 64 bit loop indices.
1049 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1050
1051 /// \return The desired interleave count.
1052 /// If interleave count has been specified by metadata it will be returned.
1053 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1054 /// are the selected vectorization factor and the cost of the selected VF.
1055 unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
1056
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
1059 /// This function takes cost-based decisions for Load/Store instructions
1060 /// and collects them in a map. This decisions map is used for building
1061 /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
1063 /// avoid redundant calculations.
1064 void setCostBasedWideningDecision(ElementCount VF);
1065
1066 /// A call may be vectorized in different ways depending on whether we have
1067 /// vectorized variants available and whether the target supports masking.
1068 /// This function analyzes all calls in the function at the supplied VF,
1069 /// makes a decision based on the costs of available options, and stores that
1070 /// decision in a map for use in planning and plan execution.
1071 void setVectorizedCallDecision(ElementCount VF);
1072
1073 /// A struct that represents some properties of the register usage
1074 /// of a loop.
1075 struct RegisterUsage {
1076 /// Holds the number of loop invariant values that are used in the loop.
1077 /// The key is ClassID of target-provided register class.
1078 SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1079 /// Holds the maximum number of concurrent live intervals in the loop.
1080 /// The key is ClassID of target-provided register class.
1081 SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1082 };
1083
1084 /// \return Returns information about the register usages of the loop for the
1085 /// given vectorization factors.
1086 SmallVector<RegisterUsage, 8>
1087 calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1088
1089 /// Collect values we want to ignore in the cost model.
1090 void collectValuesToIgnore();
1091
1092 /// Collect all element types in the loop for which widening is needed.
1093 void collectElementTypesForWidening();
1094
1095 /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
1097 void collectInLoopReductions();
1098
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and we do
  /// not allow reordering of FP operations.
1103 bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1104 return !Hints->allowReordering() && RdxDesc.isOrdered();
1105 }
1106
1107 /// \returns The smallest bitwidth each instruction can be represented with.
1108 /// The vector equivalents of these instructions should be truncated to this
1109 /// type.
1110 const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1111 return MinBWs;
1112 }
1113
1114 /// \returns True if it is more profitable to scalarize instruction \p I for
1115 /// vectorization factor \p VF.
1116 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1117 assert(VF.isVector() &&
1118 "Profitable to scalarize relevant only for VF > 1.");
1119 assert(
1120 TheLoop->isInnermost() &&
1121 "cost-model should not be used for outer loops (in VPlan-native path)");
1122
1123 auto Scalars = InstsToScalarize.find(Val: VF);
1124 assert(Scalars != InstsToScalarize.end() &&
1125 "VF not yet analyzed for scalarization profitability");
1126 return Scalars->second.contains(Val: I);
1127 }
1128
1129 /// Returns true if \p I is known to be uniform after vectorization.
1130 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1131 assert(
1132 TheLoop->isInnermost() &&
1133 "cost-model should not be used for outer loops (in VPlan-native path)");
1134 // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that the profiled loop trip count can be accurately
    // accumulated instead of being undercounted.
1137 if (isa<PseudoProbeInst>(Val: I))
1138 return false;
1139
1140 if (VF.isScalar())
1141 return true;
1142
1143 auto UniformsPerVF = Uniforms.find(Val: VF);
1144 assert(UniformsPerVF != Uniforms.end() &&
1145 "VF not yet analyzed for uniformity");
1146 return UniformsPerVF->second.count(Ptr: I);
1147 }
1148
1149 /// Returns true if \p I is known to be scalar after vectorization.
1150 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1151 assert(
1152 TheLoop->isInnermost() &&
1153 "cost-model should not be used for outer loops (in VPlan-native path)");
1154 if (VF.isScalar())
1155 return true;
1156
1157 auto ScalarsPerVF = Scalars.find(Val: VF);
1158 assert(ScalarsPerVF != Scalars.end() &&
1159 "Scalar values are not calculated for VF");
1160 return ScalarsPerVF->second.count(Ptr: I);
1161 }
1162
1163 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1164 /// for vectorization factor \p VF.
1165 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1166 return VF.isVector() && MinBWs.contains(Key: I) &&
1167 !isProfitableToScalarize(I, VF) &&
1168 !isScalarAfterVectorization(I, VF);
1169 }
1170
1171 /// Decision that was taken during cost calculation for memory instruction.
1172 enum InstWidening {
1173 CM_Unknown,
1174 CM_Widen, // For consecutive accesses with stride +1.
1175 CM_Widen_Reverse, // For consecutive accesses with stride -1.
1176 CM_Interleave,
1177 CM_GatherScatter,
1178 CM_Scalarize,
1179 CM_VectorCall,
1180 CM_IntrinsicCall
1181 };
1182
1183 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1184 /// instruction \p I and vector width \p VF.
1185 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1186 InstructionCost Cost) {
1187 assert(VF.isVector() && "Expected VF >=2");
1188 WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost);
1189 }
1190
1191 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1192 /// interleaving group \p Grp and vector width \p VF.
1193 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1194 ElementCount VF, InstWidening W,
1195 InstructionCost Cost) {
1196 assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1198 /// But the cost will be assigned to one instruction only.
1199 for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1200 if (auto *I = Grp->getMember(Index: i)) {
1201 if (Grp->getInsertPos() == I)
1202 WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y&: Cost);
1203 else
1204 WideningDecisions[std::make_pair(x&: I, y&: VF)] = std::make_pair(x&: W, y: 0);
1205 }
1206 }
1207 }
1208
1209 /// Return the cost model decision for the given instruction \p I and vector
1210 /// width \p VF. Return CM_Unknown if this instruction did not pass
1211 /// through the cost modeling.
1212 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1213 assert(VF.isVector() && "Expected VF to be a vector VF");
1214 assert(
1215 TheLoop->isInnermost() &&
1216 "cost-model should not be used for outer loops (in VPlan-native path)");
1217
1218 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF);
1219 auto Itr = WideningDecisions.find(Val: InstOnVF);
1220 if (Itr == WideningDecisions.end())
1221 return CM_Unknown;
1222 return Itr->second.first;
1223 }
1224
1225 /// Return the vectorization cost for the given instruction \p I and vector
1226 /// width \p VF.
1227 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1228 assert(VF.isVector() && "Expected VF >=2");
1229 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(x&: I, y&: VF);
1230 assert(WideningDecisions.contains(InstOnVF) &&
1231 "The cost is not calculated");
1232 return WideningDecisions[InstOnVF].second;
1233 }
1234
1235 struct CallWideningDecision {
1236 InstWidening Kind;
1237 Function *Variant;
1238 Intrinsic::ID IID;
1239 std::optional<unsigned> MaskPos;
1240 InstructionCost Cost;
1241 };
1242
1243 void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
1244 Function *Variant, Intrinsic::ID IID,
1245 std::optional<unsigned> MaskPos,
1246 InstructionCost Cost) {
1247 assert(!VF.isScalar() && "Expected vector VF");
1248 CallWideningDecisions[std::make_pair(x&: CI, y&: VF)] = {.Kind: Kind, .Variant: Variant, .IID: IID,
1249 .MaskPos: MaskPos, .Cost: Cost};
1250 }
1251
1252 CallWideningDecision getCallWideningDecision(CallInst *CI,
1253 ElementCount VF) const {
1254 assert(!VF.isScalar() && "Expected vector VF");
1255 return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF));
1256 }
1257
1258 /// Return True if instruction \p I is an optimizable truncate whose operand
1259 /// is an induction variable. Such a truncate will be removed by adding a new
1260 /// induction variable with the destination type.
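  /// For instance (illustrative IR), a 'trunc i64 %iv to i32' of a 64-bit
  /// induction can be replaced by a new i32 induction variable.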
1261 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1262 // If the instruction is not a truncate, return false.
1263 auto *Trunc = dyn_cast<TruncInst>(Val: I);
1264 if (!Trunc)
1265 return false;
1266
1267 // Get the source and destination types of the truncate.
1268 Type *SrcTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getSrcTy(), EC: VF);
1269 Type *DestTy = ToVectorTy(Scalar: cast<CastInst>(Val: I)->getDestTy(), EC: VF);
1270
1271 // If the truncate is free for the given types, return false. Replacing a
1272 // free truncate with an induction variable would add an induction variable
1273 // update instruction to each iteration of the loop. We exclude from this
1274 // check the primary induction variable since it will need an update
1275 // instruction regardless.
1276 Value *Op = Trunc->getOperand(i_nocapture: 0);
1277 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
1278 return false;
1279
1280 // If the truncated value is not an induction variable, return false.
1281 return Legal->isInductionPhi(V: Op);
1282 }
1283
1284 /// Collects the instructions to scalarize for each predicated instruction in
1285 /// the loop.
1286 void collectInstsToScalarize(ElementCount VF);
1287
1288 /// Collect Uniform and Scalar values for the given \p VF.
1289 /// The sets depend on the cost-model decisions for Load/Store instructions
1290 /// that may be vectorized as interleaved, gather-scatter or scalarized accesses.
1291 /// Also make a decision on what to do about call instructions in the loop
1292 /// at that VF -- scalarize, call a known vector routine, or call a
1293 /// vector intrinsic.
1294 void collectUniformsAndScalars(ElementCount VF) {
1295 // Do the analysis once.
1296 if (VF.isScalar() || Uniforms.contains(Val: VF))
1297 return;
1298 setCostBasedWideningDecision(VF);
1299 setVectorizedCallDecision(VF);
1300 collectLoopUniforms(VF);
1301 collectLoopScalars(VF);
1302 }
1303
1304 /// Returns true if the target machine supports masked store operation
1305 /// for the given \p DataType and kind of access to \p Ptr.
1306 bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1307 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1308 TTI.isLegalMaskedStore(DataType, Alignment);
1309 }
1310
1311 /// Returns true if the target machine supports masked load operation
1312 /// for the given \p DataType and kind of access to \p Ptr.
1313 bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1314 return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
1315 TTI.isLegalMaskedLoad(DataType, Alignment);
1316 }
1317
1318 /// Returns true if the target machine can represent \p V as a masked gather
1319 /// or scatter operation.
1320 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1321 bool LI = isa<LoadInst>(Val: V);
1322 bool SI = isa<StoreInst>(Val: V);
1323 if (!LI && !SI)
1324 return false;
1325 auto *Ty = getLoadStoreType(I: V);
1326 Align Align = getLoadStoreAlignment(I: V);
1327 if (VF.isVector())
1328 Ty = VectorType::get(ElementType: Ty, EC: VF);
1329 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1330 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1331 }
1332
1333 /// Returns true if the target machine supports all of the reduction
1334 /// variables found for the given VF.
1335 bool canVectorizeReductions(ElementCount VF) const {
1336 return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
1337 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1338 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1339 }));
1340 }
1341
1342 /// Given costs for both strategies, return true if the scalar predication
1343 /// lowering should be used for div/rem. This incorporates an override
1344 /// option so it is not simply a cost comparison.
1345 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1346 InstructionCost SafeDivisorCost) const {
1347 switch (ForceSafeDivisor) {
1348 case cl::BOU_UNSET:
1349 return ScalarCost < SafeDivisorCost;
1350 case cl::BOU_TRUE:
1351 return false;
1352 case cl::BOU_FALSE:
1353 return true;
1354 };
1355 llvm_unreachable("impossible case value");
1356 }
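
// Rough example of the two strategies being compared (source-level sketch, not
// taken from this file): for
//   for (i) if (b[i] != 0) a[i] = n / b[i];
// "scalar predication" executes the division lane-by-lane behind a branch,
// while the "safe divisor" strategy keeps the division vectorized but first
// replaces masked-off divisor lanes with 1, e.g.
//   %safe.b = select <4 x i1> %mask, <4 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
// so the whole vector division can execute unconditionally.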
1357
1358 /// Returns true if \p I is an instruction which requires predication and
1359 /// for which our chosen predication strategy is scalarization (i.e. we
1360 /// don't have an alternate strategy such as masking available).
1361 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1362 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1363
1364 /// Returns true if \p I is an instruction that needs to be predicated
1365 /// at runtime. The result is independent of the predication mechanism.
1366 /// Superset of instructions that return true for isScalarWithPredication.
1367 bool isPredicatedInst(Instruction *I) const;
1368
1369 /// Return the costs for our two available strategies for lowering a
1370 /// div/rem operation which requires speculating at least one lane.
1371 /// First result is for scalarization (will be invalid for scalable
1372 /// vectors); second is for the safe-divisor strategy.
1373 std::pair<InstructionCost, InstructionCost>
1374 getDivRemSpeculationCost(Instruction *I,
1375 ElementCount VF) const;
1376
1377 /// Returns true if \p I is a memory instruction with consecutive memory
1378 /// access that can be widened.
1379 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1380
1381 /// Returns true if \p I is a memory instruction in an interleaved-group
1382 /// of memory accesses that can be vectorized with wide vector loads/stores
1383 /// and shuffles.
1384 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1385
1386 /// Check if \p Instr belongs to any interleaved access group.
1387 bool isAccessInterleaved(Instruction *Instr) const {
1388 return InterleaveInfo.isInterleaved(Instr);
1389 }
1390
1391 /// Get the interleaved access group that \p Instr belongs to.
1392 const InterleaveGroup<Instruction> *
1393 getInterleavedAccessGroup(Instruction *Instr) const {
1394 return InterleaveInfo.getInterleaveGroup(Instr);
1395 }
1396
1397 /// Returns true if we're required to use a scalar epilogue for at least
1398 /// the final iteration of the original loop.
1399 bool requiresScalarEpilogue(bool IsVectorizing) const {
1400 if (!isScalarEpilogueAllowed()) {
1401 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1402 return false;
1403 }
1404 // If we might exit from anywhere but the latch, must run the exiting
1405 // iteration in scalar form.
1406 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1407 LLVM_DEBUG(
1408 dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1409 return true;
1410 }
1411 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1412 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1413 "interleaved group requires scalar epilogue\n");
1414 return true;
1415 }
1416 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1417 return false;
1418 }
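
// Illustrative note (assumption for exposition, not from this file): an
// interleave group with a gap is a common reason for the interleave-group case
// above. For
//   for (i) sum += a[2 * i];   // uses only every other element
// the group's wide load for the final vector iteration could read past the end
// of 'a', so InterleaveInfo requests a scalar epilogue to peel off that last
// iteration instead.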
1419
1420 /// Returns true if we're required to use a scalar epilogue for at least
1421 /// the final iteration of the original loop for all VFs in \p Range.
1422 /// A scalar epilogue must either be required for all VFs in \p Range or for
1423 /// none.
1424 bool requiresScalarEpilogue(VFRange Range) const {
1425 auto RequiresScalarEpilogue = [this](ElementCount VF) {
1426 return requiresScalarEpilogue(IsVectorizing: VF.isVector());
1427 };
1428 bool IsRequired = all_of(Range, P: RequiresScalarEpilogue);
1429 assert(
1430 (IsRequired || none_of(Range, RequiresScalarEpilogue)) &&
1431 "all VFs in range must agree on whether a scalar epilogue is required");
1432 return IsRequired;
1433 }
1434
1435 /// Returns true if a scalar epilogue is not allowed due to optsize or a
1436 /// loop hint annotation.
1437 bool isScalarEpilogueAllowed() const {
1438 return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1439 }
1440
1441 /// Returns the TailFoldingStyle that is best for the current loop.
1442 TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
1443 if (!ChosenTailFoldingStyle)
1444 return TailFoldingStyle::None;
1445 return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1446 : ChosenTailFoldingStyle->second;
1447 }
1448
1449 /// Selects and saves the TailFoldingStyle for two cases: whether or not the
1450 /// IV update may overflow.
1451 /// \param IsScalableVF true if scalable vector factors enabled.
1452 /// \param UserIC User specific interleave count.
1453 void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
1454 assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
1455 if (!Legal->canFoldTailByMasking()) {
1456 ChosenTailFoldingStyle =
1457 std::make_pair(x: TailFoldingStyle::None, y: TailFoldingStyle::None);
1458 return;
1459 }
1460
1461 if (!ForceTailFoldingStyle.getNumOccurrences()) {
1462 ChosenTailFoldingStyle = std::make_pair(
1463 x: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
1464 y: TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false));
1465 return;
1466 }
1467
1468 // Set styles when forced.
1469 ChosenTailFoldingStyle = std::make_pair(x&: ForceTailFoldingStyle.getValue(),
1470 y&: ForceTailFoldingStyle.getValue());
1471 if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1472 return;
1473 // Override forced styles if needed.
1474 // FIXME: use actual opcode/data type for analysis here.
1475 // FIXME: Investigate opportunity for fixed vector factor.
1476 bool EVLIsLegal =
1477 IsScalableVF && UserIC <= 1 &&
1478 TTI.hasActiveVectorLength(Opcode: 0, DataType: nullptr, Alignment: Align()) &&
1479 !EnableVPlanNativePath &&
1480 // FIXME: implement support for max safe dependency distance.
1481 Legal->isSafeForAnyVectorWidth();
1482 if (!EVLIsLegal) {
1483 // If for some reason EVL mode is unsupported, fallback to
1484 // DataWithoutLaneMask to try to vectorize the loop with folded tail
1485 // in a generic way.
1486 ChosenTailFoldingStyle =
1487 std::make_pair(x: TailFoldingStyle::DataWithoutLaneMask,
1488 y: TailFoldingStyle::DataWithoutLaneMask);
1489 LLVM_DEBUG(
1490 dbgs()
1491 << "LV: Preference for VP intrinsics indicated. Will "
1492 "not try to generate VP Intrinsics "
1493 << (UserIC > 1
1494 ? "since interleave count specified is greater than 1.\n"
1495 : "due to non-interleaving reasons.\n"));
1496 }
1497 }
1498
1499 /// Returns true if all loop blocks should be masked to fold tail loop.
1500 bool foldTailByMasking() const {
1501 // TODO: check if it is possible to check for None style independent of
1502 // IVUpdateMayOverflow flag in getTailFoldingStyle.
1503 return getTailFoldingStyle() != TailFoldingStyle::None;
1504 }
1505
1506 /// Returns true if the instructions in this block require predication
1507 /// for any reason, e.g. because tail folding now requires a predicate
1508 /// or because the block in the original loop was predicated.
1509 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1510 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1511 }
1512
1513 /// Returns true if VP intrinsics with explicit vector length support should
1514 /// be generated in the tail folded loop.
1515 bool foldTailWithEVL() const {
1516 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1517 }
1518
1519 /// Returns true if the Phi is part of an inloop reduction.
1520 bool isInLoopReduction(PHINode *Phi) const {
1521 return InLoopReductions.contains(Ptr: Phi);
1522 }
1523
1524 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1525 /// with factor VF. Return the cost of the instruction, including
1526 /// scalarization overhead if it's needed.
1527 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1528
1529 /// Estimate cost of a call instruction CI if it were vectorized with factor
1530 /// VF. Return the cost of the instruction, including scalarization overhead
1531 /// if it's needed.
1532 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1533
1534 /// Invalidates decisions already taken by the cost model.
1535 void invalidateCostModelingDecisions() {
1536 WideningDecisions.clear();
1537 CallWideningDecisions.clear();
1538 Uniforms.clear();
1539 Scalars.clear();
1540 }
1541
1542 /// Returns the expected execution cost. The unit of the cost does
1543 /// not matter because we use the 'cost' units to compare different
1544 /// vector widths. The cost that is returned is *not* normalized by
1545 /// the vectorization factor. If \p Invalid is not nullptr, this function
1546 /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1547 /// each instruction that has an Invalid cost for the given VF.
1548 InstructionCost
1549 expectedCost(ElementCount VF,
1550 SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1551
1552 bool hasPredStores() const { return NumPredStores > 0; }
1553
1554 /// Returns true if epilogue vectorization is considered profitable, and
1555 /// false otherwise.
1556 /// \p VF is the vectorization factor chosen for the original loop.
1557 bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1558
1559 /// Returns the execution time cost of an instruction for a given vector
1560 /// width. Vector width of one means scalar.
1561 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1562
1563 /// Return the cost of instructions in an inloop reduction pattern, if I is
1564 /// part of that pattern.
1565 std::optional<InstructionCost>
1566 getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1567 TTI::TargetCostKind CostKind) const;
1568
1569private:
1570 unsigned NumPredStores = 0;
1571
1572 /// \return An upper bound for the vectorization factors for both
1573 /// fixed and scalable vectorization, where the minimum-known number of
1574 /// elements is a power-of-2 larger than zero. If scalable vectorization is
1575 /// disabled or unsupported, then the scalable part will be equal to
1576 /// ElementCount::getScalable(0).
1577 FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
1578 ElementCount UserVF,
1579 bool FoldTailByMasking);
1580
1581 /// \return the maximized element count based on the target's vector
1582 /// registers and the loop trip-count, but limited to a maximum safe VF.
1583 /// This is a helper function of computeFeasibleMaxVF.
1584 ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
1585 unsigned SmallestType,
1586 unsigned WidestType,
1587 ElementCount MaxSafeVF,
1588 bool FoldTailByMasking);
1589
1590 /// Checks if scalable vectorization is supported and enabled. Caches the
1591 /// result to avoid repeated debug dumps for repeated queries.
1592 bool isScalableVectorizationAllowed();
1593
1594 /// \return the maximum legal scalable VF, based on the safe max number
1595 /// of elements.
1596 ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1597
1598 /// Calculate vectorization cost of memory instruction \p I.
1599 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1600
1601 /// The cost computation for scalarized memory instruction.
1602 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1603
1604 /// The cost computation for interleaving group of memory instructions.
1605 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1606
1607 /// The cost computation for Gather/Scatter instruction.
1608 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1609
1610 /// The cost computation for widening instruction \p I with consecutive
1611 /// memory access.
1612 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1613
1614 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1615 /// Load: scalar load + broadcast.
1616 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1617 /// element)
1618 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1619
1620 /// Estimate the overhead of scalarizing an instruction. This is a
1621 /// convenience wrapper for the type-based getScalarizationOverhead API.
1622 InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF,
1623 TTI::TargetCostKind CostKind) const;
1624
1625 /// Returns true if an artificially high cost for emulated masked memrefs
1626 /// should be used.
1627 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1628
1629 /// Map of scalar integer values to the smallest bitwidth they can be legally
1630 /// represented as. The vector equivalents of these values should be truncated
1631 /// to this type.
1632 MapVector<Instruction *, uint64_t> MinBWs;
1633
1634 /// A type representing the costs for instructions if they were to be
1635 /// scalarized rather than vectorized. The entries are Instruction-Cost
1636 /// pairs.
1637 using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1638
1639 /// For each VF, the set of BasicBlocks that are known to be present after
1640 /// vectorization as predicated blocks.
1641 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1642 PredicatedBBsAfterVectorization;
1643
1644 /// Records whether it is allowed to have the original scalar loop execute at
1645 /// least once. This may be needed as a fallback loop in case runtime
1646 /// aliasing/dependence checks fail, or to handle the tail/remainder
1647 /// iterations when the trip count is unknown or doesn't divide by the VF,
1648 /// or as a peel-loop to handle gaps in interleave-groups.
1649 /// Under optsize and when the trip count is very small we don't allow any
1650 /// iterations to execute in the scalar loop.
1651 ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1652
1653 /// The finally chosen tail-folding style. The first element is used if the
1654 /// IV update may overflow; the second if it does not.
1655 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1656 ChosenTailFoldingStyle;
1657
1658 /// true if scalable vectorization is supported and enabled.
1659 std::optional<bool> IsScalableVectorizationAllowed;
1660
1661 /// A map holding scalar costs for different vectorization factors. The
1662 /// presence of a cost for an instruction in the mapping indicates that the
1663 /// instruction will be scalarized when vectorizing with the associated
1664 /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1665 DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1666
1667 /// Holds the instructions known to be uniform after vectorization.
1668 /// The data is collected per VF.
1669 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1670
1671 /// Holds the instructions known to be scalar after vectorization.
1672 /// The data is collected per VF.
1673 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1674
1675 /// Holds the instructions (address computations) that are forced to be
1676 /// scalarized.
1677 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1678
1679 /// PHINodes of the reductions that should be expanded in-loop.
1680 SmallPtrSet<PHINode *, 4> InLoopReductions;
1681
1682 /// A Map of inloop reduction operations and their immediate chain operand.
1683 /// FIXME: This can be removed once reductions can be costed correctly in
1684 /// VPlan. This was added to allow quick lookup of the inloop operations.
1685 DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1686
1687 /// Returns the expected difference in cost from scalarizing the expression
1688 /// feeding a predicated instruction \p PredInst. The instructions to
1689 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1690 /// non-negative return value implies the expression will be scalarized.
1691 /// Currently, only single-use chains are considered for scalarization.
1692 InstructionCost computePredInstDiscount(Instruction *PredInst,
1693 ScalarCostsTy &ScalarCosts,
1694 ElementCount VF);
1695
1696 /// Collect the instructions that are uniform after vectorization. An
1697 /// instruction is uniform if we represent it with a single scalar value in
1698 /// the vectorized loop corresponding to each vector iteration. Examples of
1699 /// uniform instructions include pointer operands of consecutive or
1700 /// interleaved memory accesses. Note that although uniformity implies an
1701 /// instruction will be scalar, the reverse is not true. In general, a
1702 /// scalarized instruction will be represented by VF scalar values in the
1703 /// vectorized loop, each corresponding to an iteration of the original
1704 /// scalar loop.
1705 void collectLoopUniforms(ElementCount VF);
1706
1707 /// Collect the instructions that are scalar after vectorization. An
1708 /// instruction is scalar if it is known to be uniform or will be scalarized
1709 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1710 /// to the list if they are used by a load/store instruction that is marked as
1711 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1712 /// VF values in the vectorized loop, each corresponding to an iteration of
1713 /// the original scalar loop.
1714 void collectLoopScalars(ElementCount VF);
1715
1716 /// Keeps the cost-model vectorization decision and cost for each instruction.
1717 /// Right now it is used for memory instructions only.
1718 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1719 std::pair<InstWidening, InstructionCost>>;
1720
1721 DecisionList WideningDecisions;
1722
1723 using CallDecisionList =
1724 DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;
1725
1726 CallDecisionList CallWideningDecisions;
1727
1728 /// Returns true if \p V is expected to be vectorized and it needs to be
1729 /// extracted.
1730 bool needsExtract(Value *V, ElementCount VF) const {
1731 Instruction *I = dyn_cast<Instruction>(Val: V);
1732 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1733 TheLoop->isLoopInvariant(V: I))
1734 return false;
1735
1736 // Assume we can vectorize V (and hence we need extraction) if the
1737 // scalars are not computed yet. This can happen, because it is called
1738 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1739 // the scalars are collected. That should be a safe assumption in most
1740 // cases, because we check if the operands have vectorizable types
1741 // beforehand in LoopVectorizationLegality.
1742 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1743 };
1744
1745 /// Returns a range containing only operands needing to be extracted.
1746 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1747 ElementCount VF) const {
1748 return SmallVector<Value *, 4>(make_filter_range(
1749 Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1750 }
1751
1752public:
1753 /// The loop that we evaluate.
1754 Loop *TheLoop;
1755
1756 /// Predicated scalar evolution analysis.
1757 PredicatedScalarEvolution &PSE;
1758
1759 /// Loop Info analysis.
1760 LoopInfo *LI;
1761
1762 /// Vectorization legality.
1763 LoopVectorizationLegality *Legal;
1764
1765 /// Vector target information.
1766 const TargetTransformInfo &TTI;
1767
1768 /// Target Library Info.
1769 const TargetLibraryInfo *TLI;
1770
1771 /// Demanded bits analysis.
1772 DemandedBits *DB;
1773
1774 /// Assumption cache.
1775 AssumptionCache *AC;
1776
1777 /// Interface to emit optimization remarks.
1778 OptimizationRemarkEmitter *ORE;
1779
1780 const Function *TheFunction;
1781
1782 /// Loop Vectorize Hint.
1783 const LoopVectorizeHints *Hints;
1784
1785 /// The interleave access information contains groups of interleaved accesses
1786 /// that have the same stride and are located close to each other.
1787 InterleavedAccessInfo &InterleaveInfo;
1788
1789 /// Values to ignore in the cost model.
1790 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1791
1792 /// Values to ignore in the cost model when VF > 1.
1793 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1794
1795 /// All element types found in the loop.
1796 SmallPtrSet<Type *, 16> ElementTypesInLoop;
1797};
1798} // end namespace llvm
1799
1800namespace {
1801/// Helper struct to manage generating runtime checks for vectorization.
1802///
1803 /// The runtime checks are created up-front in temporary blocks, un-linked from
1804 /// the existing IR, to allow estimating their cost more accurately. After deciding to
1805/// vectorize, the checks are moved back. If deciding not to vectorize, the
1806/// temporary blocks are completely removed.
1807class GeneratedRTChecks {
1808 /// Basic block which contains the generated SCEV checks, if any.
1809 BasicBlock *SCEVCheckBlock = nullptr;
1810
1811 /// The value representing the result of the generated SCEV checks. If it is
1812 /// nullptr, either no SCEV checks have been generated or they have been used.
1813 Value *SCEVCheckCond = nullptr;
1814
1815 /// Basic block which contains the generated memory runtime checks, if any.
1816 BasicBlock *MemCheckBlock = nullptr;
1817
1818 /// The value representing the result of the generated memory runtime checks.
1819 /// If it is nullptr, either no memory runtime checks have been generated or
1820 /// they have been used.
1821 Value *MemRuntimeCheckCond = nullptr;
1822
1823 DominatorTree *DT;
1824 LoopInfo *LI;
1825 TargetTransformInfo *TTI;
1826
1827 SCEVExpander SCEVExp;
1828 SCEVExpander MemCheckExp;
1829
1830 bool CostTooHigh = false;
1831 const bool AddBranchWeights;
1832
1833 Loop *OuterLoop = nullptr;
1834
1835public:
1836 GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1837 TargetTransformInfo *TTI, const DataLayout &DL,
1838 bool AddBranchWeights)
1839 : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1840 MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
1841
1842 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1843 /// accurately estimate the cost of the runtime checks. The blocks are
1844 /// un-linked from the IR and are added back during vector code generation. If
1845 /// there is no vector code generation, the check blocks are removed
1846 /// completely.
1847 void Create(Loop *L, const LoopAccessInfo &LAI,
1848 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1849
1850 // Hard cutoff to limit compile-time increase in case a very large number of
1851 // runtime checks needs to be generated.
1852 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1853 // profile info.
1854 CostTooHigh =
1855 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1856 if (CostTooHigh)
1857 return;
1858
1859 BasicBlock *LoopHeader = L->getHeader();
1860 BasicBlock *Preheader = L->getLoopPreheader();
1861
1862 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1863 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1864 // may be used by SCEVExpander. The blocks will be un-linked from their
1865 // predecessors and removed from LI & DT at the end of the function.
1866 if (!UnionPred.isAlwaysTrue()) {
1867 SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
1868 MSSAU: nullptr, BBName: "vector.scevcheck");
1869
1870 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1871 Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
1872 }
1873
1874 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1875 if (RtPtrChecking.Need) {
1876 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1877 MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
1878 BBName: "vector.memcheck");
1879
1880 auto DiffChecks = RtPtrChecking.getDiffChecks();
1881 if (DiffChecks) {
1882 Value *RuntimeVF = nullptr;
1883 MemRuntimeCheckCond = addDiffRuntimeChecks(
1884 Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
1885 GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1886 if (!RuntimeVF)
1887 RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
1888 return RuntimeVF;
1889 },
1890 IC);
1891 } else {
1892 MemRuntimeCheckCond = addRuntimeChecks(
1893 Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
1894 Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
1895 }
1896 assert(MemRuntimeCheckCond &&
1897 "no RT checks generated although RtPtrChecking "
1898 "claimed checks are required");
1899 }
1900
1901 if (!MemCheckBlock && !SCEVCheckBlock)
1902 return;
1903
1904 // Unhook the temporary block with the checks, update various places
1905 // accordingly.
1906 if (SCEVCheckBlock)
1907 SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
1908 if (MemCheckBlock)
1909 MemCheckBlock->replaceAllUsesWith(V: Preheader);
1910
1911 if (SCEVCheckBlock) {
1912 SCEVCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator());
1913 new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1914 Preheader->getTerminator()->eraseFromParent();
1915 }
1916 if (MemCheckBlock) {
1917 MemCheckBlock->getTerminator()->moveBefore(MovePos: Preheader->getTerminator());
1918 new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1919 Preheader->getTerminator()->eraseFromParent();
1920 }
1921
1922 DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
1923 if (MemCheckBlock) {
1924 DT->eraseNode(BB: MemCheckBlock);
1925 LI->removeBlock(BB: MemCheckBlock);
1926 }
1927 if (SCEVCheckBlock) {
1928 DT->eraseNode(BB: SCEVCheckBlock);
1929 LI->removeBlock(BB: SCEVCheckBlock);
1930 }
1931
1932 // Outer loop is used as part of the later cost calculations.
1933 OuterLoop = L->getParentLoop();
1934 }
1935
1936 InstructionCost getCost() {
1937 if (SCEVCheckBlock || MemCheckBlock)
1938 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1939
1940 if (CostTooHigh) {
1941 InstructionCost Cost;
1942 Cost.setInvalid();
1943 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1944 return Cost;
1945 }
1946
1947 InstructionCost RTCheckCost = 0;
1948 if (SCEVCheckBlock)
1949 for (Instruction &I : *SCEVCheckBlock) {
1950 if (SCEVCheckBlock->getTerminator() == &I)
1951 continue;
1952 InstructionCost C =
1953 TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
1954 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1955 RTCheckCost += C;
1956 }
1957 if (MemCheckBlock) {
1958 InstructionCost MemCheckCost = 0;
1959 for (Instruction &I : *MemCheckBlock) {
1960 if (MemCheckBlock->getTerminator() == &I)
1961 continue;
1962 InstructionCost C =
1963 TTI->getInstructionCost(U: &I, CostKind: TTI::TCK_RecipThroughput);
1964 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1965 MemCheckCost += C;
1966 }
1967
1968 // If the runtime memory checks are being created inside an outer loop,
1969 // we should find out whether these checks are outer-loop invariant. If so,
1970 // the checks will likely be hoisted out, and so the effective cost is
1971 // reduced by the outer-loop trip count.
1972 if (OuterLoop) {
1973 ScalarEvolution *SE = MemCheckExp.getSE();
1974 // TODO: If profitable, we could refine this further by analysing every
1975 // individual memory check, since there could be a mixture of loop
1976 // variant and invariant checks that mean the final condition is
1977 // variant.
1978 const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
1979 if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
1980 // It seems reasonable to assume that we can reduce the effective
1981 // cost of the checks even when we know nothing about the trip
1982 // count. Assume that the outer loop executes at least twice.
1983 unsigned BestTripCount = 2;
1984
1985 // If exact trip count is known use that.
1986 if (unsigned SmallTC = SE->getSmallConstantTripCount(L: OuterLoop))
1987 BestTripCount = SmallTC;
1988 else if (LoopVectorizeWithBlockFrequency) {
1989 // Else use profile data if available.
1990 if (auto EstimatedTC = getLoopEstimatedTripCount(L: OuterLoop))
1991 BestTripCount = *EstimatedTC;
1992 }
1993
1994 BestTripCount = std::max(a: BestTripCount, b: 1U);
1995 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1996
1997 // Let's ensure the cost is always at least 1.
1998 NewMemCheckCost = std::max(a: *NewMemCheckCost.getValue(),
1999 b: (InstructionCost::CostType)1);
2000
2001 if (BestTripCount > 1)
2002 LLVM_DEBUG(dbgs()
2003 << "We expect runtime memory checks to be hoisted "
2004 << "out of the outer loop. Cost reduced from "
2005 << MemCheckCost << " to " << NewMemCheckCost << '\n');
2006
2007 MemCheckCost = NewMemCheckCost;
2008 }
2009 }
2010
2011 RTCheckCost += MemCheckCost;
2012 }
2013
2014 if (SCEVCheckBlock || MemCheckBlock)
2015 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
2016 << "\n");
2017
2018 return RTCheckCost;
2019 }
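
// Worked example with made-up numbers: if the memory-check block costs 24 and
// the checks are invariant in an outer loop whose estimated trip count is 8,
// the code above charges 24 / 8 = 3 to this candidate, on the assumption that
// the checks will be hoisted out of the outer loop (and never less than 1).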
2020
2021 /// Remove the created SCEV & memory runtime check blocks & instructions, if
2022 /// unused.
2023 ~GeneratedRTChecks() {
2024 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2025 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2026 if (!SCEVCheckCond)
2027 SCEVCleaner.markResultUsed();
2028
2029 if (!MemRuntimeCheckCond)
2030 MemCheckCleaner.markResultUsed();
2031
2032 if (MemRuntimeCheckCond) {
2033 auto &SE = *MemCheckExp.getSE();
2034 // Memory runtime check generation creates compares that use expanded
2035 // values. Remove them before running the SCEVExpanderCleaners.
2036 for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
2037 if (MemCheckExp.isInsertedInstruction(I: &I))
2038 continue;
2039 SE.forgetValue(V: &I);
2040 I.eraseFromParent();
2041 }
2042 }
2043 MemCheckCleaner.cleanup();
2044 SCEVCleaner.cleanup();
2045
2046 if (SCEVCheckCond)
2047 SCEVCheckBlock->eraseFromParent();
2048 if (MemRuntimeCheckCond)
2049 MemCheckBlock->eraseFromParent();
2050 }
2051
2052 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2053 /// adjusts the branches to branch to the vector preheader or \p Bypass,
2054 /// depending on the generated condition.
2055 BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2056 BasicBlock *LoopVectorPreHeader,
2057 BasicBlock *LoopExitBlock) {
2058 if (!SCEVCheckCond)
2059 return nullptr;
2060
2061 Value *Cond = SCEVCheckCond;
2062 // Mark the check as used, to prevent it from being removed during cleanup.
2063 SCEVCheckCond = nullptr;
2064 if (auto *C = dyn_cast<ConstantInt>(Val: Cond))
2065 if (C->isZero())
2066 return nullptr;
2067
2068 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2069
2070 BranchInst::Create(IfTrue: LoopVectorPreHeader, InsertBefore: SCEVCheckBlock);
2071 // Create new preheader for vector loop.
2072 if (OuterLoop)
2073 OuterLoop->addBasicBlockToLoop(NewBB: SCEVCheckBlock, LI&: *LI);
2074
2075 SCEVCheckBlock->getTerminator()->eraseFromParent();
2076 SCEVCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
2077 Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
2078 NewBB: SCEVCheckBlock);
2079
2080 DT->addNewBlock(BB: SCEVCheckBlock, DomBB: Pred);
2081 DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: SCEVCheckBlock);
2082
2083 BranchInst &BI = *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond);
2084 if (AddBranchWeights)
2085 setBranchWeights(I&: BI, Weights: SCEVCheckBypassWeights, /*IsExpected=*/false);
2086 ReplaceInstWithInst(From: SCEVCheckBlock->getTerminator(), To: &BI);
2087 return SCEVCheckBlock;
2088 }
2089
2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2091 /// the branches to branch to the vector preheader or \p Bypass, depending on
2092 /// the generated condition.
2093 BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2094 BasicBlock *LoopVectorPreHeader) {
2095 // Check if we generated code that checks at runtime whether arrays overlap.
2096 if (!MemRuntimeCheckCond)
2097 return nullptr;
2098
2099 auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2100 Pred->getTerminator()->replaceSuccessorWith(OldBB: LoopVectorPreHeader,
2101 NewBB: MemCheckBlock);
2102
2103 DT->addNewBlock(BB: MemCheckBlock, DomBB: Pred);
2104 DT->changeImmediateDominator(BB: LoopVectorPreHeader, NewBB: MemCheckBlock);
2105 MemCheckBlock->moveBefore(MovePos: LoopVectorPreHeader);
2106
2107 if (OuterLoop)
2108 OuterLoop->addBasicBlockToLoop(NewBB: MemCheckBlock, LI&: *LI);
2109
2110 BranchInst &BI =
2111 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: MemRuntimeCheckCond);
2112 if (AddBranchWeights) {
2113 setBranchWeights(I&: BI, Weights: MemCheckBypassWeights, /*IsExpected=*/false);
2114 }
2115 ReplaceInstWithInst(From: MemCheckBlock->getTerminator(), To: &BI);
2116 MemCheckBlock->getTerminator()->setDebugLoc(
2117 Pred->getTerminator()->getDebugLoc());
2118
2119 // Mark the check as used, to prevent it from being removed during cleanup.
2120 MemRuntimeCheckCond = nullptr;
2121 return MemCheckBlock;
2122 }
2123};
2124} // namespace
2125
2126static bool useActiveLaneMask(TailFoldingStyle Style) {
2127 return Style == TailFoldingStyle::Data ||
2128 Style == TailFoldingStyle::DataAndControlFlow ||
2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2130}
2131
2132static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2133 return Style == TailFoldingStyle::DataAndControlFlow ||
2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2135}
2136
2137// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2138// vectorization. The loop needs to be annotated with #pragma omp simd
2139// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2140// vector length information is not provided, vectorization is not considered
2141// explicit. Interleave hints are not allowed either. These limitations will be
2142// relaxed in the future.
2143 // Please note that we are currently forced to abuse the pragma 'clang
2144// vectorize' semantics. This pragma provides *auto-vectorization hints*
2145// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2146// provides *explicit vectorization hints* (LV can bypass legal checks and
2147// assume that vectorization is legal). However, both hints are implemented
2148// using the same metadata (llvm.loop.vectorize, processed by
2149// LoopVectorizeHints). This will be fixed in the future when the native IR
2150// representation for pragma 'omp simd' is introduced.
2151static bool isExplicitVecOuterLoop(Loop *OuterLp,
2152 OptimizationRemarkEmitter *ORE) {
2153 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2154 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2155
2156 // Only outer loops with an explicit vectorization hint are supported.
2157 // Unannotated outer loops are ignored.
2158 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2159 return false;
2160
2161 Function *Fn = OuterLp->getHeader()->getParent();
2162 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2163 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2164 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2165 return false;
2166 }
2167
2168 if (Hints.getInterleave() > 1) {
2169 // TODO: Interleave support is future work.
2170 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2171 "outer loops.\n");
2172 Hints.emitRemarkWithHints();
2173 return false;
2174 }
2175
2176 return true;
2177}
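
// Usage sketch (source-level, illustrative): an outer loop qualifies for the
// VPlan-native path when annotated with an explicit width, e.g.
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < n; ++i)      // outer loop
//     for (int j = 0; j < m; ++j)    // inner loop
//       a[i][j] += b[i][j];
// or with '#pragma omp simd simdlen(4)'. Without the width, or with an
// interleave hint, the loop is rejected by the checks above.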
2178
2179static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2180 OptimizationRemarkEmitter *ORE,
2181 SmallVectorImpl<Loop *> &V) {
2182 // Collect inner loops and outer loops without irreducible control flow. For
2183 // now, only collect outer loops that have explicit vectorization hints. If we
2184 // are stress testing the VPlan H-CFG construction, we collect the outermost
2185 // loop of every loop nest.
2186 if (L.isInnermost() || VPlanBuildStressTest ||
2187 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2188 LoopBlocksRPO RPOT(&L);
2189 RPOT.perform(LI);
2190 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2191 V.push_back(Elt: &L);
2192 // TODO: Collect inner loops inside marked outer loops in case
2193 // vectorization fails for the outer loop. Do not invoke
2194 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2195 // already known to be reducible. We can use an inherited attribute for
2196 // that.
2197 return;
2198 }
2199 }
2200 for (Loop *InnerL : L)
2201 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2202}
2203
2204//===----------------------------------------------------------------------===//
2205// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2206// LoopVectorizationCostModel and LoopVectorizationPlanner.
2207//===----------------------------------------------------------------------===//
2208
2209/// Compute the transformed value of Index at offset StartValue using step
2210/// StepValue.
2211/// For integer induction, returns StartValue + Index * StepValue.
2212/// For pointer induction, returns StartValue[Index * StepValue].
2213/// FIXME: The newly created binary instructions should contain nsw/nuw
2214/// flags, which can be found from the original scalar operations.
2215static Value *
2216emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
2217 Value *Step,
2218 InductionDescriptor::InductionKind InductionKind,
2219 const BinaryOperator *InductionBinOp) {
2220 Type *StepTy = Step->getType();
2221 Value *CastedIndex = StepTy->isIntegerTy()
2222 ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
2223 : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
2224 if (CastedIndex != Index) {
2225 CastedIndex->setName(CastedIndex->getName() + ".cast");
2226 Index = CastedIndex;
2227 }
2228
2229 // Note: the IR at this point is broken. We cannot use SE to create any new
2230 // SCEV and then expand it, hoping that SCEV's simplification will give us
2231 // more optimal code. Unfortunately, attempting to do so on invalid IR may
2232 // lead to various SCEV crashes. So all we can do is use the builder and rely
2233 // on InstCombine for future simplifications. Here we handle some trivial
2234 // cases only.
2235 auto CreateAdd = [&B](Value *X, Value *Y) {
2236 assert(X->getType() == Y->getType() && "Types don't match!");
2237 if (auto *CX = dyn_cast<ConstantInt>(Val: X))
2238 if (CX->isZero())
2239 return Y;
2240 if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
2241 if (CY->isZero())
2242 return X;
2243 return B.CreateAdd(LHS: X, RHS: Y);
2244 };
2245
2246 // We allow X to be a vector type, in which case Y will potentially be
2247 // splatted into a vector with the same element count.
2248 auto CreateMul = [&B](Value *X, Value *Y) {
2249 assert(X->getType()->getScalarType() == Y->getType() &&
2250 "Types don't match!");
2251 if (auto *CX = dyn_cast<ConstantInt>(Val: X))
2252 if (CX->isOne())
2253 return Y;
2254 if (auto *CY = dyn_cast<ConstantInt>(Val: Y))
2255 if (CY->isOne())
2256 return X;
2257 VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
2258 if (XVTy && !isa<VectorType>(Val: Y->getType()))
2259 Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
2260 return B.CreateMul(LHS: X, RHS: Y);
2261 };
2262
2263 switch (InductionKind) {
2264 case InductionDescriptor::IK_IntInduction: {
2265 assert(!isa<VectorType>(Index->getType()) &&
2266 "Vector indices not supported for integer inductions yet");
2267 assert(Index->getType() == StartValue->getType() &&
2268 "Index type does not match StartValue type");
2269 if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
2270 return B.CreateSub(LHS: StartValue, RHS: Index);
2271 auto *Offset = CreateMul(Index, Step);
2272 return CreateAdd(StartValue, Offset);
2273 }
2274 case InductionDescriptor::IK_PtrInduction:
2275 return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
2276 case InductionDescriptor::IK_FpInduction: {
2277 assert(!isa<VectorType>(Index->getType()) &&
2278 "Vector indices not supported for FP inductions yet");
2279 assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2280 assert(InductionBinOp &&
2281 (InductionBinOp->getOpcode() == Instruction::FAdd ||
2282 InductionBinOp->getOpcode() == Instruction::FSub) &&
2283 "Original bin op should be defined for FP induction");
2284
2285 Value *MulExp = B.CreateFMul(L: Step, R: Index);
2286 return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
2287 Name: "induction");
2288 }
2289 case InductionDescriptor::IK_NoInduction:
2290 return nullptr;
2291 }
2292 llvm_unreachable("invalid enum");
2293}
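
// Hedged example of the transformation above (value names are illustrative):
// for an integer induction with start %start and constant step 4, an index
// %idx is transformed into
//   %offset = mul i64 %idx, 4
//   %result = add i64 %start, %offset
// and for a pointer induction the same offset feeds a getelementptr on the
// start pointer instead of the add.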
2294
2295std::optional<unsigned> getMaxVScale(const Function &F,
2296 const TargetTransformInfo &TTI) {
2297 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2298 return MaxVScale;
2299
2300 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2301 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2302
2303 return std::nullopt;
2304}
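
// Example (assumed IR attribute, for illustration only): for a function
// declared with
//   attributes #0 = { vscale_range(1,16) }
// on a target that does not report TTI.getMaxVScale(), this helper returns 16,
// the upper bound of the vscale_range attribute.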
2305
2306/// For the given VF and UF and maximum trip count computed for the loop, return
2307/// whether the induction variable might overflow in the vectorized loop. If not,
2308/// then we know a runtime overflow check always evaluates to false and can be
2309/// removed.
2310static bool isIndvarOverflowCheckKnownFalse(
2311 const LoopVectorizationCostModel *Cost,
2312 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
2313 // Always be conservative if we don't know the exact unroll factor.
2314 unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);
2315
2316 Type *IdxTy = Cost->Legal->getWidestInductionType();
2317 APInt MaxUIntTripCount = cast<IntegerType>(Val: IdxTy)->getMask();
2318
2319 // The runtime overflow check is known to be false iff the (max) trip-count
2320 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
2321 // the vector loop induction variable.
2322 if (unsigned TC =
2323 Cost->PSE.getSE()->getSmallConstantMaxTripCount(L: Cost->TheLoop)) {
2324 uint64_t MaxVF = VF.getKnownMinValue();
2325 if (VF.isScalable()) {
2326 std::optional<unsigned> MaxVScale =
2327 getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
2328 if (!MaxVScale)
2329 return false;
2330 MaxVF *= *MaxVScale;
2331 }
2332
2333 return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
2334 }
2335
2336 return false;
2337}
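
// Worked example with illustrative numbers: for an i32 widest induction type,
// a known maximum trip count of 1000, fixed VF = 4 and UF = 2, the check above
// becomes
//   (0xFFFFFFFF - 1000) ugt (4 * 2)
// which holds, so the runtime overflow check can be omitted. With an unknown
// maximum trip count the function conservatively returns false.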
2338
2339// Return whether we allow using masked interleave-groups (for dealing with
2340// strided loads/stores that reside in predicated blocks, or for dealing
2341// with gaps).
2342static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2343 // If an override option has been passed in for interleaved accesses, use it.
2344 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2345 return EnableMaskedInterleavedMemAccesses;
2346
2347 return TTI.enableMaskedInterleavedAccessVectorization();
2348}
2349
2350void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
2351 VPReplicateRecipe *RepRecipe,
2352 const VPIteration &Instance,
2353 VPTransformState &State) {
2354 assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2355
2356 // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2357 // the first lane and part.
2358 if (isa<NoAliasScopeDeclInst>(Val: Instr))
2359 if (!Instance.isFirstIteration())
2360 return;
2361
2362 // Does this instruction return a value?
2363 bool IsVoidRetTy = Instr->getType()->isVoidTy();
2364
2365 Instruction *Cloned = Instr->clone();
2366 if (!IsVoidRetTy) {
2367 Cloned->setName(Instr->getName() + ".cloned");
2368#if !defined(NDEBUG)
2369 // Verify that VPlan type inference results agree with the type of the
2370 // generated values.
2371 assert(State.TypeAnalysis.inferScalarType(RepRecipe) == Cloned->getType() &&
2372 "inferred type and type from generated instructions do not match");
2373#endif
2374 }
2375
2376 RepRecipe->setFlags(Cloned);
2377
2378 if (auto DL = Instr->getDebugLoc())
2379 State.setDebugLocFrom(DL);
2380
2381 // Replace the operands of the cloned instructions with their scalar
2382 // equivalents in the new loop.
2383 for (const auto &I : enumerate(First: RepRecipe->operands())) {
2384 auto InputInstance = Instance;
2385 VPValue *Operand = I.value();
2386 if (vputils::isUniformAfterVectorization(VPV: Operand))
2387 InputInstance.Lane = VPLane::getFirstLane();
2388 Cloned->setOperand(i: I.index(), Val: State.get(Def: Operand, Instance: InputInstance));
2389 }
2390 State.addNewMetadata(To: Cloned, Orig: Instr);
2391
2392 // Place the cloned scalar in the new loop.
2393 State.Builder.Insert(I: Cloned);
2394
2395 State.set(Def: RepRecipe, V: Cloned, Instance);
2396
2397 // If we just cloned a new assumption, add it the assumption cache.
2398 if (auto *II = dyn_cast<AssumeInst>(Val: Cloned))
2399 AC->registerAssumption(CI: II);
2400
2401 // End if-block.
2402 bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
2403 if (IfPredicateInstr)
2404 PredicatedInstructions.push_back(Elt: Cloned);
2405}
2406
2407Value *
2408InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2409 if (VectorTripCount)
2410 return VectorTripCount;
2411
2412 Value *TC = getTripCount();
2413 IRBuilder<> Builder(InsertBlock->getTerminator());
2414
2415 Type *Ty = TC->getType();
2416 // This is where we can make the step a runtime constant.
2417 Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF);
2418
2419 // If the tail is to be folded by masking, round the number of iterations N
2420 // up to a multiple of Step instead of rounding down. This is done by first
2421 // adding Step-1 and then rounding down. Note that it's ok if this addition
2422 // overflows: the vector induction variable will eventually wrap to zero given
2423 // that it starts at zero and its Step is a power of two; the loop will then
2424 // exit, with the last early-exit vector comparison also producing all-true.
2425 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2426 // is accounted for in emitIterationCountCheck that adds an overflow check.
2427 if (Cost->foldTailByMasking()) {
2428 assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2429 "VF*UF must be a power of 2 when folding tail by masking");
2430 TC = Builder.CreateAdd(LHS: TC, RHS: Builder.CreateSub(LHS: Step, RHS: ConstantInt::get(Ty, V: 1)),
2431 Name: "n.rnd.up");
2432 }
2433
2434 // Now we need to generate the expression for the part of the loop that the
2435 // vectorized body will execute. This is equal to N - (N % Step) if scalar
2436 // iterations are not required for correctness, or N - Step, otherwise. Step
2437 // is equal to the vectorization factor (number of SIMD elements) times the
2438 // unroll factor (number of SIMD instructions).
2439 Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf");
2440
2441 // There are cases where we *must* run at least one iteration in the remainder
2442 // loop. See the cost model for when this can happen. If the step evenly
2443 // divides the trip count, we set the remainder to be equal to the step. If
2444 // the step does not evenly divide the trip count, no adjustment is necessary
2445 // since there will already be scalar iterations. Note that the minimum
2446 // iterations check ensures that N >= Step.
2447 if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
2448 auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0));
2449 R = Builder.CreateSelect(C: IsZero, True: Step, False: R);
2450 }
2451
2452 VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec");
2453
2454 return VectorTripCount;
2455}
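
// Rough sketch of the IR produced above for fixed VF = 4, UF = 2 and no tail
// folding ("n.mod.vf" and "n.vec" match the names used in the code; the rest
// is illustrative):
//   %n.mod.vf = urem i64 %tc, 8
//   %n.vec    = sub i64 %tc, %n.mod.vf
// When a scalar epilogue is required, %n.mod.vf is first replaced by 8 if it
// would otherwise be 0, so at least one scalar iteration always remains.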
2456
2457void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2458 Value *Count = getTripCount();
2459 // Reuse existing vector loop preheader for TC checks.
2460 // Note that a new preheader block is generated for the vector loop.
2461 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2462 IRBuilder<> Builder(TCCheckBlock->getTerminator());
2463
2464 // Generate code to check if the loop's trip count is less than VF * UF, or
2465 // equal to it in case a scalar epilogue is required; this implies that the
2466 // vector trip count is zero. This check also covers the case where adding one
2467 // to the backedge-taken count overflowed leading to an incorrect trip count
2468 // of zero. In this case we will also jump to the scalar loop.
2469 auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
2470 : ICmpInst::ICMP_ULT;
2471
2472 // If the tail is to be folded, the vector loop takes care of all iterations.
2473 Type *CountTy = Count->getType();
2474 Value *CheckMinIters = Builder.getFalse();
2475 auto CreateStep = [&]() -> Value * {
2476 // Create step with max(MinProTripCount, UF * VF).
2477 if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2478 return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);
2479
2480 Value *MinProfTC =
2481 createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1);
2482 if (!VF.isScalable())
2483 return MinProfTC;
2484 return Builder.CreateBinaryIntrinsic(
2485 ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
2486 };
2487
2488 TailFoldingStyle Style = Cost->getTailFoldingStyle();
2489 if (Style == TailFoldingStyle::None)
2490 CheckMinIters =
2491 Builder.CreateICmp(P, LHS: Count, RHS: CreateStep(), Name: "min.iters.check");
2492 else if (VF.isScalable() &&
2493 !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
2494 Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
2495 // vscale is not necessarily a power-of-2, which means we cannot guarantee
2496 // an overflow to zero when updating induction variables and so an
2497 // additional overflow check is required before entering the vector loop.
2498
2499 // Get the maximum unsigned value for the type.
2500 Value *MaxUIntTripCount =
2501 ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
2502 Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);
2503
2504 // Don't execute the vector loop if (UMax - n) < (VF * UF).
2505 CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
2506 }
2507
2508 // Create new preheader for vector loop.
2509 LoopVectorPreHeader =
2510 SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(), DT, LI, MSSAU: nullptr,
2511 BBName: "vector.ph");
2512
2513 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2514 DT->getNode(Bypass)->getIDom()) &&
2515 "TC check is expected to dominate Bypass");
2516
2517 // Update dominator for Bypass & LoopExit (if needed).
2518 DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
2519 BranchInst &BI =
2520 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
2521 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
2522 setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
2523 ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
2524 LoopBypassBlocks.push_back(Elt: TCCheckBlock);
2525}
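
// Illustrative IR for the common case above (no tail folding, fixed VF = 4,
// UF = 2, no scalar epilogue required; only "min.iters.check" is a real name):
//   %min.iters.check = icmp ult i64 %count, 8
//   br i1 %min.iters.check, label %scalar.ph, label %vector.ph
// i.e. the vector loop is bypassed whenever fewer than VF * UF iterations are
// available.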
2526
2527BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2528 BasicBlock *const SCEVCheckBlock =
2529 RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2530 if (!SCEVCheckBlock)
2531 return nullptr;
2532
2533 assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2534 (OptForSizeBasedOnProfile &&
2535 Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2536 "Cannot SCEV check stride or overflow when optimizing for size");
2537
2539 // Update dominator only if this is first RT check.
2540 if (LoopBypassBlocks.empty()) {
2541 DT->changeImmediateDominator(BB: Bypass, NewBB: SCEVCheckBlock);
2542 if (!Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()))
2543 // If there is an epilogue which must run, there's no edge from the
2544 // middle block to exit blocks and thus no need to update the immediate
2545 // dominator of the exit blocks.
2546 DT->changeImmediateDominator(BB: LoopExitBlock, NewBB: SCEVCheckBlock);
2547 }
2548
2549 LoopBypassBlocks.push_back(Elt: SCEVCheckBlock);
2550 AddedSafetyChecks = true;
2551 return SCEVCheckBlock;
2552}
2553
2554BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2555 // VPlan-native path does not do any analysis for runtime checks currently.
2556 if (EnableVPlanNativePath)
2557 return nullptr;
2558
2559 BasicBlock *const MemCheckBlock =
2560 RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2561
2562 // Check if we generated code that checks at runtime whether arrays overlap. We put
2563 // the checks into a separate block to make the more common case of few
2564 // elements faster.
2565 if (!MemCheckBlock)
2566 return nullptr;
2567
2568 if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2569 assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2570 "Cannot emit memory checks when optimizing for size, unless forced "
2571 "to vectorize.");
2572 ORE->emit(RemarkBuilder: [&]() {
2573 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2574 OrigLoop->getStartLoc(),
2575 OrigLoop->getHeader())
2576 << "Code-size may be reduced by not forcing "
2577 "vectorization, or by source-code modifications "
2578 "eliminating the need for runtime checks "
2579 "(e.g., adding 'restrict').";
2580 });
2581 }
2582
2583 LoopBypassBlocks.push_back(Elt: MemCheckBlock);
2584
2585 AddedSafetyChecks = true;
2586
2587 return MemCheckBlock;
2588}
2589
2590void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2591 LoopScalarBody = OrigLoop->getHeader();
2592 LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2593 assert(LoopVectorPreHeader && "Invalid loop structure");
2594 LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2595 assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2596 "multiple exit loop without required epilogue?");
2597
2598 LoopMiddleBlock =
2599 SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
2600 LI, MSSAU: nullptr, BBName: Twine(Prefix) + "middle.block");
2601 LoopScalarPreHeader =
2602 SplitBlock(Old: LoopMiddleBlock, SplitPt: LoopMiddleBlock->getTerminator(), DT, LI,
2603 MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph");
2604}
2605
2606PHINode *InnerLoopVectorizer::createInductionResumeValue(
2607 PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2608 ArrayRef<BasicBlock *> BypassBlocks,
2609 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2610 Value *VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
2611 assert(VectorTripCount && "Expected valid arguments");
2612
2613 Instruction *OldInduction = Legal->getPrimaryInduction();
2614 Value *&EndValue = IVEndValues[OrigPhi];
2615 Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
2616 if (OrigPhi == OldInduction) {
2617 // We know what the end value is.
2618 EndValue = VectorTripCount;
2619 } else {
2620 IRBuilder<> B(LoopVectorPreHeader->getTerminator());
2621
2622 // Fast-math-flags propagate from the original induction instruction.
2623 if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
2624 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2625
2626 EndValue = emitTransformedIndex(B, Index: VectorTripCount, StartValue: II.getStartValue(),
2627 Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
2628 EndValue->setName("ind.end");
2629
2630 // Compute the end value for the additional bypass (if applicable).
2631 if (AdditionalBypass.first) {
2632 B.SetInsertPoint(TheBB: AdditionalBypass.first,
2633 IP: AdditionalBypass.first->getFirstInsertionPt());
2634 EndValueFromAdditionalBypass =
2635 emitTransformedIndex(B, Index: AdditionalBypass.second, StartValue: II.getStartValue(),
2636 Step, InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
2637 EndValueFromAdditionalBypass->setName("ind.end");
2638 }
2639 }
2640
2641 // Create phi nodes to merge from the backedge-taken check block.
2642 PHINode *BCResumeVal = PHINode::Create(Ty: OrigPhi->getType(), NumReservedValues: 3, NameStr: "bc.resume.val",
2643 InsertBefore: LoopScalarPreHeader->getFirstNonPHI());
2644 // Copy original phi DL over to the new one.
2645 BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
2646
2647 // The new PHI merges the original incoming value, in case of a bypass,
2648 // or the value at the end of the vectorized loop.
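 // For the primary induction this typically produces IR of the form
 // (illustrative only; names depend on the generated checks):
 //   %bc.resume.val = phi i64 [ %n.vec, %middle.block ],
 //                            [ %start, %<bypass block> ], ...
 // where %n.vec is the vector trip count and %start is the original start
 // value used when a bypass edge is taken.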
2649 BCResumeVal->addIncoming(V: EndValue, BB: LoopMiddleBlock);
2650
2651 // Fix the scalar body counter (PHI node).
2652 // The old induction's phi node in the scalar body needs the truncated
2653 // value.
2654 for (BasicBlock *BB : BypassBlocks)
2655 BCResumeVal->addIncoming(V: II.getStartValue(), BB);
2656
2657 if (AdditionalBypass.first)
2658 BCResumeVal->setIncomingValueForBlock(BB: AdditionalBypass.first,
2659 V: EndValueFromAdditionalBypass);
2660 return BCResumeVal;
2661}
2662
2663/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2664/// expansion results.
2665static Value *getExpandedStep(const InductionDescriptor &ID,
2666 const SCEV2ValueTy &ExpandedSCEVs) {
2667 const SCEV *Step = ID.getStep();
2668 if (auto *C = dyn_cast<SCEVConstant>(Val: Step))
2669 return C->getValue();
2670 if (auto *U = dyn_cast<SCEVUnknown>(Val: Step))
2671 return U->getValue();
2672 auto I = ExpandedSCEVs.find(Val: Step);
2673 assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
2674 return I->second;
2675}
2676
2677void InnerLoopVectorizer::createInductionResumeValues(
2678 const SCEV2ValueTy &ExpandedSCEVs,
2679 std::pair<BasicBlock *, Value *> AdditionalBypass) {
2680 assert(((AdditionalBypass.first && AdditionalBypass.second) ||
2681 (!AdditionalBypass.first && !AdditionalBypass.second)) &&
2682 "Inconsistent information about additional bypass.");
2683 // We are going to resume the execution of the scalar loop.
2684 // Go over all of the induction variables that we found and fix the
2685 // PHIs that are left in the scalar version of the loop.
2686 // The starting values of PHI nodes depend on the counter of the last
2687 // iteration in the vectorized loop.
2688 // If we come from a bypass edge then we need to start from the original
2689 // start value.
2690 for (const auto &InductionEntry : Legal->getInductionVars()) {
2691 PHINode *OrigPhi = InductionEntry.first;
2692 const InductionDescriptor &II = InductionEntry.second;
2693 PHINode *BCResumeVal = createInductionResumeValue(
2694 OrigPhi, II, Step: getExpandedStep(ID: II, ExpandedSCEVs), BypassBlocks: LoopBypassBlocks,
2695 AdditionalBypass);
2696 OrigPhi->setIncomingValueForBlock(BB: LoopScalarPreHeader, V: BCResumeVal);
2697 }
2698}
2699
2700std::pair<BasicBlock *, Value *>
2701InnerLoopVectorizer::createVectorizedLoopSkeleton(
2702 const SCEV2ValueTy &ExpandedSCEVs) {
2703 /*
2704 In this function we generate a new loop. The new loop will contain
2705 the vectorized instructions while the old loop will continue to run the
2706 scalar remainder.
2707
2708 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
2709 / | preheader are expanded here. Eventually all required SCEV
2710 / | expansion should happen here.
2711 / v
2712 | [ ] <-- vector loop bypass (may consist of multiple blocks).
2713 | / |
2714 | / v
2715 || [ ] <-- vector pre header.
2716 |/ |
2717 | v
2718 | [ ] \
2719 | [ ]_| <-- vector loop (created during VPlan execution).
2720 | |
2721 | v
2722 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to
2723 | | successors created during VPlan execution)
2724 \/ |
2725 /\ v
2726 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock).
2727 | |
2728 (opt) v <-- edge from middle to exit iff epilogue is not required.
2729 | [ ] \
2730 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
2731 \ |
2732 \ v
2733 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock)
2734 ...
2735 */
2736
2737 // Create an empty vector loop, and prepare basic blocks for the runtime
2738 // checks.
2739 createVectorLoopSkeleton(Prefix: "");
2740
2741 // Now, compare the new count to zero. If it is zero skip the vector loop and
2742 // jump to the scalar loop. This check also covers the case where the
2743 // backedge-taken count is uint##_max: adding one to it will overflow leading
2744 // to an incorrect trip count of zero. In this (rare) case we will also jump
2745 // to the scalar loop.
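 // For example, with an i32 trip count and a backedge-taken count of
 // 0xFFFFFFFF, the computed trip count wraps to 0 and the check sends
 // execution straight to the scalar loop.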
2746 emitIterationCountCheck(Bypass: LoopScalarPreHeader);
2747
2748 // Generate the code to check any assumptions that we've made for SCEV
2749 // expressions.
2750 emitSCEVChecks(Bypass: LoopScalarPreHeader);
2751
2752 // Generate the code that checks at runtime whether arrays overlap. We put
2753 // the checks into a separate block to make the more common case of few
2754 // elements faster.
2755 emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);
2756
2757 // Emit phis for the new starting index of the scalar loop.
2758 createInductionResumeValues(ExpandedSCEVs);
2759
2760 return {LoopVectorPreHeader, nullptr};
2761}
2762
2763// Fix up external users of the induction variable. At this point, we are
2764// in LCSSA form, with all external PHIs that use the IV having one input value,
2765// coming from the remainder loop. We need those PHIs to also have a correct
2766// value for the IV when arriving directly from the middle block.
2767void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
2768 const InductionDescriptor &II,
2769 Value *VectorTripCount, Value *EndValue,
2770 BasicBlock *MiddleBlock,
2771 BasicBlock *VectorHeader, VPlan &Plan,
2772 VPTransformState &State) {
2773 // There are two kinds of external IV usages - those that use the value
2774 // computed in the last iteration (the PHI) and those that use the penultimate
2775 // value (the value that feeds into the phi from the loop latch).
2776 // We allow both, but they, obviously, have different values.
2777
2778 assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
2779
2780 DenseMap<Value *, Value *> MissingVals;
2781
2782 // An external user of the last iteration's value should see the value that
2783 // the remainder loop uses to initialize its own IV.
2784 Value *PostInc = OrigPhi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
2785 for (User *U : PostInc->users()) {
2786 Instruction *UI = cast<Instruction>(Val: U);
2787 if (!OrigLoop->contains(Inst: UI)) {
2788 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2789 MissingVals[UI] = EndValue;
2790 }
2791 }
2792
2793 // An external user of the penultimate value needs to see EndValue - Step.
2794 // The simplest way to get this is to recompute it from the constituent SCEVs,
2795 // that is Start + (Step * (CRD - 1)).
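 // For example, the escaping value seen by such a user is
 // Start + Step * (VectorTripCount - 1); with start 0 and step 1 this is
 // simply VectorTripCount - 1 ("cmo" below).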
2796 for (User *U : OrigPhi->users()) {
2797 auto *UI = cast<Instruction>(Val: U);
2798 if (!OrigLoop->contains(Inst: UI)) {
2799 assert(isa<PHINode>(UI) && "Expected LCSSA form");
2800 IRBuilder<> B(MiddleBlock->getTerminator());
2801
2802 // Fast-math-flags propagate from the original induction instruction.
2803 if (II.getInductionBinOp() && isa<FPMathOperator>(Val: II.getInductionBinOp()))
2804 B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
2805
2806 Value *CountMinusOne = B.CreateSub(
2807 LHS: VectorTripCount, RHS: ConstantInt::get(Ty: VectorTripCount->getType(), V: 1));
2808 CountMinusOne->setName("cmo");
2809
2810 VPValue *StepVPV = Plan.getSCEVExpansion(S: II.getStep());
2811 assert(StepVPV && "step must have been expanded during VPlan execution");
2812 Value *Step = StepVPV->isLiveIn() ? StepVPV->getLiveInIRValue()
2813 : State.get(Def: StepVPV, Instance: {0, 0});
2814 Value *Escape =
2815 emitTransformedIndex(B, Index: CountMinusOne, StartValue: II.getStartValue(), Step,
2816 InductionKind: II.getKind(), InductionBinOp: II.getInductionBinOp());
2817 Escape->setName("ind.escape");
2818 MissingVals[UI] = Escape;
2819 }
2820 }
2821
2822 for (auto &I : MissingVals) {
2823 PHINode *PHI = cast<PHINode>(Val: I.first);
2824 // One corner case we have to handle is two IVs "chasing" each other,
2825 // that is %IV2 = phi [...], [ %IV1, %latch ]
2826 // In this case, if IV1 has an external use, we need to avoid adding both
2827 // "last value of IV1" and "penultimate value of IV2". So, verify that we
2828 // don't already have an incoming value for the middle block.
2829 if (PHI->getBasicBlockIndex(BB: MiddleBlock) == -1) {
2830 PHI->addIncoming(V: I.second, BB: MiddleBlock);
2831 Plan.removeLiveOut(PN: PHI);
2832 }
2833 }
2834}
2835
2836namespace {
2837
2838struct CSEDenseMapInfo {
2839 static bool canHandle(const Instruction *I) {
2840 return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
2841 isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
2842 }
2843
2844 static inline Instruction *getEmptyKey() {
2845 return DenseMapInfo<Instruction *>::getEmptyKey();
2846 }
2847
2848 static inline Instruction *getTombstoneKey() {
2849 return DenseMapInfo<Instruction *>::getTombstoneKey();
2850 }
2851
2852 static unsigned getHashValue(const Instruction *I) {
2853 assert(canHandle(I) && "Unknown instruction!");
2854 return hash_combine(args: I->getOpcode(), args: hash_combine_range(first: I->value_op_begin(),
2855 last: I->value_op_end()));
2856 }
2857
2858 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2859 if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
2860 LHS == getTombstoneKey() || RHS == getTombstoneKey())
2861 return LHS == RHS;
2862 return LHS->isIdenticalTo(I: RHS);
2863 }
2864};
2865
2866} // end anonymous namespace
2867
2868/// Perform CSE of induction variable instructions.
2869static void cse(BasicBlock *BB) {
2870 // Perform simple cse.
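 // For example, two structurally identical getelementptrs computing the same
 // address (a common by-product of widening induction users) hash to the same
 // key, so the later one is replaced by the earlier one and erased.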
2871 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2872 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2873 if (!CSEDenseMapInfo::canHandle(I: &In))
2874 continue;
2875
2876 // Check if we can replace this instruction with any of the
2877 // visited instructions.
2878 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2879 In.replaceAllUsesWith(V);
2880 In.eraseFromParent();
2881 continue;
2882 }
2883
2884 CSEMap[&In] = &In;
2885 }
2886}
2887
2888InstructionCost
2889LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2890 ElementCount VF) const {
2891 // We only need to calculate a cost if the VF is scalar; for actual vectors
2892 // we should already have a pre-calculated cost at each VF.
2893 if (!VF.isScalar())
2894 return CallWideningDecisions.at(Val: std::make_pair(x&: CI, y&: VF)).Cost;
2895
2896 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2897 Type *RetTy = CI->getType();
2898 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
2899 if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy, CostKind))
2900 return *RedCost;
2901
2902 SmallVector<Type *, 4> Tys;
2903 for (auto &ArgOp : CI->args())
2904 Tys.push_back(Elt: ArgOp->getType());
2905
2906 InstructionCost ScalarCallCost =
2907 TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);
2908
2909 // If this is an intrinsic we may have a lower cost for it.
2910 if (getVectorIntrinsicIDForCall(CI, TLI)) {
2911 InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
2912 return std::min(a: ScalarCallCost, b: IntrinsicCost);
2913 }
2914 return ScalarCallCost;
2915}
2916
2917static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
2918 if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
2919 return Elt;
2920 return VectorType::get(ElementType: Elt, EC: VF);
2921}
2922
2923InstructionCost
2924LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2925 ElementCount VF) const {
2926 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2927 assert(ID && "Expected intrinsic call!");
2928 Type *RetTy = MaybeVectorizeType(Elt: CI->getType(), VF);
2929 FastMathFlags FMF;
2930 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
2931 FMF = FPMO->getFastMathFlags();
2932
2933 SmallVector<const Value *> Arguments(CI->args());
2934 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2935 SmallVector<Type *> ParamTys;
2936 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
2937 result: std::back_inserter(x&: ParamTys),
2938 unary_op: [&](Type *Ty) { return MaybeVectorizeType(Elt: Ty, VF); });
2939
2940 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2941 dyn_cast<IntrinsicInst>(Val: CI));
2942 return TTI.getIntrinsicInstrCost(ICA: CostAttrs,
2943 CostKind: TargetTransformInfo::TCK_RecipThroughput);
2944}
2945
2946void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
2947 VPlan &Plan) {
2948 // Fix widened non-induction PHIs by setting up the PHI operands.
2949 if (EnableVPlanNativePath)
2950 fixNonInductionPHIs(Plan, State);
2951
2952 // Forget the original basic block.
2953 PSE.getSE()->forgetLoop(L: OrigLoop);
2954 PSE.getSE()->forgetBlockAndLoopDispositions();
2955
2956 // After vectorization, the exit blocks of the original loop will have
2957 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
2958 // looked through single-entry phis.
2959 SmallVector<BasicBlock *> ExitBlocks;
2960 OrigLoop->getExitBlocks(ExitBlocks);
2961 for (BasicBlock *Exit : ExitBlocks)
2962 for (PHINode &PN : Exit->phis())
2963 PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN);
2964
2965 VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
2966 VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
2967 Loop *VectorLoop = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[LatchVPBB]);
2968 if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
2969 // No edge from the middle block to the unique exit block has been inserted
2970 // and there is nothing to fix from vector loop; phis should have incoming
2971 // from scalar loop only.
2972 } else {
2973 // TODO: Check VPLiveOuts to see if IV users need fixing instead of checking
2974 // the cost model.
2975
2976 // If we inserted an edge from the middle block to the unique exit block,
2977 // update uses outside the loop (phis) to account for the newly inserted
2978 // edge.
2979
2980 // Fix-up external users of the induction variables.
2981 for (const auto &Entry : Legal->getInductionVars())
2982 fixupIVUsers(OrigPhi: Entry.first, II: Entry.second,
2983 VectorTripCount: getOrCreateVectorTripCount(InsertBlock: VectorLoop->getLoopPreheader()),
2984 EndValue: IVEndValues[Entry.first], MiddleBlock: LoopMiddleBlock,
2985 VectorHeader: VectorLoop->getHeader(), Plan, State);
2986 }
2987
2988 // Fix live-out phis not already fixed earlier.
2989 for (const auto &KV : Plan.getLiveOuts())
2990 KV.second->fixPhi(Plan, State);
2991
2992 for (Instruction *PI : PredicatedInstructions)
2993 sinkScalarOperands(PredInst: &*PI);
2994
2995 // Remove redundant induction instructions.
2996 cse(BB: VectorLoop->getHeader());
2997
2998 // Set/update profile weights for the vector and remainder loops as original
2999 // loop iterations are now distributed among them. Note that original loop
3000 // represented by LoopScalarBody becomes remainder loop after vectorization.
3001 //
3002 // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3003 // end up with a slightly roughened result, but that should be OK since the
3004 // profile is not inherently precise anyway. Note also that a possible bypass
3005 // of the vector code caused by legality checks is ignored, optimistically
3006 // assigning all the weight to the vector loop.
3007 //
3008 // For scalable vectorization we can't know at compile time how many
3009 // iterations of the loop are handled in one vector iteration, so instead
3010 // assume a pessimistic vscale of '1'.
3011 setProfileInfoAfterUnrolling(OrigLoop: LI->getLoopFor(BB: LoopScalarBody), UnrolledLoop: VectorLoop,
3012 RemainderLoop: LI->getLoopFor(BB: LoopScalarBody),
3013 UF: VF.getKnownMinValue() * UF);
3014}
3015
3016void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3017 // The basic block and loop containing the predicated instruction.
3018 auto *PredBB = PredInst->getParent();
3019 auto *VectorLoop = LI->getLoopFor(BB: PredBB);
3020
3021 // Initialize a worklist with the operands of the predicated instruction.
3022 SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3023
3024 // Holds instructions that we need to analyze again. An instruction may be
3025 // reanalyzed if we don't yet know if we can sink it or not.
3026 SmallVector<Instruction *, 8> InstsToReanalyze;
3027
3028 // Returns true if a given use occurs in the predicated block. Phi nodes use
3029 // their operands in their corresponding predecessor blocks.
3030 auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3031 auto *I = cast<Instruction>(Val: U.getUser());
3032 BasicBlock *BB = I->getParent();
3033 if (auto *Phi = dyn_cast<PHINode>(Val: I))
3034 BB = Phi->getIncomingBlock(
3035 i: PHINode::getIncomingValueNumForOperand(i: U.getOperandNo()));
3036 return BB == PredBB;
3037 };
3038
3039 // Iteratively sink the scalarized operands of the predicated instruction
3040 // into the block we created for it. When an instruction is sunk, its
3041 // operands are then added to the worklist. The algorithm ends after one pass
3042 // through the worklist doesn't sink a single instruction.
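 // For example, once the address computation feeding a predicated access has
 // been sunk into the predicated block, its index computation may become
 // sinkable as well, which is why we keep iterating until nothing changes.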
3043 bool Changed;
3044 do {
3045 // Add the instructions that need to be reanalyzed to the worklist, and
3046 // reset the changed indicator.
3047 Worklist.insert(Start: InstsToReanalyze.begin(), End: InstsToReanalyze.end());
3048 InstsToReanalyze.clear();
3049 Changed = false;
3050
3051 while (!Worklist.empty()) {
3052 auto *I = dyn_cast<Instruction>(Val: Worklist.pop_back_val());
3053
3054 // We can't sink an instruction if it is a phi node, is not in the loop,
3055 // may have side effects or may read from memory.
3056 // TODO: Could do more granular checking to allow sinking a load past non-store instructions.
3057 if (!I || isa<PHINode>(Val: I) || !VectorLoop->contains(Inst: I) ||
3058 I->mayHaveSideEffects() || I->mayReadFromMemory())
3059 continue;
3060
3061 // If the instruction is already in PredBB, check if we can sink its
3062 // operands. In that case, VPlan's sinkScalarOperands() succeeded in
3063 // sinking the scalar instruction I, hence it appears in PredBB; but it
3064 // may have failed to sink I's operands (recursively), which we try
3065 // (again) here.
3066 if (I->getParent() == PredBB) {
3067 Worklist.insert(Start: I->op_begin(), End: I->op_end());
3068 continue;
3069 }
3070
3071 // It's legal to sink the instruction if all its uses occur in the
3072 // predicated block. Otherwise, there's nothing to do yet, and we may
3073 // need to reanalyze the instruction.
3074 if (!llvm::all_of(Range: I->uses(), P: isBlockOfUsePredicated)) {
3075 InstsToReanalyze.push_back(Elt: I);
3076 continue;
3077 }
3078
3079 // Move the instruction to the beginning of the predicated block, and add
3080 // its operands to the worklist.
3081 I->moveBefore(MovePos: &*PredBB->getFirstInsertionPt());
3082 Worklist.insert(Start: I->op_begin(), End: I->op_end());
3083
3084 // The sinking may have enabled other instructions to be sunk, so we will
3085 // need to iterate.
3086 Changed = true;
3087 }
3088 } while (Changed);
3089}
3090
3091void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
3092 VPTransformState &State) {
3093 auto Iter = vp_depth_first_deep(G: Plan.getEntry());
3094 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
3095 for (VPRecipeBase &P : VPBB->phis()) {
3096 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
3097 if (!VPPhi)
3098 continue;
3099 PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi, Part: 0));
3100 // Make sure the builder has a valid insert point.
3101 Builder.SetInsertPoint(NewPhi);
3102 for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
3103 VPValue *Inc = VPPhi->getIncomingValue(I: i);
3104 VPBasicBlock *VPBB = VPPhi->getIncomingBlock(I: i);
3105 NewPhi->addIncoming(V: State.get(Def: Inc, Part: 0), BB: State.CFG.VPBB2IRBB[VPBB]);
3106 }
3107 }
3108 }
3109}
3110
3111void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
3112 // We should not collect Scalars more than once per VF. Right now, this
3113 // function is called from collectUniformsAndScalars(), which already does
3114 // this check. Collecting Scalars for VF=1 does not make any sense.
3115 assert(VF.isVector() && !Scalars.contains(VF) &&
3116 "This function should not be visited twice for the same VF");
3117
3118 // This avoids any chances of creating a REPLICATE recipe during planning
3119 // since that would result in generation of scalarized code during execution,
3120 // which is not supported for scalable vectors.
3121 if (VF.isScalable()) {
3122 Scalars[VF].insert(I: Uniforms[VF].begin(), E: Uniforms[VF].end());
3123 return;
3124 }
3125
3126 SmallSetVector<Instruction *, 8> Worklist;
3127
3128 // These sets are used to seed the analysis with pointers used by memory
3129 // accesses that will remain scalar.
3130 SmallSetVector<Instruction *, 8> ScalarPtrs;
3131 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
3132 auto *Latch = TheLoop->getLoopLatch();
3133
3134 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
3135 // The pointer operands of loads and stores will be scalar as long as the
3136 // memory access is not a gather or scatter operation. The value operand of a
3137 // store will remain scalar if the store is scalarized.
3138 auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
3139 InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
3140 assert(WideningDecision != CM_Unknown &&
3141 "Widening decision should be ready at this moment");
3142 if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
3143 if (Ptr == Store->getValueOperand())
3144 return WideningDecision == CM_Scalarize;
3145 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
3146 "Ptr is neither a value nor a pointer operand");
3147 return WideningDecision != CM_GatherScatter;
3148 };
3149
3150 // A helper that returns true if the given value is a bitcast or
3151 // getelementptr instruction contained in the loop.
3152 auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
3153 return ((isa<BitCastInst>(Val: V) && V->getType()->isPointerTy()) ||
3154 isa<GetElementPtrInst>(Val: V)) &&
3155 !TheLoop->isLoopInvariant(V);
3156 };
3157
3158 // A helper that evaluates a memory access's use of a pointer. If the use will
3159 // be a scalar use and the pointer is only used by memory accesses, we place
3160 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
3161 // PossibleNonScalarPtrs.
3162 auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
3163 // We only care about bitcast and getelementptr instructions contained in
3164 // the loop.
3165 if (!isLoopVaryingBitCastOrGEP(Ptr))
3166 return;
3167
3168 // If the pointer has already been identified as scalar (e.g., if it was
3169 // also identified as uniform), there's nothing to do.
3170 auto *I = cast<Instruction>(Val: Ptr);
3171 if (Worklist.count(key: I))
3172 return;
3173
3174 // If the use of the pointer will be a scalar use, and all users of the
3175 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
3176 // place the pointer in PossibleNonScalarPtrs.
3177 if (isScalarUse(MemAccess, Ptr) && llvm::all_of(Range: I->users(), P: [&](User *U) {
3178 return isa<LoadInst>(Val: U) || isa<StoreInst>(Val: U);
3179 }))
3180 ScalarPtrs.insert(X: I);
3181 else
3182 PossibleNonScalarPtrs.insert(Ptr: I);
3183 };
3184
3185 // We seed the scalars analysis with two classes of instructions: (1)
3186 // instructions marked uniform-after-vectorization and (2) bitcast,
3187 // getelementptr and (pointer) phi instructions used by memory accesses
3188 // requiring a scalar use.
3189 //
3190 // (1) Add to the worklist all instructions that have been identified as
3191 // uniform-after-vectorization.
3192 Worklist.insert(Start: Uniforms[VF].begin(), End: Uniforms[VF].end());
3193
3194 // (2) Add to the worklist all bitcast and getelementptr instructions used by
3195 // memory accesses requiring a scalar use. The pointer operands of loads and
3196 // stores will be scalar as long as the memory access is not a gather or
3197 // scatter operation. The value operand of a store will remain scalar if the
3198 // store is scalarized.
3199 for (auto *BB : TheLoop->blocks())
3200 for (auto &I : *BB) {
3201 if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
3202 evaluatePtrUse(Load, Load->getPointerOperand());
3203 } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
3204 evaluatePtrUse(Store, Store->getPointerOperand());
3205 evaluatePtrUse(Store, Store->getValueOperand());
3206 }
3207 }
3208 for (auto *I : ScalarPtrs)
3209 if (!PossibleNonScalarPtrs.count(Ptr: I)) {
3210 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
3211 Worklist.insert(X: I);
3212 }
3213
3214 // Insert the forced scalars.
3215 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
3216 // induction variable when the PHI user is scalarized.
3217 auto ForcedScalar = ForcedScalars.find(Val: VF);
3218 if (ForcedScalar != ForcedScalars.end())
3219 for (auto *I : ForcedScalar->second) {
3220 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
3221 Worklist.insert(X: I);
3222 }
3223
3224 // Expand the worklist by looking through any bitcasts and getelementptr
3225 // instructions we've already identified as scalar. This is similar to the
3226 // expansion step in collectLoopUniforms(); however, here we're only
3227 // expanding to include additional bitcasts and getelementptr instructions.
3228 unsigned Idx = 0;
3229 while (Idx != Worklist.size()) {
3230 Instruction *Dst = Worklist[Idx++];
3231 if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(i: 0)))
3232 continue;
3233 auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
3234 if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
3235 auto *J = cast<Instruction>(Val: U);
3236 return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
3237 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
3238 isScalarUse(J, Src));
3239 })) {
3240 Worklist.insert(X: Src);
3241 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
3242 }
3243 }
3244
3245 // An induction variable will remain scalar if all users of the induction
3246 // variable and induction variable update remain scalar.
3247 for (const auto &Induction : Legal->getInductionVars()) {
3248 auto *Ind = Induction.first;
3249 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
3250
3251 // If tail-folding is applied, the primary induction variable will be used
3252 // to feed a vector compare.
3253 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
3254 continue;
3255
3256 // Returns true if \p Indvar is a pointer induction that is used directly by
3257 // load/store instruction \p I.
3258 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
3259 Instruction *I) {
3260 return Induction.second.getKind() ==
3261 InductionDescriptor::IK_PtrInduction &&
3262 (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
3263 Indvar == getLoadStorePointerOperand(V: I) && isScalarUse(I, Indvar);
3264 };
3265
3266 // Determine if all users of the induction variable are scalar after
3267 // vectorization.
3268 auto ScalarInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
3269 auto *I = cast<Instruction>(Val: U);
3270 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3271 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3272 });
3273 if (!ScalarInd)
3274 continue;
3275
3276 // If the induction variable update is a fixed-order recurrence, neither the
3277 // induction variable nor its update should be marked scalar after
3278 // vectorization.
3279 auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
3280 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
3281 continue;
3282
3283 // Determine if all users of the induction variable update instruction are
3284 // scalar after vectorization.
3285 auto ScalarIndUpdate =
3286 llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
3287 auto *I = cast<Instruction>(Val: U);
3288 return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3289 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3290 });
3291 if (!ScalarIndUpdate)
3292 continue;
3293
3294 // The induction variable and its update instruction will remain scalar.
3295 Worklist.insert(X: Ind);
3296 Worklist.insert(X: IndUpdate);
3297 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
3298 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
3299 << "\n");
3300 }
3301
3302 Scalars[VF].insert(I: Worklist.begin(), E: Worklist.end());
3303}
3304
3305bool LoopVectorizationCostModel::isScalarWithPredication(
3306 Instruction *I, ElementCount VF) const {
3307 if (!isPredicatedInst(I))
3308 return false;
3309
3310 // Do we have a non-scalar lowering for this predicated
3311 // instruction? No - it is scalar with predication.
3312 switch (I->getOpcode()) {
3313 default:
3314 return true;
3315 case Instruction::Call:
3316 if (VF.isScalar())
3317 return true;
3318 return CallWideningDecisions.at(Val: std::make_pair(x: cast<CallInst>(Val: I), y&: VF))
3319 .Kind == CM_Scalarize;
3320 case Instruction::Load:
3321 case Instruction::Store: {
3322 auto *Ptr = getLoadStorePointerOperand(V: I);
3323 auto *Ty = getLoadStoreType(I);
3324 Type *VTy = Ty;
3325 if (VF.isVector())
3326 VTy = VectorType::get(ElementType: Ty, EC: VF);
3327 const Align Alignment = getLoadStoreAlignment(I);
3328 return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment) ||
3329 TTI.isLegalMaskedGather(DataType: VTy, Alignment))
3330 : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment) ||
3331 TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
3332 }
3333 case Instruction::UDiv:
3334 case Instruction::SDiv:
3335 case Instruction::SRem:
3336 case Instruction::URem: {
3337 // We have the option to use the safe-divisor idiom to avoid predication.
3338 // The cost based decision here will always select safe-divisor for
3339 // scalable vectors as scalarization isn't legal.
3340 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
3341 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
3342 }
3343 }
3344}
3345
3346bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
3347 if (!blockNeedsPredicationForAnyReason(BB: I->getParent()))
3348 return false;
3349
3350 // Can we prove this instruction is safe to unconditionally execute?
3351 // If not, we must use some form of predication.
3352 switch (I->getOpcode()) {
3353 default:
3354 return false;
3355 case Instruction::Load:
3356 case Instruction::Store: {
3357 if (!Legal->isMaskRequired(I))
3358 return false;
3359 // When we know the load's address is loop invariant and the instruction
3360 // in the original scalar loop was unconditionally executed then we
3361 // don't need to mark it as a predicated instruction. Tail folding may
3362 // introduce additional predication, but we're guaranteed to always have
3363 // at least one active lane. We call Legal->blockNeedsPredication here
3364 // because it doesn't query tail-folding. For stores, we must prove both
3365 // speculation safety (which follows from the same argument as loads) and
3366 // that the value being stored is correct. The easiest form of the latter
3367 // is to require that all values stored are the same.
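 // For example, an unconditional store of a loop-invariant value to a
 // loop-invariant address (a scalar accumulator written on every iteration)
 // satisfies both conditions and does not need to be predicated even when the
 // tail is folded.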
3368 if (Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
3369 (isa<LoadInst>(Val: I) ||
3370 (isa<StoreInst>(Val: I) &&
3371 TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()))) &&
3372 !Legal->blockNeedsPredication(BB: I->getParent()))
3373 return false;
3374 return true;
3375 }
3376 case Instruction::UDiv:
3377 case Instruction::SDiv:
3378 case Instruction::SRem:
3379 case Instruction::URem:
3380 // TODO: We can use the loop-preheader as context point here and get
3381 // context sensitive reasoning
3382 return !isSafeToSpeculativelyExecute(I);
3383 case Instruction::Call:
3384 return Legal->isMaskRequired(I);
3385 }
3386}
3387
3388std::pair<InstructionCost, InstructionCost>
3389LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
3390 ElementCount VF) const {
3391 assert(I->getOpcode() == Instruction::UDiv ||
3392 I->getOpcode() == Instruction::SDiv ||
3393 I->getOpcode() == Instruction::SRem ||
3394 I->getOpcode() == Instruction::URem);
3395 assert(!isSafeToSpeculativelyExecute(I));
3396
3397 const TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3398
3399 // Scalarization isn't legal for scalable vector types
3400 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
3401 if (!VF.isScalable()) {
3402 // Get the scalarization cost and scale this amount by the probability of
3403 // executing the predicated block. If the instruction is not predicated,
3404 // we fall through to the next case.
3405 ScalarizationCost = 0;
3406
3407 // These instructions have a non-void type, so account for the phi nodes
3408 // that we will create. This cost is likely to be zero. The phi node
3409 // cost, if any, should be scaled by the block probability because it
3410 // models a copy at the end of each predicated block.
3411 ScalarizationCost += VF.getKnownMinValue() *
3412 TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
3413
3414 // The cost of the non-predicated instruction.
3415 ScalarizationCost += VF.getKnownMinValue() *
3416 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);
3417
3418 // The cost of insertelement and extractelement instructions needed for
3419 // scalarization.
3420 ScalarizationCost += getScalarizationOverhead(I, VF, CostKind);
3421
3422 // Scale the cost by the probability of executing the predicated blocks.
3423 // This assumes the predicated block for each vector lane is equally
3424 // likely.
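 // getReciprocalPredBlockProb() currently returns 2, i.e. each predicated
 // block is assumed to execute for roughly half of the vector lanes.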
3425 ScalarizationCost = ScalarizationCost / getReciprocalPredBlockProb();
3426 }
3427 InstructionCost SafeDivisorCost = 0;
3428
3429 auto *VecTy = ToVectorTy(Scalar: I->getType(), EC: VF);
3430
3431 // The cost of the select guard to ensure all lanes are well defined
3432 // after we speculate above any internal control flow.
3433 SafeDivisorCost += TTI.getCmpSelInstrCost(
3434 Opcode: Instruction::Select, ValTy: VecTy,
3435 CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
3436 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
3437
3438 // Certain instructions can be cheaper to vectorize if they have a constant
3439 // second vector operand. One example of this are shifts on x86.
3440 Value *Op2 = I->getOperand(i: 1);
3441 auto Op2Info = TTI.getOperandInfo(V: Op2);
3442 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
3443 Legal->isInvariant(V: Op2))
3444 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
3445
3446 SmallVector<const Value *, 4> Operands(I->operand_values());
3447 SafeDivisorCost += TTI.getArithmeticInstrCost(
3448 Opcode: I->getOpcode(), Ty: VecTy, CostKind,
3449 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
3450 Opd2Info: Op2Info, Args: Operands, CxtI: I);
3451 return {ScalarizationCost, SafeDivisorCost};
3452}
3453
3454bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3455 Instruction *I, ElementCount VF) const {
3456 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3457 assert(getWideningDecision(I, VF) == CM_Unknown &&
3458 "Decision should not be set yet.");
3459 auto *Group = getInterleavedAccessGroup(Instr: I);
3460 assert(Group && "Must have a group.");
3461
3462 // If the instruction's allocated size doesn't equal its type size, it
3463 // requires padding and will be scalarized.
3464 auto &DL = I->getDataLayout();
3465 auto *ScalarTy = getLoadStoreType(I);
3466 if (hasIrregularType(Ty: ScalarTy, DL))
3467 return false;
3468
3469 // If the group involves a non-integral pointer, we may not be able to
3470 // losslessly cast all values to a common type.
3471 unsigned InterleaveFactor = Group->getFactor();
3472 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
3473 for (unsigned i = 0; i < InterleaveFactor; i++) {
3474 Instruction *Member = Group->getMember(Index: i);
3475 if (!Member)
3476 continue;
3477 auto *MemberTy = getLoadStoreType(I: Member);
3478 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
3479 // Don't coerce non-integral pointers to integers or vice versa.
3480 if (MemberNI != ScalarNI) {
3481 // TODO: Consider adding special nullptr value case here
3482 return false;
3483 } else if (MemberNI && ScalarNI &&
3484 ScalarTy->getPointerAddressSpace() !=
3485 MemberTy->getPointerAddressSpace()) {
3486 return false;
3487 }
3488 }
3489
3490 // Check if masking is required.
3491 // A Group may need masking for one of two reasons: it resides in a block that
3492 // needs predication, or it was decided to use masking to deal with gaps
3493 // (either a gap at the end of a load-access that may result in a speculative
3494 // load, or any gaps in a store-access).
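 // For example, a store group whose members do not cover every interleave
 // slot writes with gaps and therefore needs masking, while a load group with
 // a trailing gap only needs masking when the scalar epilogue that would
 // otherwise absorb the final (potentially speculative) loads is not allowed.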
3495 bool PredicatedAccessRequiresMasking =
3496 blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
3497 Legal->isMaskRequired(I);
3498 bool LoadAccessWithGapsRequiresEpilogMasking =
3499 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
3500 !isScalarEpilogueAllowed();
3501 bool StoreAccessWithGapsRequiresMasking =
3502 isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor());
3503 if (!PredicatedAccessRequiresMasking &&
3504 !LoadAccessWithGapsRequiresEpilogMasking &&
3505 !StoreAccessWithGapsRequiresMasking)
3506 return true;
3507
3508 // If masked interleaving is required, we expect that the user/target had
3509 // enabled it, because otherwise it either wouldn't have been created or
3510 // it should have been invalidated by the CostModel.
3511 assert(useMaskedInterleavedAccesses(TTI) &&
3512 "Masked interleave-groups for predicated accesses are not enabled.");
3513
3514 if (Group->isReverse())
3515 return false;
3516
3517 auto *Ty = getLoadStoreType(I);
3518 const Align Alignment = getLoadStoreAlignment(I);
3519 return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment)
3520 : TTI.isLegalMaskedStore(DataType: Ty, Alignment);
3521}
3522
3523bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3524 Instruction *I, ElementCount VF) {
3525 // Get and ensure we have a valid memory instruction.
3526 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3527
3528 auto *Ptr = getLoadStorePointerOperand(V: I);
3529 auto *ScalarTy = getLoadStoreType(I);
3530
3531 // In order to be widened, the pointer should be consecutive, first of all.
3532 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3533 return false;
3534
3535 // If the instruction is a store located in a predicated block, it will be
3536 // scalarized.
3537 if (isScalarWithPredication(I, VF))
3538 return false;
3539
3540 // If the instruction's allocated size doesn't equal its type size, it
3541 // requires padding and will be scalarized.
3542 auto &DL = I->getDataLayout();
3543 if (hasIrregularType(Ty: ScalarTy, DL))
3544 return false;
3545
3546 return true;
3547}
3548
3549void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
3550 // We should not collect Uniforms more than once per VF. Right now,
3551 // this function is called from collectUniformsAndScalars(), which
3552 // already does this check. Collecting Uniforms for VF=1 does not make any
3553 // sense.
3554
3555 assert(VF.isVector() && !Uniforms.contains(VF) &&
3556 "This function should not be visited twice for the same VF");
3557
3558 // Visit the list of Uniforms. If we do not find any uniform value, we will
3559 // not analyze it again; Uniforms.count(VF) will still return 1.
3560 Uniforms[VF].clear();
3561
3562 // We now know that the loop is vectorizable!
3563 // Collect instructions inside the loop that will remain uniform after
3564 // vectorization.
3565
3566 // Global values, params and instructions outside of current loop are out of
3567 // scope.
3568 auto isOutOfScope = [&](Value *V) -> bool {
3569 Instruction *I = dyn_cast<Instruction>(Val: V);
3570 return (!I || !TheLoop->contains(Inst: I));
3571 };
3572
3573 // Worklist containing uniform instructions demanding lane 0.
3574 SetVector<Instruction *> Worklist;
3575
3576 // Add uniform instructions demanding lane 0 to the worklist. Instructions
3577 // that require predication must not be considered uniform after
3578 // vectorization, because that would create an erroneous replicating region
3579 // where only a single instance out of VF should be formed.
3580 auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
3581 if (isOutOfScope(I)) {
3582 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
3583 << *I << "\n");
3584 return;
3585 }
3586 if (isPredicatedInst(I)) {
3587 LLVM_DEBUG(
3588 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
3589 << "\n");
3590 return;
3591 }
3592 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
3593 Worklist.insert(X: I);
3594 };
3595
3596 // Start with the conditional branches exiting the loop. If the branch
3597 // condition is an instruction contained in the loop that is only used by the
3598 // branch, it is uniform.
3599 SmallVector<BasicBlock *> Exiting;
3600 TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
3601 for (BasicBlock *E : Exiting) {
3602 auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
3603 if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
3604 addToWorklistIfAllowed(Cmp);
3605 }
3606
3607 auto PrevVF = VF.divideCoefficientBy(RHS: 2);
3608 // Return true if all lanes perform the same memory operation, and we can
3609 // thus choose to execute only one.
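 // For example, a load of a loop-invariant address, or a store of a
 // loop-invariant value to a loop-invariant address, does the same work in
 // every lane, so emitting a single scalar copy is sufficient.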
3610 auto isUniformMemOpUse = [&](Instruction *I) {
3611 // If the value was already known to not be uniform for the previous
3612 // (smaller VF), it cannot be uniform for the larger VF.
3613 if (PrevVF.isVector()) {
3614 auto Iter = Uniforms.find(Val: PrevVF);
3615 if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
3616 return false;
3617 }
3618 if (!Legal->isUniformMemOp(I&: *I, VF))
3619 return false;
3620 if (isa<LoadInst>(Val: I))
3621 // Loading the same address always produces the same result - at least
3622 // assuming aliasing and ordering which have already been checked.
3623 return true;
3624 // Storing the same value on every iteration.
3625 return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
3626 };
3627
3628 auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
3629 InstWidening WideningDecision = getWideningDecision(I, VF);
3630 assert(WideningDecision != CM_Unknown &&
3631 "Widening decision should be ready at this moment");
3632
3633 if (isUniformMemOpUse(I))
3634 return true;
3635
3636 return (WideningDecision == CM_Widen ||
3637 WideningDecision == CM_Widen_Reverse ||
3638 WideningDecision == CM_Interleave);
3639 };
3640
3641 // Returns true if Ptr is the pointer operand of a memory access instruction
3642 // I, I is known to not require scalarization, and the pointer is not also
3643 // stored.
3644 auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
3645 if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
3646 return false;
3647 return getLoadStorePointerOperand(V: I) == Ptr &&
3648 (isUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
3649 };
3650
3651 // Holds a list of values which are known to have at least one uniform use.
3652 // Note that there may be other uses which aren't uniform. A "uniform use"
3653 // here is something which only demands lane 0 of the unrolled iterations;
3654 // it does not imply that all lanes produce the same value (e.g. this is not
3655 // the usual meaning of uniform)
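 // For instance, the address operand of a consecutive widened load is only
 // needed for lane 0 (the remaining lanes follow from the access), so that
 // use is uniform even if the address has other, non-uniform uses.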
3656 SetVector<Value *> HasUniformUse;
3657
3658 // Scan the loop for instructions which are either a) known to have only
3659 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
3660 for (auto *BB : TheLoop->blocks())
3661 for (auto &I : *BB) {
3662 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
3663 switch (II->getIntrinsicID()) {
3664 case Intrinsic::sideeffect:
3665 case Intrinsic::experimental_noalias_scope_decl:
3666 case Intrinsic::assume:
3667 case Intrinsic::lifetime_start:
3668 case Intrinsic::lifetime_end:
3669 if (TheLoop->hasLoopInvariantOperands(I: &I))
3670 addToWorklistIfAllowed(&I);
3671 break;
3672 default:
3673 break;
3674 }
3675 }
3676
3677 // ExtractValue instructions must be uniform, because the operands are
3678 // known to be loop-invariant.
3679 if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
3680 assert(isOutOfScope(EVI->getAggregateOperand()) &&
3681 "Expected aggregate value to be loop invariant");
3682 addToWorklistIfAllowed(EVI);
3683 continue;
3684 }
3685
3686 // If there's no pointer operand, there's nothing to do.
3687 auto *Ptr = getLoadStorePointerOperand(V: &I);
3688 if (!Ptr)
3689 continue;
3690
3691 if (isUniformMemOpUse(&I))
3692 addToWorklistIfAllowed(&I);
3693
3694 if (isVectorizedMemAccessUse(&I, Ptr))
3695 HasUniformUse.insert(X: Ptr);
3696 }
3697
3698 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
3699 // demanding) users. Since loops are assumed to be in LCSSA form, this
3700 // disallows uses outside the loop as well.
3701 for (auto *V : HasUniformUse) {
3702 if (isOutOfScope(V))
3703 continue;
3704 auto *I = cast<Instruction>(Val: V);
3705 auto UsersAreMemAccesses =
3706 llvm::all_of(Range: I->users(), P: [&](User *U) -> bool {
3707 return isVectorizedMemAccessUse(cast<Instruction>(Val: U), V);
3708 });
3709 if (UsersAreMemAccesses)
3710 addToWorklistIfAllowed(I);
3711 }
3712
3713 // Expand Worklist in topological order: whenever a new instruction
3714 // is added, its users should already be inside Worklist. This ensures
3715 // that a uniform instruction will only be used by uniform instructions.
3716 unsigned idx = 0;
3717 while (idx != Worklist.size()) {
3718 Instruction *I = Worklist[idx++];
3719
3720 for (auto *OV : I->operand_values()) {
3721 // isOutOfScope operands cannot be uniform instructions.
3722 if (isOutOfScope(OV))
3723 continue;
3724 // Fixed-order recurrence phis should typically be considered
3725 // non-uniform.
3726 auto *OP = dyn_cast<PHINode>(Val: OV);
3727 if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
3728 continue;
3729 // If all the users of the operand are uniform, then add the
3730 // operand into the uniform worklist.
3731 auto *OI = cast<Instruction>(Val: OV);
3732 if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
3733 auto *J = cast<Instruction>(Val: U);
3734 return Worklist.count(key: J) || isVectorizedMemAccessUse(J, OI);
3735 }))
3736 addToWorklistIfAllowed(OI);
3737 }
3738 }
3739
3740 // For an instruction to be added into Worklist above, all its users inside
3741 // the loop should also be in Worklist. However, this condition cannot be
3742 // true for phi nodes that form a cyclic dependence. We must process phi
3743 // nodes separately. An induction variable will remain uniform if all users
3744 // of the induction variable and induction variable update remain uniform.
3745 // The code below handles both pointer and non-pointer induction variables.
3746 BasicBlock *Latch = TheLoop->getLoopLatch();
3747 for (const auto &Induction : Legal->getInductionVars()) {
3748 auto *Ind = Induction.first;
3749 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
3750
3751 // Determine if all users of the induction variable are uniform after
3752 // vectorization.
3753 auto UniformInd = llvm::all_of(Range: Ind->users(), P: [&](User *U) -> bool {
3754 auto *I = cast<Instruction>(Val: U);
3755 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3756 isVectorizedMemAccessUse(I, Ind);
3757 });
3758 if (!UniformInd)
3759 continue;
3760
3761 // Determine if all users of the induction variable update instruction are
3762 // uniform after vectorization.
3763 auto UniformIndUpdate =
3764 llvm::all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
3765 auto *I = cast<Instruction>(Val: U);
3766 return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
3767 isVectorizedMemAccessUse(I, IndUpdate);
3768 });
3769 if (!UniformIndUpdate)
3770 continue;
3771
3772 // The induction variable and its update instruction will remain uniform.
3773 addToWorklistIfAllowed(Ind);
3774 addToWorklistIfAllowed(IndUpdate);
3775 }
3776
3777 Uniforms[VF].insert(I: Worklist.begin(), E: Worklist.end());
3778}
3779
3780bool LoopVectorizationCostModel::runtimeChecksRequired() {
3781 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3782
3783 if (Legal->getRuntimePointerChecking()->Need) {
3784 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3785 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3786 "loop with '#pragma clang loop vectorize(enable)' when "
3787 "compiling with -Os/-Oz",
3788 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3789 return true;
3790 }
3791
3792 if (!PSE.getPredicate().isAlwaysTrue()) {
3793 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3794 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3795 "loop with '#pragma clang loop vectorize(enable)' when "
3796 "compiling with -Os/-Oz",
3797 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3798 return true;
3799 }
3800
3801 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3802 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3803 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3804 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3805 "this loop without such check by compiling with -Os/-Oz",
3806 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3807 return true;
3808 }
3809
3810 return false;
3811}
3812
3813bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3814 if (IsScalableVectorizationAllowed)
3815 return *IsScalableVectorizationAllowed;
3816
3817 IsScalableVectorizationAllowed = false;
3818 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3819 return false;
3820
3821 if (Hints->isScalableVectorizationDisabled()) {
3822 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
3823 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
3824 return false;
3825 }
3826
3827 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3828
3829 auto MaxScalableVF = ElementCount::getScalable(
3830 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3831
3832 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3833 // FIXME: While for scalable vectors this is currently sufficient, this should
3834 // be replaced by a more detailed mechanism that filters out specific VFs,
3835 // instead of invalidating vectorization for a whole set of VFs based on the
3836 // MaxVF.
3837
3838 // Disable scalable vectorization if the loop contains unsupported reductions.
3839 if (!canVectorizeReductions(VF: MaxScalableVF)) {
3840 reportVectorizationInfo(
3841 Msg: "Scalable vectorization not supported for the reduction "
3842 "operations found in this loop.",
3843 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3844 return false;
3845 }
3846
3847 // Disable scalable vectorization if the loop contains any instructions
3848 // with element types not supported for scalable vectors.
3849 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
3850 return !Ty->isVoidTy() &&
3851 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3852 })) {
3853 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
3854 "for all element types found in this loop.",
3855 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3856 return false;
3857 }
3858
3859 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
3860 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
3861 "for safe distance analysis.",
3862 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3863 return false;
3864 }
3865
3866 IsScalableVectorizationAllowed = true;
3867 return true;
3868}
3869
3870ElementCount
3871LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3872 if (!isScalableVectorizationAllowed())
3873 return ElementCount::getScalable(MinVal: 0);
3874
3875 auto MaxScalableVF = ElementCount::getScalable(
3876 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3877 if (Legal->isSafeForAnyVectorWidth())
3878 return MaxScalableVF;
3879
3880 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3881 // Limit MaxScalableVF by the maximum safe dependence distance.
3882 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
3883
3884 if (!MaxScalableVF)
3885 reportVectorizationInfo(
3886 Msg: "Max legal vector width too small, scalable vectorization "
3887 "unfeasible.",
3888 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3889
3890 return MaxScalableVF;
3891}
3892
3893FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
3894 unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
3895 MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
3896 unsigned SmallestType, WidestType;
3897 std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();
3898
3899 // Get the maximum safe dependence distance in bits computed by LAA.
3900 // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
3901 // the memory access that is most restrictive (involved in the smallest
3902 // dependence distance).
3903 unsigned MaxSafeElements =
3904 llvm::bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
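 // For example, a maximum safe dependence distance of 512 bits with a widest
 // type of 32 bits allows at most bit_floor(512 / 32) = 16 elements per
 // vector iteration.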
3905
3906 auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElements);
3907 auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3908
3909 LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
3910 << ".\n");
3911 LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
3912 << ".\n");
3913
3914 // First analyze the UserVF, fall back if the UserVF should be ignored.
3915 if (UserVF) {
3916 auto MaxSafeUserVF =
3917 UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3918
3919 if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
3920 // If `VF=vscale x N` is safe, then so is `VF=N`
3921 if (UserVF.isScalable())
3922 return FixedScalableVFPair(
3923 ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);
3924 else
3925 return UserVF;
3926 }
3927
3928 assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
3929
3930 // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
3931 // is better to ignore the hint and let the compiler choose a suitable VF.
3932 if (!UserVF.isScalable()) {
3933 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3934 << " is unsafe, clamping to max safe VF="
3935 << MaxSafeFixedVF << ".\n");
3936 ORE->emit(RemarkBuilder: [&]() {
3937 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3938 TheLoop->getStartLoc(),
3939 TheLoop->getHeader())
3940 << "User-specified vectorization factor "
3941 << ore::NV("UserVectorizationFactor", UserVF)
3942 << " is unsafe, clamping to maximum safe vectorization factor "
3943 << ore::NV("VectorizationFactor", MaxSafeFixedVF);
3944 });
3945 return MaxSafeFixedVF;
3946 }
3947
3948 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
3949 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3950 << " is ignored because scalable vectors are not "
3951 "available.\n");
3952 ORE->emit(RemarkBuilder: [&]() {
3953 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3954 TheLoop->getStartLoc(),
3955 TheLoop->getHeader())
3956 << "User-specified vectorization factor "
3957 << ore::NV("UserVectorizationFactor", UserVF)
3958 << " is ignored because the target does not support scalable "
3959 "vectors. The compiler will pick a more suitable value.";
3960 });
3961 } else {
3962 LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
3963 << " is unsafe. Ignoring scalable UserVF.\n");
3964 ORE->emit(RemarkBuilder: [&]() {
3965 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
3966 TheLoop->getStartLoc(),
3967 TheLoop->getHeader())
3968 << "User-specified vectorization factor "
3969 << ore::NV("UserVectorizationFactor", UserVF)
3970 << " is unsafe. Ignoring the hint to let the compiler pick a "
3971 "more suitable value.";
3972 });
3973 }
3974 }
3975
3976 LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
3977 << " / " << WidestType << " bits.\n");
3978
3979 FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
3980 ElementCount::getScalable(MinVal: 0));
3981 if (auto MaxVF =
3982 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3983 MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking))
3984 Result.FixedVF = MaxVF;
3985
3986 if (auto MaxVF =
3987 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3988 MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking))
3989 if (MaxVF.isScalable()) {
3990 Result.ScalableVF = MaxVF;
3991 LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
3992 << "\n");
3993 }
3994
3995 return Result;
3996}
3997
3998FixedScalableVFPair
3999LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4000 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4001 // TODO: It may be useful to vectorize anyway, since the check is still
4002 // likely to be dynamically uniform if the target can skip it.
4003 reportVectorizationFailure(
4004 DebugMsg: "Not inserting runtime ptr check for divergent target",
4005 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
4006 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4007 return FixedScalableVFPair::getNone();
4008 }
4009
4010 unsigned TC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop);
4011 unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(L: TheLoop);
4012 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4013 if (TC == 1) {
4014 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
4015 OREMsg: "loop trip count is one, irrelevant for vectorization",
4016 ORETag: "SingleIterationLoop", ORE, TheLoop);
4017 return FixedScalableVFPair::getNone();
4018 }
4019
4020 switch (ScalarEpilogueStatus) {
4021 case CM_ScalarEpilogueAllowed:
4022 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false);
4023 case CM_ScalarEpilogueNotAllowedUsePredicate:
4024 [[fallthrough]];
4025 case CM_ScalarEpilogueNotNeededUsePredicate:
4026 LLVM_DEBUG(
4027 dbgs() << "LV: vector predicate hint/switch found.\n"
4028 << "LV: Not allowing scalar epilogue, creating predicated "
4029 << "vector loop.\n");
4030 break;
4031 case CM_ScalarEpilogueNotAllowedLowTripLoop:
4032 // fallthrough as a special case of OptForSize
4033 case CM_ScalarEpilogueNotAllowedOptSize:
4034 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4035 LLVM_DEBUG(
4036 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4037 else
4038 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4039 << "count.\n");
4040
4041 // Bail if runtime checks are required, which are not good when optimising
4042 // for size.
4043 if (runtimeChecksRequired())
4044 return FixedScalableVFPair::getNone();
4045
4046 break;
4047 }
4048
4049 // The only loops we can vectorize without a scalar epilogue are loops with
4050 // a bottom-test and a single exiting block. We'd have to handle the fact
4051 // that not every instruction executes on the last iteration. This will
4052 // require a lane mask which varies through the vector loop body. (TODO)
4053 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4054 // If there was a tail-folding hint/switch, but we can't fold the tail by
4055 // masking, fallback to a vectorization with a scalar epilogue.
4056 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4057 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4058 "scalar epilogue instead.\n");
4059 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4060 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false);
4061 }
4062 return FixedScalableVFPair::getNone();
4063 }
4064
4065 // Now try the tail folding
4066
4067 // Invalidate interleave groups that require an epilogue if we can't mask
4068 // the interleave-group.
4069 if (!useMaskedInterleavedAccesses(TTI)) {
4070 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
4071 "No decisions should have been taken at this point");
4072 // Note: There is no need to invalidate any cost modeling decisions here, as
4073 // none were taken so far.
4074 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4075 }
4076
4077 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true);
4078
4079 // Avoid tail folding if the trip count is known to be a multiple of any VF
4080 // we choose.
4081 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4082 MaxFactors.FixedVF.getFixedValue();
4083 if (MaxFactors.ScalableVF) {
4084 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
4085 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
4086 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4087 a: *MaxPowerOf2RuntimeVF,
4088 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
4089 } else
4090 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
4091 }
4092
4093 if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4094 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4095 "MaxFixedVF must be a power of 2");
4096 unsigned MaxVFtimesIC =
4097 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4098 ScalarEvolution *SE = PSE.getSE();
4099 const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
4100 const SCEV *ExitCount = SE->getAddExpr(
4101 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
4102 const SCEV *Rem = SE->getURemExpr(
4103 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
4104 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
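  // Worked example (hypothetical values): if the exit count (backedge-taken
  // count + 1, refined by loop guards) is 64, *MaxPowerOf2RuntimeVF = 8 and
  // UserIC = 2, then MaxVFtimesIC = 16 and Rem = 64 urem 16 = 0, so no scalar
  // tail remains for the candidate factors and tail folding can be skipped.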
4105 if (Rem->isZero()) {
4106 // Accept MaxFixedVF if we do not have a tail.
4107 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4108 return MaxFactors;
4109 }
4110 }
4111
4112 // If we don't know the precise trip count, or if the trip count that we
4113 // found modulo the vectorization factor is not zero, try to fold the tail
4114 // by masking.
4115 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4116 setTailFoldingStyles(IsScalableVF: MaxFactors.ScalableVF.isScalable(), UserIC);
4117 if (foldTailByMasking()) {
4118 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
4119 LLVM_DEBUG(
4120 dbgs()
4121 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
4122 "try to generate VP Intrinsics with scalable vector "
4123 "factors only.\n");
4124 // Tail folded loop using VP intrinsics restricts the VF to be scalable
4125 // for now.
4126 // TODO: extend it for fixed vectors, if required.
4127 assert(MaxFactors.ScalableVF.isScalable() &&
4128 "Expected scalable vector factor.");
4129
4130 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
4131 }
4132 return MaxFactors;
4133 }
4134
4135 // If there was a tail-folding hint/switch, but we can't fold the tail by
4136 // masking, fallback to a vectorization with a scalar epilogue.
4137 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4138 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4139 "scalar epilogue instead.\n");
4140 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4141 return MaxFactors;
4142 }
4143
4144 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
4145 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
4146 return FixedScalableVFPair::getNone();
4147 }
4148
4149 if (TC == 0) {
4150 reportVectorizationFailure(
4151 DebugMsg: "Unable to calculate the loop count due to complex control flow",
4152 OREMsg: "unable to calculate the loop count due to complex control flow",
4153 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
4154 return FixedScalableVFPair::getNone();
4155 }
4156
4157 reportVectorizationFailure(
4158 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
4159 OREMsg: "cannot optimize for size and vectorize at the same time. "
4160 "Enable vectorization of this loop with '#pragma clang loop "
4161 "vectorize(enable)' when compiling with -Os/-Oz",
4162 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
4163 return FixedScalableVFPair::getNone();
4164}
4165
4166ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4167 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
4168 ElementCount MaxSafeVF, bool FoldTailByMasking) {
4169 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
4170 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
4171 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4172 : TargetTransformInfo::RGK_FixedWidthVector);
4173
4174 // Convenience function to return the minimum of two ElementCounts.
4175 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
4176 assert((LHS.isScalable() == RHS.isScalable()) &&
4177 "Scalable flags must match");
4178 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
4179 };
4180
4181 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
4182 // Note that both WidestRegister and WidestType may not be powers of 2.
4183 auto MaxVectorElementCount = ElementCount::get(
4184 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
4185 Scalable: ComputeScalableMaxVF);
4186 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4187 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
4188 << (MaxVectorElementCount * WidestType) << " bits.\n");
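  // Illustrative example (hypothetical target): with 256-bit vector registers
  // and a widest loop type of 32 bits, bit_floor(256 / 32) = 8 lanes; if the
  // dependence analysis only allows MaxSafeVF = 4, the element count is
  // clamped to 4, i.e. the "widest register safe to use" is 128 bits.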
4189
4190 if (!MaxVectorElementCount) {
4191 LLVM_DEBUG(dbgs() << "LV: The target has no "
4192 << (ComputeScalableMaxVF ? "scalable" : "fixed")
4193 << " vector registers.\n");
4194 return ElementCount::getFixed(MinVal: 1);
4195 }
4196
4197 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4198 if (MaxVectorElementCount.isScalable() &&
4199 TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
4200 auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
4201 auto Min = Attr.getVScaleRangeMin();
4202 WidestRegisterMinEC *= Min;
4203 }
4204
4205 // When a scalar epilogue is required, at least one iteration of the scalar
4206 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4207 // max VF that results in a dead vector loop.
4208 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
4209 MaxTripCount -= 1;
4210
4211 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4212 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
4213 // If an upper bound on the loop trip count (TC) is known at compile time,
4214 // there is no point in choosing a VF greater than TC (as done in the loop
4215 // below). Select the maximum power of two which doesn't exceed TC. If
4216 // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
4217 // the TC is less than or equal to the known number of lanes.
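    // For instance (hypothetical values, tail not folded by masking): a
    // constant maximum trip count of 20 with WidestRegisterMinEC = 32
    // satisfies the check above, and the VF is clamped to bit_floor(20) = 16.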
4218 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount);
4219 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
4220 "exceeding the constant trip count: "
4221 << ClampedUpperTripCount << "\n");
4222 return ElementCount::get(
4223 MinVal: ClampedUpperTripCount,
4224 Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
4225 }
4226
4227 TargetTransformInfo::RegisterKind RegKind =
4228 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
4229 : TargetTransformInfo::RGK_FixedWidthVector;
4230 ElementCount MaxVF = MaxVectorElementCount;
4231 if (MaximizeBandwidth ||
4232 (MaximizeBandwidth.getNumOccurrences() == 0 &&
4233 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
4234 (UseWiderVFIfCallVariantsPresent && Legal->hasVectorCallVariants())))) {
4235 auto MaxVectorElementCountMaxBW = ElementCount::get(
4236 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
4237 Scalable: ComputeScalableMaxVF);
4238 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4239
4240 // Collect all viable vectorization factors larger than the default MaxVF
4241 // (i.e. MaxVectorElementCount).
4242 SmallVector<ElementCount, 8> VFs;
4243 for (ElementCount VS = MaxVectorElementCount * 2;
4244 ElementCount::isKnownLE(LHS: VS, RHS: MaxVectorElementCountMaxBW); VS *= 2)
4245 VFs.push_back(Elt: VS);
4246
4247 // For each VF calculate its register usage.
4248 auto RUs = calculateRegisterUsage(VFs);
4249
4250 // Select the largest VF which doesn't require more registers than existing
4251 // ones.
4252 for (int I = RUs.size() - 1; I >= 0; --I) {
4253 const auto &MLU = RUs[I].MaxLocalUsers;
4254 if (all_of(Range: MLU, P: [&](decltype(MLU.front()) &LU) {
4255 return LU.second <= TTI.getNumberOfRegisters(ClassID: LU.first);
4256 })) {
4257 MaxVF = VFs[I];
4258 break;
4259 }
4260 }
4261 if (ElementCount MinVF =
4262 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
4263 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
4264 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
4265 << ") with target's minimum: " << MinVF << '\n');
4266 MaxVF = MinVF;
4267 }
4268 }
4269
4270 // Invalidate any widening decisions we might have made, in case the loop
4271 // requires prediction (decided later), but we have already made some
4272 // load/store widening decisions.
4273 invalidateCostModelingDecisions();
4274 }
4275 return MaxVF;
4276}
4277
4278/// Convenience function that returns the value of vscale_range iff
4279/// vscale_range.min == vscale_range.max or otherwise returns the value
4280/// returned by the corresponding TTI method.
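/// For example, a function attributed with vscale_range(2,2) yields 2 here,
/// while vscale_range(1,16) (min != max) falls back to
/// TTI.getVScaleForTuning().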
4281static std::optional<unsigned>
4282getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4283 const Function *Fn = L->getHeader()->getParent();
4284 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
4285 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
4286 auto Min = Attr.getVScaleRangeMin();
4287 auto Max = Attr.getVScaleRangeMax();
4288 if (Max && Min == Max)
4289 return Max;
4290 }
4291
4292 return TTI.getVScaleForTuning();
4293}
4294
4295bool LoopVectorizationPlanner::isMoreProfitable(
4296 const VectorizationFactor &A, const VectorizationFactor &B) const {
4297 InstructionCost CostA = A.Cost;
4298 InstructionCost CostB = B.Cost;
4299
4300 unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(L: OrigLoop);
4301
4302 // Improve estimate for the vector width if it is scalable.
4303 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
4304 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4305 if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI)) {
4306 if (A.Width.isScalable())
4307 EstimatedWidthA *= *VScale;
4308 if (B.Width.isScalable())
4309 EstimatedWidthB *= *VScale;
4310 }
4311
4312 // Assume vscale may be larger than 1 (or the value being tuned for),
4313 // so that scalable vectorization is slightly favorable over fixed-width
4314 // vectorization.
4315 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
4316 A.Width.isScalable() && !B.Width.isScalable();
4317
4318 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
4319 const InstructionCost &RHS) {
4320 return PreferScalable ? LHS <= RHS : LHS < RHS;
4321 };
4322
4323 // To avoid the need for FP division:
4324 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
4325 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
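  // E.g. (hypothetical costs): CostA = 8 at EstimatedWidthA = 4 versus
  // CostB = 6 at EstimatedWidthB = 2 compares 8 * 2 = 16 against 6 * 4 = 24,
  // so A (cost 2 per lane) is deemed more profitable than B (cost 3 per lane).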
4326 if (!MaxTripCount)
4327 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
4328
4329 auto GetCostForTC = [MaxTripCount, this](unsigned VF,
4330 InstructionCost VectorCost,
4331 InstructionCost ScalarCost) {
4332 // If the trip count is a known (possibly small) constant, the trip count
4333 // will be rounded up to an integer number of iterations under
4334 // FoldTailByMasking. The total cost in that case will be
4335 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
4336 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
4337 // some extra overheads, but for the purpose of comparing the costs of
4338 // different VFs we can use this to compare the total loop-body cost
4339 // expected after vectorization.
4340 if (CM.foldTailByMasking())
4341 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
4342 return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
4343 };
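  // Worked example (hypothetical numbers): with MaxTripCount = 10, VF = 4,
  // VectorCost = 20 and ScalarCost = 4, tail folding gives
  // 20 * ceil(10 / 4) = 60, while a scalar epilogue gives
  // 20 * 2 + 4 * 2 = 48.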
4344
4345 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
4346 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
4347 return CmpFn(RTCostA, RTCostB);
4348}
4349
4350static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4351 OptimizationRemarkEmitter *ORE,
4352 Loop *TheLoop) {
4353 if (InvalidCosts.empty())
4354 return;
4355
4356 // Emit a report of VFs with invalid costs in the loop.
4357
4358 // Group the remarks per instruction, keeping the instruction order from
4359 // InvalidCosts.
4360 std::map<Instruction *, unsigned> Numbering;
4361 unsigned I = 0;
4362 for (auto &Pair : InvalidCosts)
4363 if (!Numbering.count(x: Pair.first))
4364 Numbering[Pair.first] = I++;
4365
4366 // Sort the list, first on instruction(number) then on VF.
4367 sort(C&: InvalidCosts, Comp: [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
4368 if (Numbering[A.first] != Numbering[B.first])
4369 return Numbering[A.first] < Numbering[B.first];
4370 const auto &LHS = A.second;
4371 const auto &RHS = B.second;
4372 return std::make_tuple(args: LHS.isScalable(), args: LHS.getKnownMinValue()) <
4373 std::make_tuple(args: RHS.isScalable(), args: RHS.getKnownMinValue());
4374 });
4375
4376 // For a list of ordered instruction-vf pairs:
4377 // [(load, vf1), (load, vf2), (store, vf1)]
4378 // Group the instructions together to emit separate remarks for:
4379 // load (vf1, vf2)
4380 // store (vf1)
4381 auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
4382 auto Subset = ArrayRef<InstructionVFPair>();
4383 do {
4384 if (Subset.empty())
4385 Subset = Tail.take_front(N: 1);
4386
4387 Instruction *I = Subset.front().first;
4388
4389 // If the next instruction is different, or if there are no other pairs,
4390 // emit a remark for the collated subset. e.g.
4391 // [(load, vf1), (load, vf2)]
4392 // to emit:
4393 // remark: invalid costs for 'load' at VF=(vf1, vf2)
4394 if (Subset == Tail || Tail[Subset.size()].first != I) {
4395 std::string OutString;
4396 raw_string_ostream OS(OutString);
4397 assert(!Subset.empty() && "Unexpected empty range");
4398 OS << "Instruction with invalid costs prevented vectorization at VF=(";
4399 for (const auto &Pair : Subset)
4400 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4401 OS << "):";
4402 if (auto *CI = dyn_cast<CallInst>(Val: I))
4403 OS << " call to " << CI->getCalledFunction()->getName();
4404 else
4405 OS << " " << I->getOpcodeName();
4406 OS.flush();
4407 reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop, I);
4408 Tail = Tail.drop_front(N: Subset.size());
4409 Subset = {};
4410 } else
4411 // Grow the subset by one element
4412 Subset = Tail.take_front(N: Subset.size() + 1);
4413 } while (!Tail.empty());
4414}
4415
4416/// Check if any recipe of \p Plan will generate a vector value, which will be
4417/// assigned a vector register.
4418static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4419 const TargetTransformInfo &TTI) {
4420 assert(VF.isVector() && "Checking a scalar VF?");
4421 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4422 Plan.getCanonicalIV()->getScalarType()->getContext());
4423 DenseSet<VPRecipeBase *> EphemeralRecipes;
4424 collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
4425 // Set of already visited types.
4426 DenseSet<Type *> Visited;
4427 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4428 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
4429 for (VPRecipeBase &R : *VPBB) {
4430 if (EphemeralRecipes.contains(V: &R))
4431 continue;
4432 // Continue early if the recipe is considered to not produce a vector
4433 // result. Note that this includes VPInstruction where some opcodes may
4434 // produce a vector, to preserve existing behavior as VPInstructions model
4435 // aspects not directly mapped to existing IR instructions.
4436 switch (R.getVPDefID()) {
4437 case VPDef::VPDerivedIVSC:
4438 case VPDef::VPScalarIVStepsSC:
4439 case VPDef::VPScalarCastSC:
4440 case VPDef::VPReplicateSC:
4441 case VPDef::VPInstructionSC:
4442 case VPDef::VPCanonicalIVPHISC:
4443 case VPDef::VPVectorPointerSC:
4444 case VPDef::VPExpandSCEVSC:
4445 case VPDef::VPEVLBasedIVPHISC:
4446 case VPDef::VPPredInstPHISC:
4447 case VPDef::VPBranchOnMaskSC:
4448 continue;
4449 case VPDef::VPReductionSC:
4450 case VPDef::VPActiveLaneMaskPHISC:
4451 case VPDef::VPWidenCallSC:
4452 case VPDef::VPWidenCanonicalIVSC:
4453 case VPDef::VPWidenCastSC:
4454 case VPDef::VPWidenGEPSC:
4455 case VPDef::VPWidenSC:
4456 case VPDef::VPWidenSelectSC:
4457 case VPDef::VPBlendSC:
4458 case VPDef::VPFirstOrderRecurrencePHISC:
4459 case VPDef::VPWidenPHISC:
4460 case VPDef::VPWidenIntOrFpInductionSC:
4461 case VPDef::VPWidenPointerInductionSC:
4462 case VPDef::VPReductionPHISC:
4463 case VPDef::VPInterleaveSC:
4464 case VPDef::VPWidenLoadEVLSC:
4465 case VPDef::VPWidenLoadSC:
4466 case VPDef::VPWidenStoreEVLSC:
4467 case VPDef::VPWidenStoreSC:
4468 break;
4469 default:
4470 llvm_unreachable("unhandled recipe");
4471 }
4472
4473 auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4474 Type *VectorTy = ToVectorTy(Scalar: ScalarTy, EC: VF);
4475 unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
4476 if (!NumLegalParts)
4477 return false;
4478 if (VF.isScalable()) {
4479 // <vscale x 1 x iN> is assumed to be profitable over iN because
4480 // scalable registers are a distinct register class from scalar
4481 // ones. If we ever find a target which wants to lower scalable
4482 // vectors back to scalars, we'll need to update this code to
4483 // explicitly ask TTI about the register class uses for each part.
4484 return NumLegalParts <= VF.getKnownMinValue();
4485 }
4486 // Two or more elements sharing a register (i.e. fewer legal parts than elements) are vectorized.
4487 return NumLegalParts < VF.getKnownMinValue();
4488 };
4489
4490 // If the recipe has no definitions and is not a store (e.g. a branch), continue - no value to check.
4491 if (R.getNumDefinedValues() == 0 &&
4492 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4493 Val: &R))
4494 continue;
4495 // For multi-def recipes (currently only interleaved loads), it suffices
4496 // to check the first defined value only.
4497 // For stores, check the stored value; for interleaved stores, it suffices
4498 // to check the first stored value only. In all cases this is the second
4499 // operand.
4500 VPValue *ToCheck =
4501 R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
4502 Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
4503 if (!Visited.insert(V: {ScalarTy}).second)
4504 continue;
4505 if (WillWiden(ScalarTy))
4506 return true;
4507 }
4508 }
4509
4510 return false;
4511}
4512
4513VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4514 InstructionCost ExpectedCost = CM.expectedCost(VF: ElementCount::getFixed(MinVal: 1));
4515 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4516 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4517 assert(any_of(VPlans,
4518 [](std::unique_ptr<VPlan> &P) {
4519 return P->hasVF(ElementCount::getFixed(1));
4520 }) &&
4521 "Expected Scalar VF to be a candidate");
4522
4523 const VectorizationFactor ScalarCost(ElementCount::getFixed(MinVal: 1), ExpectedCost,
4524 ExpectedCost);
4525 VectorizationFactor ChosenFactor = ScalarCost;
4526
4527 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4528 if (ForceVectorization &&
4529 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4530 // Ignore scalar width, because the user explicitly wants vectorization.
4531 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4532 // evaluation.
4533 ChosenFactor.Cost = InstructionCost::getMax();
4534 }
4535
4536 SmallVector<InstructionVFPair> InvalidCosts;
4537 for (auto &P : VPlans) {
4538 for (ElementCount VF : P->vectorFactors()) {
4539 // The cost for scalar VF=1 is already calculated, so ignore it.
4540 if (VF.isScalar())
4541 continue;
4542
4543 InstructionCost C = CM.expectedCost(VF, Invalid: &InvalidCosts);
4544 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4545
4546#ifndef NDEBUG
4547 unsigned AssumedMinimumVscale =
4548 getVScaleForTuning(OrigLoop, TTI).value_or(1);
4549 unsigned Width =
4550 Candidate.Width.isScalable()
4551 ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4552 : Candidate.Width.getFixedValue();
4553 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4554 << " costs: " << (Candidate.Cost / Width));
4555 if (VF.isScalable())
4556 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4557 << AssumedMinimumVscale << ")");
4558 LLVM_DEBUG(dbgs() << ".\n");
4559#endif
4560
4561 if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
4562 LLVM_DEBUG(
4563 dbgs()
4564 << "LV: Not considering vector loop of width " << VF
4565 << " because it will not generate any vector instructions.\n");
4566 continue;
4567 }
4568
4569 // If profitable add it to ProfitableVF list.
4570 if (isMoreProfitable(A: Candidate, B: ScalarCost))
4571 ProfitableVFs.push_back(Elt: Candidate);
4572
4573 if (isMoreProfitable(A: Candidate, B: ChosenFactor))
4574 ChosenFactor = Candidate;
4575 }
4576 }
4577
4578 emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop: OrigLoop);
4579
4580 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4581 reportVectorizationFailure(
4582 DebugMsg: "There are conditional stores.",
4583 OREMsg: "store that is conditionally executed prevents vectorization",
4584 ORETag: "ConditionalStore", ORE, TheLoop: OrigLoop);
4585 ChosenFactor = ScalarCost;
4586 }
4587
4588 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4589 !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
4590 << "LV: Vectorization seems to be not beneficial, "
4591 << "but was forced by a user.\n");
4592 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
4593 return ChosenFactor;
4594}
4595
4596bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4597 ElementCount VF) const {
4598 // Cross iteration phis such as reductions need special handling and are
4599 // currently unsupported.
4600 if (any_of(Range: OrigLoop->getHeader()->phis(),
4601 P: [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(Phi: &Phi); }))
4602 return false;
4603
4604 // Phis with uses outside of the loop require special handling and are
4605 // currently unsupported.
4606 for (const auto &Entry : Legal->getInductionVars()) {
4607 // Look for uses of the value of the induction at the last iteration.
4608 Value *PostInc =
4609 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
4610 for (User *U : PostInc->users())
4611 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4612 return false;
4613 // Look for uses of penultimate value of the induction.
4614 for (User *U : Entry.first->users())
4615 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4616 return false;
4617 }
4618
4619 // Epilogue vectorization code has not been audited to ensure it handles
4620 // non-latch exits properly. It may be fine, but it needs to be audited
4621 // and tested.
4622 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4623 return false;
4624
4625 return true;
4626}
4627
4628bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4629 const ElementCount VF) const {
4630 // FIXME: We need a much better cost-model to take different parameters such
4631 // as register pressure, code size increase and cost of extra branches into
4632 // account. For now we apply a very crude heuristic and only consider loops
4633 // with vectorization factors larger than a certain value.
4634
4635 // Allow the target to opt out entirely.
4636 if (!TTI.preferEpilogueVectorization())
4637 return false;
4638
4639 // We also consider epilogue vectorization unprofitable for targets that don't
4640 // consider interleaving beneficial (e.g. MVE).
4641 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4642 return false;
4643
4644 unsigned Multiplier = 1;
4645 if (VF.isScalable())
4646 Multiplier = getVScaleForTuning(L: TheLoop, TTI).value_or(u: 1);
4647 if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4648 return true;
4649 return false;
4650}
4651
4652VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4653 const ElementCount MainLoopVF, unsigned IC) {
4654 VectorizationFactor Result = VectorizationFactor::Disabled();
4655 if (!EnableEpilogueVectorization) {
4656 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4657 return Result;
4658 }
4659
4660 if (!CM.isScalarEpilogueAllowed()) {
4661 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4662 "epilogue is allowed.\n");
4663 return Result;
4664 }
4665
4666 // Not really a cost consideration, but check for unsupported cases here to
4667 // simplify the logic.
4668 if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
4669 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4670 "is not a supported candidate.\n");
4671 return Result;
4672 }
4673
4674 if (EpilogueVectorizationForceVF > 1) {
4675 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4676 ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
4677 if (hasPlanWithVF(VF: ForcedEC))
4678 return {ForcedEC, 0, 0};
4679 else {
4680 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4681 "viable.\n");
4682 return Result;
4683 }
4684 }
4685
4686 if (OrigLoop->getHeader()->getParent()->hasOptSize() ||
4687 OrigLoop->getHeader()->getParent()->hasMinSize()) {
4688 LLVM_DEBUG(
4689 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4690 return Result;
4691 }
4692
4693 if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF)) {
4694 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4695 "this loop\n");
4696 return Result;
4697 }
4698
4699 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4700 // the main loop handles 8 lanes per iteration. We could still benefit from
4701 // vectorizing the epilogue loop with VF=4.
4702 ElementCount EstimatedRuntimeVF = MainLoopVF;
4703 if (MainLoopVF.isScalable()) {
4704 EstimatedRuntimeVF = ElementCount::getFixed(MinVal: MainLoopVF.getKnownMinValue());
4705 if (std::optional<unsigned> VScale = getVScaleForTuning(L: OrigLoop, TTI))
4706 EstimatedRuntimeVF *= *VScale;
4707 }
4708
4709 ScalarEvolution &SE = *PSE.getSE();
4710 Type *TCType = Legal->getWidestInductionType();
4711 const SCEV *RemainingIterations = nullptr;
4712 for (auto &NextVF : ProfitableVFs) {
4713 // Skip candidate VFs without a corresponding VPlan.
4714 if (!hasPlanWithVF(VF: NextVF.Width))
4715 continue;
4716
4717 // Skip candidate VFs with widths >= the estimated runtime VF (scalable
4718 // vectors) or the VF of the main loop (fixed vectors).
4719 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4720 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
4721 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF))
4722 continue;
4723
4724 // If NextVF is greater than the number of remaining iterations, the
4725 // epilogue loop would be dead. Skip such factors.
4726 if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
4727 // TODO: extend to support scalable VFs.
4728 if (!RemainingIterations) {
4729 const SCEV *TC = createTripCountSCEV(IdxTy: TCType, PSE, OrigLoop);
4730 RemainingIterations = SE.getURemExpr(
4731 LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getKnownMinValue() * IC));
4732 }
4733 if (SE.isKnownPredicate(
4734 Pred: CmpInst::ICMP_UGT,
4735 LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getKnownMinValue()),
4736 RHS: RemainingIterations))
4737 continue;
4738 }
4739
4740 if (Result.Width.isScalar() || isMoreProfitable(A: NextVF, B: Result))
4741 Result = NextVF;
4742 }
4743
4744 if (Result != VectorizationFactor::Disabled())
4745 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4746 << Result.Width << "\n");
4747 return Result;
4748}
4749
4750std::pair<unsigned, unsigned>
4751LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4752 unsigned MinWidth = -1U;
4753 unsigned MaxWidth = 8;
4754 const DataLayout &DL = TheFunction->getDataLayout();
4755 // For in-loop reductions, no element types are added to ElementTypesInLoop
4756 // if there are no loads/stores in the loop. In this case, check through the
4757 // reduction variables to determine the maximum width.
4758 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4759 // Reset MaxWidth so that we can find the smallest type used by recurrences
4760 // in the loop.
4761 MaxWidth = -1U;
4762 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4763 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4764 // When finding the min width used by the recurrence we need to account
4765 // for casts on the input operands of the recurrence.
4766 MaxWidth = std::min<unsigned>(
4767 a: MaxWidth, b: std::min<unsigned>(
4768 a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4769 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4770 }
4771 } else {
4772 for (Type *T : ElementTypesInLoop) {
4773 MinWidth = std::min<unsigned>(
4774 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4775 MaxWidth = std::max<unsigned>(
4776 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4777 }
4778 }
4779 return {MinWidth, MaxWidth};
4780}
4781
4782void LoopVectorizationCostModel::collectElementTypesForWidening() {
4783 ElementTypesInLoop.clear();
4784 // For each block.
4785 for (BasicBlock *BB : TheLoop->blocks()) {
4786 // For each instruction in the loop.
4787 for (Instruction &I : BB->instructionsWithoutDebug()) {
4788 Type *T = I.getType();
4789
4790 // Skip ignored values.
4791 if (ValuesToIgnore.count(Ptr: &I))
4792 continue;
4793
4794 // Only examine Loads, Stores and PHINodes.
4795 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4796 continue;
4797
4798 // Examine PHI nodes that are reduction variables. Update the type to
4799 // account for the recurrence type.
4800 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4801 if (!Legal->isReductionVariable(PN))
4802 continue;
4803 const RecurrenceDescriptor &RdxDesc =
4804 Legal->getReductionVars().find(Key: PN)->second;
4805 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4806 TTI.preferInLoopReduction(Opcode: RdxDesc.getOpcode(),
4807 Ty: RdxDesc.getRecurrenceType(),
4808 Flags: TargetTransformInfo::ReductionFlags()))
4809 continue;
4810 T = RdxDesc.getRecurrenceType();
4811 }
4812
4813 // Examine the stored values.
4814 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4815 T = ST->getValueOperand()->getType();
4816
4817 assert(T->isSized() &&
4818 "Expected the load/store/recurrence type to be sized");
4819
4820 ElementTypesInLoop.insert(Ptr: T);
4821 }
4822 }
4823}
4824
4825unsigned
4826LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4827 InstructionCost LoopCost) {
4828 // -- The interleave heuristics --
4829 // We interleave the loop in order to expose ILP and reduce the loop overhead.
4830 // There are many micro-architectural considerations that we can't predict
4831 // at this level. For example, frontend pressure (on decode or fetch) due to
4832 // code size, or the number and capabilities of the execution ports.
4833 //
4834 // We use the following heuristics to select the interleave count:
4835 // 1. If the code has reductions, then we interleave to break the cross
4836 // iteration dependency.
4837 // 2. If the loop is really small, then we interleave to reduce the loop
4838 // overhead.
4839 // 3. We don't interleave if we think that we will spill registers to memory
4840 // due to the increased register pressure.
4841
4842 if (!isScalarEpilogueAllowed())
4843 return 1;
4844
4845 // Do not interleave if EVL is preferred and no User IC is specified.
4846 if (foldTailWithEVL()) {
4847 LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
4848 "Unroll factor forced to be 1.\n");
4849 return 1;
4850 }
4851
4852 // The maximum safe dependence distance already limits how much we can
4853 // widen; do not interleave further in that case.
4853 if (!Legal->isSafeForAnyVectorWidth())
4854 return 1;
4855
4856 auto BestKnownTC = getSmallBestKnownTC(SE&: *PSE.getSE(), L: TheLoop);
4857 const bool HasReductions = !Legal->getReductionVars().empty();
4858
4859 // If we did not calculate the cost for VF (because the user selected the VF)
4860 // then we calculate the cost of VF here.
4861 if (LoopCost == 0) {
4862 LoopCost = expectedCost(VF);
4863 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
4864
4865 // Loop body is free and there is no need for interleaving.
4866 if (LoopCost == 0)
4867 return 1;
4868 }
4869
4870 RegisterUsage R = calculateRegisterUsage(VFs: {VF})[0];
4871 // We divide by these counts below, so clamp each register class to at
4872 // least one register user to avoid dividing by zero.
4873 for (auto& pair : R.MaxLocalUsers) {
4874 pair.second = std::max(a: pair.second, b: 1U);
4875 }
4876
4877 // We calculate the interleave count using the following formula.
4878 // Subtract the number of loop invariants from the number of available
4879 // registers. These registers are used by all of the interleaved instances.
4880 // Next, divide the remaining registers by the number of registers that is
4881 // required by the loop, in order to estimate how many parallel instances
4882 // fit without causing spills. All of this is rounded down if necessary to be
4883 // a power of two. We want a power-of-two interleave count to simplify any
4884 // addressing operations or alignment considerations.
4885 // We also want power-of-two interleave counts to ensure that the induction
4886 // variable of the vector loop wraps to zero when the tail is folded by
4887 // masking; this currently happens under OptForSize, in which case IC is set to 1 above.
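  // Worked example (hypothetical numbers): with 32 available registers in a
  // class, 2 loop-invariant users and a maximum local usage of 6, the basic
  // formula gives bit_floor((32 - 2) / 6) = bit_floor(5) = 4 interleaved
  // copies; with EnableIndVarRegisterHeur the induction variable is excluded,
  // giving bit_floor((32 - 2 - 1) / (6 - 1)) = bit_floor(5) = 4 as well.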
4888 unsigned IC = UINT_MAX;
4889
4890 for (auto& pair : R.MaxLocalUsers) {
4891 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: pair.first);
4892 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
4893 << " registers of "
4894 << TTI.getRegisterClassName(pair.first) << " register class\n");
4895 if (VF.isScalar()) {
4896 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
4897 TargetNumRegisters = ForceTargetNumScalarRegs;
4898 } else {
4899 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
4900 TargetNumRegisters = ForceTargetNumVectorRegs;
4901 }
4902 unsigned MaxLocalUsers = pair.second;
4903 unsigned LoopInvariantRegs = 0;
4904 if (R.LoopInvariantRegs.find(Key: pair.first) != R.LoopInvariantRegs.end())
4905 LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
4906
4907 unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
4908 MaxLocalUsers);
4909 // Don't count the induction variable as interleaved.
4910 if (EnableIndVarRegisterHeur) {
4911 TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
4912 std::max(a: 1U, b: (MaxLocalUsers - 1)));
4913 }
4914
4915 IC = std::min(a: IC, b: TmpIC);
4916 }
4917
4918 // Clamp the interleave ranges to reasonable counts.
4919 unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
4920
4921 // Check if the user has overridden the max.
4922 if (VF.isScalar()) {
4923 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
4924 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
4925 } else {
4926 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
4927 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4928 }
4929
4930 unsigned EstimatedVF = VF.getKnownMinValue();
4931 if (VF.isScalable()) {
4932 if (std::optional<unsigned> VScale = getVScaleForTuning(L: TheLoop, TTI))
4933 EstimatedVF *= *VScale;
4934 }
4935 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4936
4937 unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(L: TheLoop);
4938 if (KnownTC > 0) {
4939 // At least one iteration must be scalar when this constraint holds. So the
4940 // maximum available iterations for interleaving is one less.
4941 unsigned AvailableTC =
4942 requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? KnownTC - 1 : KnownTC;
4943
4944 // If trip count is known we select between two prospective ICs, where
4945 // 1) the aggressive IC is capped by the trip count divided by VF
4946 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
4947 // The final IC is selected in a way that the epilogue loop trip count is
4948 // minimized while maximizing the IC itself, so that we either run the
4949 // vector loop at least once if it generates a small epilogue loop, or else
4950 // we run the vector loop at least twice.
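    // Worked example (hypothetical numbers): AvailableTC = 24, EstimatedVF = 4
    // and MaxInterleaveCount = 8 give InterleaveCountUB = bit_floor(min(6, 8))
    // = 4 and InterleaveCountLB = bit_floor(min(3, 8)) = 2. The scalar tails
    // are 24 % 16 = 8 and 24 % 8 = 0 respectively, so the conservative lower
    // bound of 2 is kept.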
4951
4952 unsigned InterleaveCountUB = bit_floor(
4953 Value: std::max(a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
4954 unsigned InterleaveCountLB = bit_floor(Value: std::max(
4955 a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));
4956 MaxInterleaveCount = InterleaveCountLB;
4957
4958 if (InterleaveCountUB != InterleaveCountLB) {
4959 unsigned TailTripCountUB =
4960 (AvailableTC % (EstimatedVF * InterleaveCountUB));
4961 unsigned TailTripCountLB =
4962 (AvailableTC % (EstimatedVF * InterleaveCountLB));
4963 // If both produce the same scalar tail, maximize the IC to do the same
4964 // work in fewer vector loop iterations.
4965 if (TailTripCountUB == TailTripCountLB)
4966 MaxInterleaveCount = InterleaveCountUB;
4967 }
4968 } else if (BestKnownTC && *BestKnownTC > 0) {
4969 // At least one iteration must be scalar when this constraint holds. So the
4970 // maximum available iterations for interleaving is one less.
4971 unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector())
4972 ? (*BestKnownTC) - 1
4973 : *BestKnownTC;
4974
4975 // If the trip count is only an estimated compile-time constant, cap the
4976 // IC at the trip count divided by (VF * 2), such that the vector loop
4977 // runs at least twice to make interleaving seem profitable when there is
4978 // an epilogue loop present. Since the exact trip count is not known we
4979 // choose to be conservative in our IC estimate.
4980 MaxInterleaveCount = bit_floor(Value: std::max(
4981 a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));
4982 }
4983
4984 assert(MaxInterleaveCount > 0 &&
4985 "Maximum interleave count must be greater than 0");
4986
4987 // Clamp the calculated IC to be between the 1 and the max interleave count
4988 // that the target and trip count allows.
4989 if (IC > MaxInterleaveCount)
4990 IC = MaxInterleaveCount;
4991 else
4992 // Make sure IC is greater than 0.
4993 IC = std::max(a: 1u, b: IC);
4994
4995 assert(IC > 0 && "Interleave count must be greater than 0.");
4996
4997 // Interleave if we vectorized this loop and there is a reduction that could
4998 // benefit from interleaving.
4999 if (VF.isVector() && HasReductions) {
5000 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5001 return IC;
5002 }
5003
5004 // For any scalar loop that either requires runtime checks or predication we
5005 // are better off leaving this to the unroller. Note that if we've already
5006 // vectorized the loop we will have done the runtime check and so interleaving
5007 // won't require further checks.
5008 bool ScalarInterleavingRequiresPredication =
5009 (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) {
5010 return Legal->blockNeedsPredication(BB);
5011 }));
5012 bool ScalarInterleavingRequiresRuntimePointerCheck =
5013 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5014
5015 // We want to interleave small loops in order to reduce the loop overhead and
5016 // potentially expose ILP opportunities.
5017 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5018 << "LV: IC is " << IC << '\n'
5019 << "LV: VF is " << VF << '\n');
5020 const bool AggressivelyInterleaveReductions =
5021 TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
5022 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5023 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5024 // We assume that the cost overhead is 1 and we use the cost model
5025 // to estimate the cost of the loop and interleave until the cost of the
5026 // loop overhead is about 5% of the cost of the loop.
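    // For instance, assuming the (configurable) SmallLoopCost threshold is 20
    // and the computed LoopCost is 5, SmallIC = min(IC, bit_floor(20 / 5)) =
    // min(IC, 4).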
5027 unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
5028 Value: SmallLoopCost / *LoopCost.getValue()));
5029
5030 // Interleave until store/load ports (estimated by max interleave count) are
5031 // saturated.
5032 unsigned NumStores = Legal->getNumStores();
5033 unsigned NumLoads = Legal->getNumLoads();
5034 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5035 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5036
5037 // There is little point in interleaving for reductions containing selects
5038 // and compares when VF=1 since it may just create more overhead than it's
5039 // worth for loops with small trip counts. This is because we still have to
5040 // do the final reduction after the loop.
5041 bool HasSelectCmpReductions =
5042 HasReductions &&
5043 any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
5044 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5045 return RecurrenceDescriptor::isAnyOfRecurrenceKind(
5046 Kind: RdxDesc.getRecurrenceKind());
5047 });
5048 if (HasSelectCmpReductions) {
5049 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5050 return 1;
5051 }
5052
5053 // If we have a scalar reduction (vector reductions are already dealt with
5054 // by this point), we can increase the critical path length if the loop
5055 // we're interleaving is inside another loop. For tree-wise reductions
5056 // set the limit to 2, and for ordered reductions it's best to disable
5057 // interleaving entirely.
5058 if (HasReductions && TheLoop->getLoopDepth() > 1) {
5059 bool HasOrderedReductions =
5060 any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
5061 const RecurrenceDescriptor &RdxDesc = Reduction.second;
5062 return RdxDesc.isOrdered();
5063 });
5064 if (HasOrderedReductions) {
5065 LLVM_DEBUG(
5066 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5067 return 1;
5068 }
5069
5070 unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5071 SmallIC = std::min(a: SmallIC, b: F);
5072 StoresIC = std::min(a: StoresIC, b: F);
5073 LoadsIC = std::min(a: LoadsIC, b: F);
5074 }
5075
5076 if (EnableLoadStoreRuntimeInterleave &&
5077 std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
5078 LLVM_DEBUG(
5079 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5080 return std::max(a: StoresIC, b: LoadsIC);
5081 }
5082
5083 // If there are scalar reductions and TTI has enabled aggressive
5084 // interleaving for reductions, we will interleave to expose ILP.
5085 if (VF.isScalar() && AggressivelyInterleaveReductions) {
5086 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5087 // Interleave no less than SmallIC but not as aggressive as the normal IC
5088 // to satisfy the rare situation when resources are too limited.
5089 return std::max(a: IC / 2, b: SmallIC);
5090 } else {
5091 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5092 return SmallIC;
5093 }
5094 }
5095
5096 // Interleave if this is a large loop (small loops are already dealt with by
5097 // this point) that could benefit from interleaving.
5098 if (AggressivelyInterleaveReductions) {
5099 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5100 return IC;
5101 }
5102
5103 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5104 return 1;
5105}
5106
5107SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5108LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5109 // This function calculates the register usage by measuring the highest number
5110 // of values that are alive at a single location. Obviously, this is a very
5111 // rough estimation. We scan the loop in topological order and
5112 // assign a number to each instruction. We use RPO to ensure that defs are
5113 // met before their users. We assume that each instruction that has in-loop
5114 // users starts an interval. We record every time that an in-loop value is
5115 // used, so we have a list of the first and last occurrences of each
5116 // instruction. Next, we transpose this data structure into a multi map that
5117 // holds the list of intervals that *end* at a specific location. This multi
5118 // map allows us to perform a linear search. We scan the instructions linearly
5119 // and record each time that a new interval starts, by placing it in a set.
5120 // If we find this value in the multi-map then we remove it from the set.
5121 // The max register usage is the maximum size of the set.
5122 // We also search for instructions that are defined outside the loop, but are
5123 // used inside the loop. We need this number separately from the max-interval
5124 // usage number because when we unroll, loop-invariant values do not take
5125 // more registers.
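  // Small illustration (hypothetical loop body): for
  //   %a = load ...; %b = load ...; %c = add %a, %b; store %c, ...
  // both %a and %b are still open when the add is reached, so the maximum
  // local usage recorded for their register class is two (ignoring induction
  // variables and loop-invariant operands).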
5126 LoopBlocksDFS DFS(TheLoop);
5127 DFS.perform(LI);
5128
5129 RegisterUsage RU;
5130
5131 // Each 'key' in the map opens a new interval. The values
5132 // of the map are the index of the 'last seen' usage of the
5133 // instruction that is the key.
5134 using IntervalMap = DenseMap<Instruction *, unsigned>;
5135
5136 // Maps instruction to its index.
5137 SmallVector<Instruction *, 64> IdxToInstr;
5138 // Marks the end of each interval.
5139 IntervalMap EndPoint;
5140 // Saves the list of instruction indices that are used in the loop.
5141 SmallPtrSet<Instruction *, 8> Ends;
5142 // Saves the list of values that are used in the loop but are defined outside
5143 // the loop (not including non-instruction values such as arguments and
5144 // constants).
5145 SmallSetVector<Instruction *, 8> LoopInvariants;
5146
5147 for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) {
5148 for (Instruction &I : BB->instructionsWithoutDebug()) {
5149 IdxToInstr.push_back(Elt: &I);
5150
5151 // Save the end location of each USE.
5152 for (Value *U : I.operands()) {
5153 auto *Instr = dyn_cast<Instruction>(Val: U);
5154
5155 // Ignore non-instruction values such as arguments, constants, etc.
5156 // FIXME: Might need some motivation why these values are ignored. If
5157 // for example an argument is used inside the loop it will increase the
5158 // register pressure (so shouldn't we add it to LoopInvariants?).
5159 if (!Instr)
5160 continue;
5161
5162 // If this instruction is outside the loop then record it and continue.
5163 if (!TheLoop->contains(Inst: Instr)) {
5164 LoopInvariants.insert(X: Instr);
5165 continue;
5166 }
5167
5168 // Overwrite previous end points.
5169 EndPoint[Instr] = IdxToInstr.size();
5170 Ends.insert(Ptr: Instr);
5171 }
5172 }
5173 }
5174
5175 // Saves the list of intervals that end with the index in 'key'.
5176 using InstrList = SmallVector<Instruction *, 2>;
5177 DenseMap<unsigned, InstrList> TransposeEnds;
5178
5179 // Transpose the EndPoints to a list of values that end at each index.
5180 for (auto &Interval : EndPoint)
5181 TransposeEnds[Interval.second].push_back(Elt: Interval.first);
5182
5183 SmallPtrSet<Instruction *, 8> OpenIntervals;
5184 SmallVector<RegisterUsage, 8> RUs(VFs.size());
5185 SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5186
5187 LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5188
5189 const auto &TTICapture = TTI;
5190 auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5191 if (Ty->isTokenTy() || !VectorType::isValidElementType(ElemTy: Ty))
5192 return 0;
5193 return TTICapture.getRegUsageForType(Ty: VectorType::get(ElementType: Ty, EC: VF));
5194 };
5195
5196 for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5197 Instruction *I = IdxToInstr[i];
5198
5199 // Remove all of the instructions that end at this location.
5200 InstrList &List = TransposeEnds[i];
5201 for (Instruction *ToRemove : List)
5202 OpenIntervals.erase(Ptr: ToRemove);
5203
5204 // Ignore instructions that are never used within the loop.
5205 if (!Ends.count(Ptr: I))
5206 continue;
5207
5208 // Skip ignored values.
5209 if (ValuesToIgnore.count(Ptr: I))
5210 continue;
5211
5212 collectInLoopReductions();
5213
5214 // For each VF find the maximum usage of registers.
5215 for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5216 // Count the number of registers used, per register class, given all open
5217 // intervals.
5218 // Note that elements in this SmallMapVector will be default constructed
5219 // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5220 // there is no previous entry for ClassID.
5221 SmallMapVector<unsigned, unsigned, 4> RegUsage;
5222
5223 if (VFs[j].isScalar()) {
5224 for (auto *Inst : OpenIntervals) {
5225 unsigned ClassID =
5226 TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
5227 // FIXME: The target might use more than one register for the type
5228 // even in the scalar case.
5229 RegUsage[ClassID] += 1;
5230 }
5231 } else {
5232 collectUniformsAndScalars(VF: VFs[j]);
5233 for (auto *Inst : OpenIntervals) {
5234 // Skip ignored values for VF > 1.
5235 if (VecValuesToIgnore.count(Ptr: Inst))
5236 continue;
5237 if (isScalarAfterVectorization(I: Inst, VF: VFs[j])) {
5238 unsigned ClassID =
5239 TTI.getRegisterClassForType(Vector: false, Ty: Inst->getType());
5240 // FIXME: The target might use more than one register for the type
5241 // even in the scalar case.
5242 RegUsage[ClassID] += 1;
5243 } else {
5244 unsigned ClassID =
5245 TTI.getRegisterClassForType(Vector: true, Ty: Inst->getType());
5246 RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5247 }
5248 }
5249 }
5250
5251 for (auto& pair : RegUsage) {
5252 auto &Entry = MaxUsages[j][pair.first];
5253 Entry = std::max(a: Entry, b: pair.second);
5254 }
5255 }
5256
5257 LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5258 << OpenIntervals.size() << '\n');
5259
5260 // Add the current instruction to the list of open intervals.
5261 OpenIntervals.insert(Ptr: I);
5262 }
5263
5264 for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5265 // Note that elements in this SmallMapVector will be default constructed
5266 // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5267 // there is no previous entry for ClassID.
5268 SmallMapVector<unsigned, unsigned, 4> Invariant;
5269
5270 for (auto *Inst : LoopInvariants) {
5271 // FIXME: The target might use more than one register for the type
5272 // even in the scalar case.
5273 bool IsScalar = all_of(Range: Inst->users(), P: [&](User *U) {
5274 auto *I = cast<Instruction>(Val: U);
5275 return TheLoop != LI->getLoopFor(BB: I->getParent()) ||
5276 isScalarAfterVectorization(I, VF: VFs[i]);
5277 });
5278
5279 ElementCount VF = IsScalar ? ElementCount::getFixed(MinVal: 1) : VFs[i];
5280 unsigned ClassID =
5281 TTI.getRegisterClassForType(Vector: VF.isVector(), Ty: Inst->getType());
5282 Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);
5283 }
5284
5285 LLVM_DEBUG({
5286 dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5287 dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5288 << " item\n";
5289 for (const auto &pair : MaxUsages[i]) {
5290 dbgs() << "LV(REG): RegisterClass: "
5291 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5292 << " registers\n";
5293 }
5294 dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5295 << " item\n";
5296 for (const auto &pair : Invariant) {
5297 dbgs() << "LV(REG): RegisterClass: "
5298 << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5299 << " registers\n";
5300 }
5301 });
5302
5303 RU.LoopInvariantRegs = Invariant;
5304 RU.MaxLocalUsers = MaxUsages[i];
5305 RUs[i] = RU;
5306 }
5307
5308 return RUs;
5309}
5310
5311bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
5312 ElementCount VF) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially high
  // value to practically disable vectorization with such operations,
  // except where the previously deployed legality hack allowed using very
  // low cost values. This is to avoid regressions coming simply from moving
  // the "masked load/store" check from legality to the cost model.
  // Emulation of masked loads/gathers was previously never allowed;
  // emulation of masked stores/scatters was allowed only for a limited
  // number of stores.
5321 assert((isPredicatedInst(I)) &&
5322 "Expecting a scalar emulated instruction");
5323 return isa<LoadInst>(Val: I) ||
5324 (isa<StoreInst>(Val: I) &&
5325 NumPredStores > NumberOfStoresToPredicate);
5326}
5327
5328void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
5329 // If we aren't vectorizing the loop, or if we've already collected the
5330 // instructions to scalarize, there's nothing to do. Collection may already
5331 // have occurred if we have a user-selected VF and are now computing the
5332 // expected cost for interleaving.
5333 if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(Val: VF))
5334 return;
5335
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
5339 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5340
5341 PredicatedBBsAfterVectorization[VF].clear();
5342
5343 // Find all the instructions that are scalar with predication in the loop and
5344 // determine if it would be better to not if-convert the blocks they are in.
5345 // If so, we also record the instructions to scalarize.
5346 for (BasicBlock *BB : TheLoop->blocks()) {
5347 if (!blockNeedsPredicationForAnyReason(BB))
5348 continue;
5349 for (Instruction &I : *BB)
5350 if (isScalarWithPredication(I: &I, VF)) {
5351 ScalarCostsTy ScalarCosts;
5352 // Do not apply discount logic for:
5353 // 1. Scalars after vectorization, as there will only be a single copy
5354 // of the instruction.
5355 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5356 // 3. Emulated masked memrefs, if a hacked cost is needed.
5357 if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
5358 !useEmulatedMaskMemRefHack(I: &I, VF) &&
5359 computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0)
5360 ScalarCostsVF.insert(I: ScalarCosts.begin(), E: ScalarCosts.end());
5361 // Remember that BB will remain after vectorization.
5362 PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
5363 for (auto *Pred : predecessors(BB)) {
5364 if (Pred->getSingleSuccessor() == BB)
5365 PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
5366 }
5367 }
5368 }
5369}
5370
5371InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5372 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
5373 assert(!isUniformAfterVectorization(PredInst, VF) &&
5374 "Instruction marked uniform-after-vectorization will be predicated");
5375
5376 // Initialize the discount to zero, meaning that the scalar version and the
5377 // vector version cost the same.
5378 InstructionCost Discount = 0;
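  // For illustration only: if, for some instruction in the chain, the vector
  // cost were 10 and the probability-scaled scalar cost were 6, that
  // instruction would contribute +4 to the discount. A non-negative total
  // discount means scalarizing the whole chain is expected to be no worse
  // than keeping it vectorized (see the caller's ">= 0" check).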
5379
5380 // Holds instructions to analyze. The instructions we visit are mapped in
5381 // ScalarCosts. Those instructions are the ones that would be scalarized if
5382 // we find that the scalar version costs less.
5383 SmallVector<Instruction *, 8> Worklist;
5384
5385 // Returns true if the given instruction can be scalarized.
5386 auto canBeScalarized = [&](Instruction *I) -> bool {
5387 // We only attempt to scalarize instructions forming a single-use chain
5388 // from the original predicated block that would otherwise be vectorized.
5389 // Although not strictly necessary, we give up on instructions we know will
5390 // already be scalar to avoid traversing chains that are unlikely to be
5391 // beneficial.
5392 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5393 isScalarAfterVectorization(I, VF))
5394 return false;
5395
5396 // If the instruction is scalar with predication, it will be analyzed
5397 // separately. We ignore it within the context of PredInst.
5398 if (isScalarWithPredication(I, VF))
5399 return false;
5400
5401 // If any of the instruction's operands are uniform after vectorization,
5402 // the instruction cannot be scalarized. This prevents, for example, a
5403 // masked load from being scalarized.
5404 //
5405 // We assume we will only emit a value for lane zero of an instruction
5406 // marked uniform after vectorization, rather than VF identical values.
5407 // Thus, if we scalarize an instruction that uses a uniform, we would
5408 // create uses of values corresponding to the lanes we aren't emitting code
5409 // for. This behavior can be changed by allowing getScalarValue to clone
5410 // the lane zero values for uniforms rather than asserting.
5411 for (Use &U : I->operands())
5412 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
5413 if (isUniformAfterVectorization(I: J, VF))
5414 return false;
5415
5416 // Otherwise, we can scalarize the instruction.
5417 return true;
5418 };
5419
5420 // Compute the expected cost discount from scalarizing the entire expression
5421 // feeding the predicated instruction. We currently only consider expressions
5422 // that are single-use instruction chains.
5423 Worklist.push_back(Elt: PredInst);
5424 while (!Worklist.empty()) {
5425 Instruction *I = Worklist.pop_back_val();
5426
5427 // If we've already analyzed the instruction, there's nothing to do.
5428 if (ScalarCosts.contains(Val: I))
5429 continue;
5430
5431 // Compute the cost of the vector instruction. Note that this cost already
5432 // includes the scalarization overhead of the predicated instruction.
5433 InstructionCost VectorCost = getInstructionCost(I, VF);
5434
5435 // Compute the cost of the scalarized instruction. This cost is the cost of
5436 // the instruction as if it wasn't if-converted and instead remained in the
5437 // predicated block. We will scale this cost by block probability after
5438 // computing the scalarization overhead.
5439 InstructionCost ScalarCost =
5440 VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));
5441
5442 // Compute the scalarization overhead of needed insertelement instructions
5443 // and phi nodes.
5444 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5445 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
5446 ScalarCost += TTI.getScalarizationOverhead(
5447 Ty: cast<VectorType>(Val: ToVectorTy(Scalar: I->getType(), EC: VF)),
5448 DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ true,
5449 /*Extract*/ false, CostKind);
5450 ScalarCost +=
5451 VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
5452 }
5453
5454 // Compute the scalarization overhead of needed extractelement
5455 // instructions. For each of the instruction's operands, if the operand can
5456 // be scalarized, add it to the worklist; otherwise, account for the
5457 // overhead.
5458 for (Use &U : I->operands())
5459 if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
5460 assert(VectorType::isValidElementType(J->getType()) &&
5461 "Instruction has non-scalar type");
5462 if (canBeScalarized(J))
5463 Worklist.push_back(Elt: J);
5464 else if (needsExtract(V: J, VF)) {
5465 ScalarCost += TTI.getScalarizationOverhead(
5466 Ty: cast<VectorType>(Val: ToVectorTy(Scalar: J->getType(), EC: VF)),
5467 DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
5468 /*Extract*/ true, CostKind);
5469 }
5470 }
5471
5472 // Scale the total scalar cost by block probability.
5473 ScalarCost /= getReciprocalPredBlockProb();
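    // getReciprocalPredBlockProb() is the assumed reciprocal of the
    // predicated block's execution probability. For example, if it is 2, a
    // raw scalar cost of 8 is accounted as 4, reflecting that the block is
    // only expected to execute on roughly half of the scalar iterations.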
5474
5475 // Compute the discount. A non-negative discount means the vector version
5476 // of the instruction costs more, and scalarizing would be beneficial.
5477 Discount += VectorCost - ScalarCost;
5478 ScalarCosts[I] = ScalarCost;
5479 }
5480
5481 return Discount;
5482}
5483
5484InstructionCost LoopVectorizationCostModel::expectedCost(
5485 ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5486 InstructionCost Cost;
5487
5488 // For each block.
5489 for (BasicBlock *BB : TheLoop->blocks()) {
5490 InstructionCost BlockCost;
5491
5492 // For each instruction in the old loop.
5493 for (Instruction &I : BB->instructionsWithoutDebug()) {
5494 // Skip ignored values.
5495 if (ValuesToIgnore.count(Ptr: &I) ||
5496 (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
5497 continue;
5498
5499 InstructionCost C = getInstructionCost(I: &I, VF);
5500
5501 // Check if we should override the cost.
5502 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5503 C = InstructionCost(ForceTargetInstructionCost);
5504
5505 // Keep a list of instructions with invalid costs.
5506 if (Invalid && !C.isValid())
5507 Invalid->emplace_back(Args: &I, Args&: VF);
5508
5509 BlockCost += C;
5510 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5511 << VF << " For instruction: " << I << '\n');
5512 }
5513
5514 // If we are vectorizing a predicated block, it will have been
5515 // if-converted. This means that the block's instructions (aside from
5516 // stores and instructions that may divide by zero) will now be
5517 // unconditionally executed. For the scalar case, we may not always execute
5518 // the predicated block, if it is an if-else block. Thus, scale the block's
5519 // cost by the probability of executing it. blockNeedsPredication from
5520 // Legal is used so as to not include all blocks in tail folded loops.
5521 if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5522 BlockCost /= getReciprocalPredBlockProb();
5523
5524 Cost += BlockCost;
5525 }
5526
5527 return Cost;
5528}
5529
5530/// Gets Address Access SCEV after verifying that the access pattern
5531/// is loop invariant except the induction variable dependence.
5532///
5533/// This SCEV can be sent to the Target in order to estimate the address
5534/// calculation cost.
5535static const SCEV *getAddressAccessSCEV(
5536 Value *Ptr,
5537 LoopVectorizationLegality *Legal,
5538 PredicatedScalarEvolution &PSE,
5539 const Loop *TheLoop) {
5540
5541 auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr);
5542 if (!Gep)
5543 return nullptr;
5544
5545 // We are looking for a gep with all loop invariant indices except for one
5546 // which should be an induction variable.
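  // A sketch of the kind of GEP accepted here (illustrative IR only):
  //   %gep = getelementptr i32, ptr %base, i64 %iv
  // or
  //   %gep = getelementptr [100 x i32], ptr %base, i64 %inv, i64 %iv
  // where %inv is loop-invariant and %iv is an induction variable. Any other
  // index that is neither loop-invariant nor an induction variable makes us
  // return nullptr.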
5547 auto SE = PSE.getSE();
5548 unsigned NumOperands = Gep->getNumOperands();
5549 for (unsigned i = 1; i < NumOperands; ++i) {
5550 Value *Opd = Gep->getOperand(i_nocapture: i);
5551 if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) &&
5552 !Legal->isInductionVariable(V: Opd))
5553 return nullptr;
5554 }
5555
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
5557 return PSE.getSCEV(V: Ptr);
5558}
5559
5560InstructionCost
5561LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5562 ElementCount VF) {
5563 assert(VF.isVector() &&
5564 "Scalarization cost of instruction implies vectorization.");
5565 if (VF.isScalable())
5566 return InstructionCost::getInvalid();
5567
5568 Type *ValTy = getLoadStoreType(I);
5569 auto SE = PSE.getSE();
5570
5571 unsigned AS = getLoadStoreAddressSpace(I);
5572 Value *Ptr = getLoadStorePointerOperand(V: I);
5573 Type *PtrTy = ToVectorTy(Scalar: Ptr->getType(), EC: VF);
5574 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5575 // that it is being called from this specific place.
5576
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
5579 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5580
5581 // Get the cost of the scalar memory instruction and address computation.
5582 InstructionCost Cost =
5583 VF.getKnownMinValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV);
5584
5585 // Don't pass *I here, since it is scalar but will actually be part of a
5586 // vectorized loop where the user of it is a vectorized instruction.
5587 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5588 const Align Alignment = getLoadStoreAlignment(I);
5589 Cost += VF.getKnownMinValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(),
5590 Src: ValTy->getScalarType(),
5591 Alignment, AddressSpace: AS, CostKind);
5592
5593 // Get the overhead of the extractelement and insertelement instructions
5594 // we might create due to scalarization.
5595 Cost += getScalarizationOverhead(I, VF, CostKind);
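  // At this point the scalarization cost is roughly
  //   VF * (address computation + scalar memory op) + insert/extract
  // overhead. For predicated accesses it is further scaled below by the
  // predicated block probability and extended with i1 extract and branch
  // costs.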
5596
5597 // If we have a predicated load/store, it will need extra i1 extracts and
5598 // conditional branches, but may not be executed for each vector lane. Scale
5599 // the cost by the probability of executing the predicated block.
5600 if (isPredicatedInst(I)) {
5601 Cost /= getReciprocalPredBlockProb();
5602
5603 // Add the cost of an i1 extract and a branch
5604 auto *Vec_i1Ty =
5605 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
5606 Cost += TTI.getScalarizationOverhead(
5607 Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
5608 /*Insert=*/false, /*Extract=*/true, CostKind);
5609 Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
5610
5611 if (useEmulatedMaskMemRefHack(I, VF))
5612 // Artificially setting to a high enough value to practically disable
5613 // vectorization with such operations.
5614 Cost = 3000000;
5615 }
5616
5617 return Cost;
5618}
5619
5620InstructionCost
5621LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5622 ElementCount VF) {
5623 Type *ValTy = getLoadStoreType(I);
5624 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
5625 Value *Ptr = getLoadStorePointerOperand(V: I);
5626 unsigned AS = getLoadStoreAddressSpace(I);
5627 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5628 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5629
5630 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5631 "Stride should be 1 or -1 for consecutive memory access");
5632 const Align Alignment = getLoadStoreAlignment(I);
5633 InstructionCost Cost = 0;
5634 if (Legal->isMaskRequired(I)) {
5635 Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5636 CostKind);
5637 } else {
5638 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5639 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5640 CostKind, OpdInfo: OpInfo, I);
5641 }
5642
5643 bool Reverse = ConsecutiveStride < 0;
5644 if (Reverse)
5645 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy,
5646 Mask: std::nullopt, CostKind, Index: 0);
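  // A reverse consecutive access is modeled as the wide (possibly masked)
  // memory operation above plus one SK_Reverse shuffle to flip the lanes,
  // e.g. a stride of -1 over i32 with VF=4 adds the cost of a 4-lane
  // reverse shuffle.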
5647 return Cost;
5648}
5649
5650InstructionCost
5651LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5652 ElementCount VF) {
5653 assert(Legal->isUniformMemOp(*I, VF));
5654
5655 Type *ValTy = getLoadStoreType(I);
5656 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
5657 const Align Alignment = getLoadStoreAlignment(I);
5658 unsigned AS = getLoadStoreAddressSpace(I);
5659 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
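  // Cost sketch for a uniform (loop-invariant address) memory op: a load is
  // one scalar load plus a broadcast of the loaded value; a store is one
  // scalar store plus, when the stored value is not loop-invariant, an
  // extract of the last lane of the value vector.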
5660 if (isa<LoadInst>(Val: I)) {
5661 return TTI.getAddressComputationCost(Ty: ValTy) +
5662 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5663 CostKind) +
5664 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, Tp: VectorTy);
5665 }
5666 StoreInst *SI = cast<StoreInst>(Val: I);
5667
5668 bool isLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5669 return TTI.getAddressComputationCost(Ty: ValTy) +
5670 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS,
5671 CostKind) +
5672 (isLoopInvariantStoreValue
5673 ? 0
5674 : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy,
5675 CostKind, Index: VF.getKnownMinValue() - 1));
5676}
5677
5678InstructionCost
5679LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5680 ElementCount VF) {
5681 Type *ValTy = getLoadStoreType(I);
5682 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
5683 const Align Alignment = getLoadStoreAlignment(I);
5684 const Value *Ptr = getLoadStorePointerOperand(V: I);
5685
5686 return TTI.getAddressComputationCost(Ty: VectorTy) +
5687 TTI.getGatherScatterOpCost(
5688 Opcode: I->getOpcode(), DataTy: VectorTy, Ptr, VariableMask: Legal->isMaskRequired(I), Alignment,
5689 CostKind: TargetTransformInfo::TCK_RecipThroughput, I);
5690}
5691
5692InstructionCost
5693LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5694 ElementCount VF) {
5695 Type *ValTy = getLoadStoreType(I);
5696 auto *VectorTy = cast<VectorType>(Val: ToVectorTy(Scalar: ValTy, EC: VF));
5697 unsigned AS = getLoadStoreAddressSpace(I);
5698 enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5699
5700 auto Group = getInterleavedAccessGroup(Instr: I);
5701 assert(Group && "Fail to get an interleaved access group.");
5702
5703 unsigned InterleaveFactor = Group->getFactor();
5704 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
5705
5706 // Holds the indices of existing members in the interleaved group.
5707 SmallVector<unsigned, 4> Indices;
5708 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5709 if (Group->getMember(Index: IF))
5710 Indices.push_back(Elt: IF);
5711
5712 // Calculate the cost of the whole interleaved group.
5713 bool UseMaskForGaps =
5714 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5715 (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()));
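  // Example: for a factor-3 group where only members 0 and 2 exist (say
  // A[3*i] and A[3*i+2]), Indices is {0, 2} and the group is costed as a
  // single wide access of VF * 3 elements. For stores, such a gap forces
  // UseMaskForGaps above, as does requiring a scalar epilogue when one is
  // not allowed.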
5716 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5717 Opcode: I->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices, Alignment: Group->getAlign(),
5718 AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I), UseMaskForGaps);
5719
5720 if (Group->isReverse()) {
5721 // TODO: Add support for reversed masked interleaved access.
5722 assert(!Legal->isMaskRequired(I) &&
5723 "Reverse masked interleaved access not supported.");
5724 Cost += Group->getNumMembers() *
5725 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, Tp: VectorTy,
5726 Mask: std::nullopt, CostKind, Index: 0);
5727 }
5728 return Cost;
5729}
5730
5731std::optional<InstructionCost>
5732LoopVectorizationCostModel::getReductionPatternCost(
5733 Instruction *I, ElementCount VF, Type *Ty,
5734 TTI::TargetCostKind CostKind) const {
5735 using namespace llvm::PatternMatch;
5736 // Early exit for no inloop reductions
5737 if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
5738 return std::nullopt;
5739 auto *VectorTy = cast<VectorType>(Val: Ty);
5740
  // We are looking for one of the following patterns, taking the minimal
  // acceptable cost among them:
5742 // reduce(mul(ext(A), ext(B))) or
5743 // reduce(mul(A, B)) or
5744 // reduce(ext(A)) or
5745 // reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we find the
  // pattern of mul/ext and test the cost of the entire pattern vs the cost of
  // the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
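  // For illustration, the most profitable case above corresponds to scalar
  // IR along the lines of (sketch only):
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul   = mul i32 %a.ext, %b.ext
  //   %sum   = add i32 %phi, %mul     ; in-loop reduction chain
  // which many targets can cost as a single multiply-accumulate style
  // reduction via TTI::getMulAccReductionCost below.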
5753 Instruction *RetI = I;
5754 if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
5755 if (!RetI->hasOneUser())
5756 return std::nullopt;
5757 RetI = RetI->user_back();
5758 }
5759
5760 if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
5761 RetI->user_back()->getOpcode() == Instruction::Add) {
5762 RetI = RetI->user_back();
5763 }
5764
  // Test if the found instruction is a reduction and, if not, return an
  // invalid cost so that the parent uses the original cost modelling.
5767 if (!InLoopReductionImmediateChains.count(Val: RetI))
5768 return std::nullopt;
5769
5770 // Find the reduction this chain is a part of and calculate the basic cost of
5771 // the reduction on its own.
5772 Instruction *LastChain = InLoopReductionImmediateChains.at(Val: RetI);
5773 Instruction *ReductionPhi = LastChain;
5774 while (!isa<PHINode>(Val: ReductionPhi))
5775 ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);
5776
5777 const RecurrenceDescriptor &RdxDesc =
5778 Legal->getReductionVars().find(Key: cast<PHINode>(Val: ReductionPhi))->second;
5779
5780 InstructionCost BaseCost;
5781 RecurKind RK = RdxDesc.getRecurrenceKind();
5782 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
5783 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
5784 BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
5785 FMF: RdxDesc.getFastMathFlags(), CostKind);
5786 } else {
5787 BaseCost = TTI.getArithmeticReductionCost(
5788 Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
5789 }
5790
5791 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
5792 // normal fmul instruction to the cost of the fadd reduction.
5793 if (RK == RecurKind::FMulAdd)
5794 BaseCost +=
5795 TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);
5796
5797 // If we're using ordered reductions then we can just return the base cost
5798 // here, since getArithmeticReductionCost calculates the full ordered
5799 // reduction cost when FP reassociation is not allowed.
5800 if (useOrderedReductions(RdxDesc))
5801 return BaseCost;
5802
5803 // Get the operand that was not the reduction chain and match it to one of the
5804 // patterns, returning the better cost if it is found.
5805 Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
5806 ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
5807 : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));
5808
5809 VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);
5810
5811 Instruction *Op0, *Op1;
5812 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5813 match(V: RedOp,
5814 P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
5815 match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
5816 Op0->getOpcode() == Op1->getOpcode() &&
5817 Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
5818 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
5819 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
5820
    // Matched reduce.add(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes all need to match; or, if A==B, they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known
    // positive, which is equally fine.
5825 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
5826 auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
5827 auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);
5828
5829 InstructionCost ExtCost =
5830 TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
5831 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
5832 InstructionCost MulCost =
5833 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
5834 InstructionCost Ext2Cost =
5835 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
5836 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
5837
5838 InstructionCost RedCost = TTI.getMulAccReductionCost(
5839 IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
5840
5841 if (RedCost.isValid() &&
5842 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
5843 return I == RetI ? RedCost : 0;
5844 } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
5845 !TheLoop->isLoopInvariant(V: RedOp)) {
5846 // Matched reduce(ext(A))
5847 bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
5848 auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
5849 InstructionCost RedCost = TTI.getExtendedReductionCost(
5850 Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
5851 FMF: RdxDesc.getFastMathFlags(), CostKind);
5852
5853 InstructionCost ExtCost =
5854 TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
5855 CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
5856 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
5857 return I == RetI ? RedCost : 0;
5858 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
5859 match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
5860 if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
5861 Op0->getOpcode() == Op1->getOpcode() &&
5862 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
5863 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
5864 Type *Op0Ty = Op0->getOperand(i: 0)->getType();
5865 Type *Op1Ty = Op1->getOperand(i: 0)->getType();
5866 Type *LargestOpTy =
5867 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
5868 : Op0Ty;
5869 auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);
5870
5871 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
5872 // different sizes. We take the largest type as the ext to reduce, and add
5873 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
5874 InstructionCost ExtCost0 = TTI.getCastInstrCost(
5875 Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
5876 CCH: TTI::CastContextHint::None, CostKind, I: Op0);
5877 InstructionCost ExtCost1 = TTI.getCastInstrCost(
5878 Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
5879 CCH: TTI::CastContextHint::None, CostKind, I: Op1);
5880 InstructionCost MulCost =
5881 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
5882
5883 InstructionCost RedCost = TTI.getMulAccReductionCost(
5884 IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
5885 InstructionCost ExtraExtCost = 0;
5886 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
5887 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
5888 ExtraExtCost = TTI.getCastInstrCost(
5889 Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
5890 Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
5891 CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
5892 }
5893
5894 if (RedCost.isValid() &&
5895 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
5896 return I == RetI ? RedCost : 0;
5897 } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
5898 // Matched reduce.add(mul())
5899 InstructionCost MulCost =
5900 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
5901
5902 InstructionCost RedCost = TTI.getMulAccReductionCost(
5903 IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind);
5904
5905 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
5906 return I == RetI ? RedCost : 0;
5907 }
5908 }
5909
5910 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
5911}
5912
5913InstructionCost
5914LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5915 ElementCount VF) {
5916 // Calculate scalar cost only. Vectorization cost should be ready at this
5917 // moment.
5918 if (VF.isScalar()) {
5919 Type *ValTy = getLoadStoreType(I);
5920 const Align Alignment = getLoadStoreAlignment(I);
5921 unsigned AS = getLoadStoreAddressSpace(I);
5922
5923 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5924 return TTI.getAddressComputationCost(Ty: ValTy) +
5925 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS,
5926 CostKind: TTI::TCK_RecipThroughput, OpdInfo: OpInfo, I);
5927 }
5928 return getWideningCost(I, VF);
5929}
5930
5931InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
5932 Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
5933
5934 // There is no mechanism yet to create a scalable scalarization loop,
5935 // so this is currently Invalid.
5936 if (VF.isScalable())
5937 return InstructionCost::getInvalid();
5938
5939 if (VF.isScalar())
5940 return 0;
5941
5942 InstructionCost Cost = 0;
5943 Type *RetTy = ToVectorTy(Scalar: I->getType(), EC: VF);
5944 if (!RetTy->isVoidTy() &&
5945 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore()))
5946 Cost += TTI.getScalarizationOverhead(
5947 Ty: cast<VectorType>(Val: RetTy), DemandedElts: APInt::getAllOnes(numBits: VF.getKnownMinValue()),
5948 /*Insert*/ true,
5949 /*Extract*/ false, CostKind);
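  // The insert-side overhead above models re-packing the VF scalar results
  // into the vector return value. Unless the target keeps addresses scalar
  // or supports efficient vector element loads/stores, the rest of this
  // function adds the extract-side overhead of obtaining scalar operands
  // from their (possibly vectorized) definitions.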
5950
5951 // Some targets keep addresses scalar.
5952 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
5953 return Cost;
5954
5955 // Some targets support efficient element stores.
5956 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
5957 return Cost;
5958
5959 // Collect operands to consider.
5960 CallInst *CI = dyn_cast<CallInst>(Val: I);
5961 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5962
5963 // Skip operands that do not require extraction/scalarization and do not incur
5964 // any overhead.
5965 SmallVector<Type *> Tys;
5966 for (auto *V : filterExtractingOperands(Ops, VF))
5967 Tys.push_back(Elt: MaybeVectorizeType(Elt: V->getType(), VF));
5968 return Cost + TTI.getOperandsScalarizationOverhead(
5969 Args: filterExtractingOperands(Ops, VF), Tys, CostKind);
5970}
5971
5972void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
5973 if (VF.isScalar())
5974 return;
5975 NumPredStores = 0;
5976 for (BasicBlock *BB : TheLoop->blocks()) {
5977 // For each instruction in the old loop.
5978 for (Instruction &I : *BB) {
5979 Value *Ptr = getLoadStorePointerOperand(V: &I);
5980 if (!Ptr)
5981 continue;
5982
5983 // TODO: We should generate better code and update the cost model for
5984 // predicated uniform stores. Today they are treated as any other
5985 // predicated store (see added test cases in
5986 // invariant-store-vectorization.ll).
5987 if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
5988 NumPredStores++;
5989
5990 if (Legal->isUniformMemOp(I, VF)) {
5991 auto isLegalToScalarize = [&]() {
5992 if (!VF.isScalable())
5993 // Scalarization of fixed length vectors "just works".
5994 return true;
5995
5996 // We have dedicated lowering for unpredicated uniform loads and
5997 // stores. Note that even with tail folding we know that at least
5998 // one lane is active (i.e. generalized predication is not possible
5999 // here), and the logic below depends on this fact.
6000 if (!foldTailByMasking())
6001 return true;
6002
6003 // For scalable vectors, a uniform memop load is always
6004 // uniform-by-parts and we know how to scalarize that.
6005 if (isa<LoadInst>(Val: I))
6006 return true;
6007
          // A uniform store isn't necessarily uniform-by-parts,
          // so we can't assume scalarization.
6010 auto &SI = cast<StoreInst>(Val&: I);
6011 return TheLoop->isLoopInvariant(V: SI.getValueOperand());
6012 };
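        // In short: fixed-width VFs can always be scalarized; for scalable
        // VFs we scalarize only when no tail folding is needed, or when the
        // access is a load, or a store of a loop-invariant value.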
6013
6014 const InstructionCost GatherScatterCost =
6015 isLegalGatherOrScatter(V: &I, VF) ?
6016 getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();
6017
6018 // Load: Scalar load + broadcast
6019 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6020 // FIXME: This cost is a significant under-estimate for tail folded
6021 // memory ops.
6022 const InstructionCost ScalarizationCost = isLegalToScalarize() ?
6023 getUniformMemOpCost(I: &I, VF) : InstructionCost::getInvalid();
6024
        // Choose the better solution for the current VF. Note that invalid
        // costs compare as maximally large. If both are invalid, we fall
        // back to scalarization with an invalid cost, which signals a
        // failure and a vectorization abort.
6028 if (GatherScatterCost < ScalarizationCost)
6029 setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
6030 else
6031 setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
6032 continue;
6033 }
6034
6035 // We assume that widening is the best solution when possible.
6036 if (memoryInstructionCanBeWidened(I: &I, VF)) {
6037 InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
6038 int ConsecutiveStride = Legal->isConsecutivePtr(
6039 AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
6040 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6041 "Expected consecutive stride.");
6042 InstWidening Decision =
6043 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6044 setWideningDecision(I: &I, VF, W: Decision, Cost);
6045 continue;
6046 }
6047
6048 // Choose between Interleaving, Gather/Scatter or Scalarization.
6049 InstructionCost InterleaveCost = InstructionCost::getInvalid();
6050 unsigned NumAccesses = 1;
6051 if (isAccessInterleaved(Instr: &I)) {
6052 auto Group = getInterleavedAccessGroup(Instr: &I);
6053 assert(Group && "Fail to get an interleaved access group.");
6054
6055 // Make one decision for the whole group.
6056 if (getWideningDecision(I: &I, VF) != CM_Unknown)
6057 continue;
6058
6059 NumAccesses = Group->getNumMembers();
6060 if (interleavedAccessCanBeWidened(I: &I, VF))
6061 InterleaveCost = getInterleaveGroupCost(I: &I, VF);
6062 }
6063
6064 InstructionCost GatherScatterCost =
6065 isLegalGatherOrScatter(V: &I, VF)
6066 ? getGatherScatterCost(I: &I, VF) * NumAccesses
6067 : InstructionCost::getInvalid();
6068
6069 InstructionCost ScalarizationCost =
6070 getMemInstScalarizationCost(I: &I, VF) * NumAccesses;
6071
      // Choose the better solution for the current VF, record the decision,
      // and use it during vectorization.
6074 InstructionCost Cost;
6075 InstWidening Decision;
6076 if (InterleaveCost <= GatherScatterCost &&
6077 InterleaveCost < ScalarizationCost) {
6078 Decision = CM_Interleave;
6079 Cost = InterleaveCost;
6080 } else if (GatherScatterCost < ScalarizationCost) {
6081 Decision = CM_GatherScatter;
6082 Cost = GatherScatterCost;
6083 } else {
6084 Decision = CM_Scalarize;
6085 Cost = ScalarizationCost;
6086 }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6090 if (auto Group = getInterleavedAccessGroup(Instr: &I))
6091 setWideningDecision(Grp: Group, VF, W: Decision, Cost);
6092 else
6093 setWideningDecision(I: &I, VF, W: Decision, Cost);
6094 }
6095 }
6096
6097 // Make sure that any load of address and any other address computation
6098 // remains scalar unless there is gather/scatter support. This avoids
6099 // inevitable extracts into address registers, and also has the benefit of
6100 // activating LSR more, since that pass can't optimize vectorized
6101 // addresses.
6102 if (TTI.prefersVectorizedAddressing())
6103 return;
6104
6105 // Start with all scalar pointer uses.
6106 SmallPtrSet<Instruction *, 8> AddrDefs;
6107 for (BasicBlock *BB : TheLoop->blocks())
6108 for (Instruction &I : *BB) {
6109 Instruction *PtrDef =
6110 dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
6111 if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
6112 getWideningDecision(I: &I, VF) != CM_GatherScatter)
6113 AddrDefs.insert(Ptr: PtrDef);
6114 }
6115
6116 // Add all instructions used to generate the addresses.
6117 SmallVector<Instruction *, 4> Worklist;
6118 append_range(C&: Worklist, R&: AddrDefs);
6119 while (!Worklist.empty()) {
6120 Instruction *I = Worklist.pop_back_val();
6121 for (auto &Op : I->operands())
6122 if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
6123 if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) &&
6124 AddrDefs.insert(Ptr: InstOp).second)
6125 Worklist.push_back(Elt: InstOp);
6126 }
6127
6128 for (auto *I : AddrDefs) {
6129 if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
6134 InstWidening Decision = getWideningDecision(I, VF);
6135 if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6136 // Scalarize a widened load of address.
6137 setWideningDecision(
6138 I, VF, W: CM_Scalarize,
6139 Cost: (VF.getKnownMinValue() *
6140 getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
6141 else if (auto Group = getInterleavedAccessGroup(Instr: I)) {
6142 // Scalarize an interleave group of address loads.
6143 for (unsigned I = 0; I < Group->getFactor(); ++I) {
6144 if (Instruction *Member = Group->getMember(Index: I))
6145 setWideningDecision(
6146 I: Member, VF, W: CM_Scalarize,
6147 Cost: (VF.getKnownMinValue() *
6148 getMemoryInstructionCost(I: Member, VF: ElementCount::getFixed(MinVal: 1))));
6149 }
6150 }
6151 } else
6152 // Make sure I gets scalarized and a cost estimate without
6153 // scalarization overhead.
6154 ForcedScalars[VF].insert(Ptr: I);
6155 }
6156}
6157
6158void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6159 assert(!VF.isScalar() &&
6160 "Trying to set a vectorization decision for a scalar VF");
6161
6162 for (BasicBlock *BB : TheLoop->blocks()) {
6163 // For each instruction in the old loop.
6164 for (Instruction &I : *BB) {
6165 CallInst *CI = dyn_cast<CallInst>(Val: &I);
6166
6167 if (!CI)
6168 continue;
6169
6170 InstructionCost ScalarCost = InstructionCost::getInvalid();
6171 InstructionCost VectorCost = InstructionCost::getInvalid();
6172 InstructionCost IntrinsicCost = InstructionCost::getInvalid();
6173 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6174
6175 Function *ScalarFunc = CI->getCalledFunction();
6176 Type *ScalarRetTy = CI->getType();
6177 SmallVector<Type *, 4> Tys, ScalarTys;
6178 bool MaskRequired = Legal->isMaskRequired(I: CI);
6179 for (auto &ArgOp : CI->args())
6180 ScalarTys.push_back(Elt: ArgOp->getType());
6181
6182 // Compute corresponding vector type for return value and arguments.
6183 Type *RetTy = ToVectorTy(Scalar: ScalarRetTy, EC: VF);
6184 for (Type *ScalarTy : ScalarTys)
6185 Tys.push_back(Elt: ToVectorTy(Scalar: ScalarTy, EC: VF));
6186
6187 // An in-loop reduction using an fmuladd intrinsic is a special case;
6188 // we don't want the normal cost for that intrinsic.
6189 if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
6190 if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy, CostKind)) {
6191 setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
6192 IID: getVectorIntrinsicIDForCall(CI, TLI),
6193 MaskPos: std::nullopt, Cost: *RedCost);
6194 continue;
6195 }
6196
6197 // Estimate cost of scalarized vector call. The source operands are
6198 // assumed to be vectors, so we need to extract individual elements from
6199 // there, execute VF scalar calls, and then gather the result into the
6200 // vector return value.
6201 InstructionCost ScalarCallCost =
6202 TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);
6203
6204 // Compute costs of unpacking argument values for the scalar calls and
6205 // packing the return values to a vector.
6206 InstructionCost ScalarizationCost =
6207 getScalarizationOverhead(I: CI, VF, CostKind);
6208
6209 ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
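      // As a sketch: for VF=4, a scalar call cost of 10 and a scalarization
      // overhead of 6 give ScalarCost = 4 * 10 + 6 = 46. This is compared
      // below against the vector-variant and intrinsic costs, and the
      // cheapest option wins (ties prefer the vector call over
      // scalarization, and the intrinsic call over both). Numbers here are
      // illustrative only.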
6210
6211 // Find the cost of vectorizing the call, if we can find a suitable
6212 // vector variant of the function.
6213 bool UsesMask = false;
6214 VFInfo FuncInfo;
6215 Function *VecFunc = nullptr;
6216 // Search through any available variants for one we can use at this VF.
6217 for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
6218 // Must match requested VF.
6219 if (Info.Shape.VF != VF)
6220 continue;
6221
6222 // Must take a mask argument if one is required
6223 if (MaskRequired && !Info.isMasked())
6224 continue;
6225
6226 // Check that all parameter kinds are supported
6227 bool ParamsOk = true;
6228 for (VFParameter Param : Info.Shape.Parameters) {
6229 switch (Param.ParamKind) {
6230 case VFParamKind::Vector:
6231 break;
6232 case VFParamKind::OMP_Uniform: {
6233 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
6234 // Make sure the scalar parameter in the loop is invariant.
6235 if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
6236 L: TheLoop))
6237 ParamsOk = false;
6238 break;
6239 }
6240 case VFParamKind::OMP_Linear: {
6241 Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
6242 // Find the stride for the scalar parameter in this loop and see if
6243 // it matches the stride for the variant.
6244 // TODO: do we need to figure out the cost of an extract to get the
6245 // first lane? Or do we hope that it will be folded away?
6246 ScalarEvolution *SE = PSE.getSE();
6247 const auto *SAR =
6248 dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam));
6249
6250 if (!SAR || SAR->getLoop() != TheLoop) {
6251 ParamsOk = false;
6252 break;
6253 }
6254
6255 const SCEVConstant *Step =
6256 dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE));
6257
6258 if (!Step ||
6259 Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
6260 ParamsOk = false;
6261
6262 break;
6263 }
6264 case VFParamKind::GlobalPredicate:
6265 UsesMask = true;
6266 break;
6267 default:
6268 ParamsOk = false;
6269 break;
6270 }
6271 }
6272
6273 if (!ParamsOk)
6274 continue;
6275
6276 // Found a suitable candidate, stop here.
6277 VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
6278 FuncInfo = Info;
6279 break;
6280 }
6281
6282 // Add in the cost of synthesizing a mask if one wasn't required.
6283 InstructionCost MaskCost = 0;
6284 if (VecFunc && UsesMask && !MaskRequired)
6285 MaskCost = TTI.getShuffleCost(
6286 Kind: TargetTransformInfo::SK_Broadcast,
6287 Tp: VectorType::get(ElementType: IntegerType::getInt1Ty(
6288 C&: VecFunc->getFunctionType()->getContext()),
6289 EC: VF));
6290
6291 if (TLI && VecFunc && !CI->isNoBuiltin())
6292 VectorCost =
6293 TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind) + MaskCost;
6294
6295 // Find the cost of an intrinsic; some targets may have instructions that
6296 // perform the operation without needing an actual call.
6297 Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
6298 if (IID != Intrinsic::not_intrinsic)
6299 IntrinsicCost = getVectorIntrinsicCost(CI, VF);
6300
6301 InstructionCost Cost = ScalarCost;
6302 InstWidening Decision = CM_Scalarize;
6303
6304 if (VectorCost <= Cost) {
6305 Cost = VectorCost;
6306 Decision = CM_VectorCall;
6307 }
6308
6309 if (IntrinsicCost <= Cost) {
6310 Cost = IntrinsicCost;
6311 Decision = CM_IntrinsicCall;
6312 }
6313
6314 setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
6315 MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
6316 }
6317 }
6318}
6319
6320InstructionCost
6321LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6322 ElementCount VF) {
6323 // If we know that this instruction will remain uniform, check the cost of
6324 // the scalar version.
6325 if (isUniformAfterVectorization(I, VF))
6326 VF = ElementCount::getFixed(MinVal: 1);
6327
6328 if (VF.isVector() && isProfitableToScalarize(I, VF))
6329 return InstsToScalarize[VF][I];
6330
6331 // Forced scalars do not have any scalarization overhead.
6332 auto ForcedScalar = ForcedScalars.find(Val: VF);
6333 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6334 auto InstSet = ForcedScalar->second;
6335 if (InstSet.count(Ptr: I))
6336 return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
6337 VF.getKnownMinValue();
6338 }
6339
6340 Type *RetTy = I->getType();
6341 if (canTruncateToMinimalBitwidth(I, VF))
6342 RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
6343 auto SE = PSE.getSE();
6344 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6345
6346 auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6347 ElementCount VF) -> bool {
6348 if (VF.isScalar())
6349 return true;
6350
6351 auto Scalarized = InstsToScalarize.find(Val: VF);
6352 assert(Scalarized != InstsToScalarize.end() &&
6353 "VF not yet analyzed for scalarization profitability");
6354 return !Scalarized->second.count(Val: I) &&
6355 llvm::all_of(Range: I->users(), P: [&](User *U) {
6356 auto *UI = cast<Instruction>(Val: U);
6357 return !Scalarized->second.count(Val: UI);
6358 });
6359 };
6360 (void) hasSingleCopyAfterVectorization;
6361
6362 Type *VectorTy;
6363 if (isScalarAfterVectorization(I, VF)) {
6364 // With the exception of GEPs and PHIs, after scalarization there should
6365 // only be one copy of the instruction generated in the loop. This is
6366 // because the VF is either 1, or any instructions that need scalarizing
6367 // have already been dealt with by the time we get here. As a result,
6368 // it means we don't have to multiply the instruction cost by VF.
6369 assert(I->getOpcode() == Instruction::GetElementPtr ||
6370 I->getOpcode() == Instruction::PHI ||
6371 (I->getOpcode() == Instruction::BitCast &&
6372 I->getType()->isPointerTy()) ||
6373 hasSingleCopyAfterVectorization(I, VF));
6374 VectorTy = RetTy;
6375 } else
6376 VectorTy = ToVectorTy(Scalar: RetTy, EC: VF);
6377
6378 if (VF.isVector() && VectorTy->isVectorTy() &&
6379 !TTI.getNumberOfParts(Tp: VectorTy))
6380 return InstructionCost::getInvalid();
6381
6382 // TODO: We need to estimate the cost of intrinsic calls.
6383 switch (I->getOpcode()) {
6384 case Instruction::GetElementPtr:
6385 // We mark this instruction as zero-cost because the cost of GEPs in
6386 // vectorized code depends on whether the corresponding memory instruction
6387 // is scalarized or not. Therefore, we handle GEPs with the memory
6388 // instruction cost.
6389 return 0;
6390 case Instruction::Br: {
6391 // In cases of scalarized and predicated instructions, there will be VF
6392 // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6394 // Note that the conditional branch from the loop latch will be replaced by
6395 // a single branch controlling the loop, so there is no extra overhead from
6396 // scalarization.
6397 bool ScalarPredicatedBB = false;
6398 BranchInst *BI = cast<BranchInst>(Val: I);
6399 if (VF.isVector() && BI->isConditional() &&
6400 (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
6401 PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
6402 BI->getParent() != TheLoop->getLoopLatch())
6403 ScalarPredicatedBB = true;
6404
6405 if (ScalarPredicatedBB) {
      // Not possible to scalarize a scalable vector with predicated instructions.
6407 if (VF.isScalable())
6408 return InstructionCost::getInvalid();
6409 // Return cost for branches around scalarized and predicated blocks.
6410 auto *Vec_i1Ty =
6411 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
6412 return (
6413 TTI.getScalarizationOverhead(
6414 Ty: Vec_i1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
6415 /*Insert*/ false, /*Extract*/ true, CostKind) +
6416 (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
6417 } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6418 // The back-edge branch will remain, as will all scalar branches.
6419 return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
6420 else
6421 // This branch will be eliminated by if-conversion.
6422 return 0;
6423 // Note: We currently assume zero cost for an unconditional branch inside
6424 // a predicated block since it will become a fall-through, although we
6425 // may decide in the future to call TTI for all branches.
6426 }
6427 case Instruction::PHI: {
6428 auto *Phi = cast<PHINode>(Val: I);
6429
6430 // First-order recurrences are replaced by vector shuffles inside the loop.
6431 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
6432 // For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the
6433 // penultimate value of the recurrence.
6434 // TODO: Consider vscale_range info.
6435 if (VF.isScalable() && VF.getKnownMinValue() == 1)
6436 return InstructionCost::getInvalid();
6437 SmallVector<int> Mask(VF.getKnownMinValue());
6438 std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
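      // E.g. with a fixed VF of 4 the splice mask is {3, 4, 5, 6}: the last
      // lane of the previous vector value followed by the first three lanes
      // of the current one, which is how the recurrence's value from the
      // previous iteration is materialized.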
6439 return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
6440 Tp: cast<VectorType>(Val: VectorTy), Mask, CostKind,
6441 Index: VF.getKnownMinValue() - 1);
6442 }
6443
6444 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6445 // converted into select instructions. We require N - 1 selects per phi
6446 // node, where N is the number of incoming values.
6447 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6448 return (Phi->getNumIncomingValues() - 1) *
6449 TTI.getCmpSelInstrCost(
6450 Opcode: Instruction::Select, ValTy: ToVectorTy(Scalar: Phi->getType(), EC: VF),
6451 CondTy: ToVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
6452 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
6453
6454 return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
6455 }
6456 case Instruction::UDiv:
6457 case Instruction::SDiv:
6458 case Instruction::URem:
6459 case Instruction::SRem:
6460 if (VF.isVector() && isPredicatedInst(I)) {
6461 const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
6462 return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
6463 ScalarCost : SafeDivisorCost;
6464 }
6465 // We've proven all lanes safe to speculate, fall through.
6466 [[fallthrough]];
6467 case Instruction::Add:
6468 case Instruction::FAdd:
6469 case Instruction::Sub:
6470 case Instruction::FSub:
6471 case Instruction::Mul:
6472 case Instruction::FMul:
6473 case Instruction::FDiv:
6474 case Instruction::FRem:
6475 case Instruction::Shl:
6476 case Instruction::LShr:
6477 case Instruction::AShr:
6478 case Instruction::And:
6479 case Instruction::Or:
6480 case Instruction::Xor: {
6481 // If we're speculating on the stride being 1, the multiplication may
6482 // fold away. We can generalize this for all operations using the notion
6483 // of neutral elements. (TODO)
6484 if (I->getOpcode() == Instruction::Mul &&
6485 (PSE.getSCEV(V: I->getOperand(i: 0))->isOne() ||
6486 PSE.getSCEV(V: I->getOperand(i: 1))->isOne()))
6487 return 0;
6488
6489 // Detect reduction patterns
6490 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
6491 return *RedCost;
6492
6493 // Certain instructions can be cheaper to vectorize if they have a constant
6494 // second vector operand. One example of this are shifts on x86.
6495 Value *Op2 = I->getOperand(i: 1);
6496 auto Op2Info = TTI.getOperandInfo(V: Op2);
6497 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
6498 Legal->isInvariant(V: Op2))
6499 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
6500
6501 SmallVector<const Value *, 4> Operands(I->operand_values());
6502 return TTI.getArithmeticInstrCost(
6503 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6504 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6505 Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
6506 }
6507 case Instruction::FNeg: {
6508 return TTI.getArithmeticInstrCost(
6509 Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
6510 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6511 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
6512 Args: I->getOperand(i: 0), CxtI: I);
6513 }
6514 case Instruction::Select: {
6515 SelectInst *SI = cast<SelectInst>(Val: I);
6516 const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
6517 bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));
6518
6519 const Value *Op0, *Op1;
6520 using namespace llvm::PatternMatch;
6521 if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
6522 match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
6523 // select x, y, false --> x & y
6524 // select x, true, y --> x | y
6525 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
6526 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
6527 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
6528 Op1->getType()->getScalarSizeInBits() == 1);
6529
6530 SmallVector<const Value *, 2> Operands{Op0, Op1};
6531 return TTI.getArithmeticInstrCost(
6532 Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy,
6533 CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I);
6534 }
6535
6536 Type *CondTy = SI->getCondition()->getType();
6537 if (!ScalarCond)
6538 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
6539
6540 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
6541 if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
6542 Pred = Cmp->getPredicate();
6543 return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
6544 CostKind, I);
6545 }
6546 case Instruction::ICmp:
6547 case Instruction::FCmp: {
6548 Type *ValTy = I->getOperand(i: 0)->getType();
6549 Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
6550 if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
6551 ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
6552 VectorTy = ToVectorTy(Scalar: ValTy, EC: VF);
6553 return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: nullptr,
6554 VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
6555 I);
6556 }
6557 case Instruction::Store:
6558 case Instruction::Load: {
6559 ElementCount Width = VF;
6560 if (Width.isVector()) {
6561 InstWidening Decision = getWideningDecision(I, VF: Width);
6562 assert(Decision != CM_Unknown &&
6563 "CM decision should be taken at this point");
6564 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
6565 return InstructionCost::getInvalid();
6566 if (Decision == CM_Scalarize)
6567 Width = ElementCount::getFixed(MinVal: 1);
6568 }
6569 VectorTy = ToVectorTy(Scalar: getLoadStoreType(I), EC: Width);
6570 return getMemoryInstructionCost(I, VF);
6571 }
6572 case Instruction::BitCast:
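// A bitcast between pointer types lowers to no code, so it is free.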
6573 if (I->getType()->isPointerTy())
6574 return 0;
6575 [[fallthrough]];
6576 case Instruction::ZExt:
6577 case Instruction::SExt:
6578 case Instruction::FPToUI:
6579 case Instruction::FPToSI:
6580 case Instruction::FPExt:
6581 case Instruction::PtrToInt:
6582 case Instruction::IntToPtr:
6583 case Instruction::SIToFP:
6584 case Instruction::UIToFP:
6585 case Instruction::Trunc:
6586 case Instruction::FPTrunc: {
6587 // Computes the CastContextHint from a Load/Store instruction.
6588 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
6589 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
6590 "Expected a load or a store!");
6591
6592 if (VF.isScalar() || !TheLoop->contains(Inst: I))
6593 return TTI::CastContextHint::Normal;
6594
6595 switch (getWideningDecision(I, VF)) {
6596 case LoopVectorizationCostModel::CM_GatherScatter:
6597 return TTI::CastContextHint::GatherScatter;
6598 case LoopVectorizationCostModel::CM_Interleave:
6599 return TTI::CastContextHint::Interleave;
6600 case LoopVectorizationCostModel::CM_Scalarize:
6601 case LoopVectorizationCostModel::CM_Widen:
6602 return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
6603 : TTI::CastContextHint::Normal;
6604 case LoopVectorizationCostModel::CM_Widen_Reverse:
6605 return TTI::CastContextHint::Reversed;
6606 case LoopVectorizationCostModel::CM_Unknown:
6607 llvm_unreachable("Instr did not go through cost modelling?");
6608 case LoopVectorizationCostModel::CM_VectorCall:
6609 case LoopVectorizationCostModel::CM_IntrinsicCall:
6610 llvm_unreachable_internal(msg: "Instr has invalid widening decision");
6611 }
6612
6613 llvm_unreachable("Unhandled case!");
6614 };
6615
6616 unsigned Opcode = I->getOpcode();
6617 TTI::CastContextHint CCH = TTI::CastContextHint::None;
6618 // For Trunc/FPTrunc, take the context from the single user, if it is a store.
6619 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6620 if (I->hasOneUse())
6621 if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
6622 CCH = ComputeCCH(Store);
6623 }
6624 // For ZExt/SExt/FPExt, take the context from the operand, if it is a load.
6625 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6626 Opcode == Instruction::FPExt) {
6627 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
6628 CCH = ComputeCCH(Load);
6629 }
6630
6631 // We optimize the truncation of induction variables having constant
6632 // integer steps. The cost of these truncations is the same as the scalar
6633 // operation.
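// For example, 'trunc i64 %iv to i32' for an induction with a constant step
// can be generated as a separate i32 induction, so the truncate is only
// charged at its scalar cost here.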
6634 if (isOptimizableIVTruncate(I, VF)) {
6635 auto *Trunc = cast<TruncInst>(Val: I);
6636 return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
6637 Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
6638 }
6639
6640 // Detect reduction patterns
6641 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy, CostKind))
6642 return *RedCost;
6643
6644 Type *SrcScalarTy = I->getOperand(i: 0)->getType();
6645 Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
6646 if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
6647 SrcScalarTy =
6648 IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
6649 Type *SrcVecTy =
6650 VectorTy->isVectorTy() ? ToVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;
6651
6652 if (canTruncateToMinimalBitwidth(I, VF)) {
6653 // If the result type is <= the source type, there will be no extend
6654 // after truncating the users to the minimal required bitwidth.
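// For example, a 'zext i8 %x to i32' whose users only require 8 bits
// disappears entirely once those users are truncated, so the extend is
// modelled as free.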
6655 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
6656 (I->getOpcode() == Instruction::ZExt ||
6657 I->getOpcode() == Instruction::SExt))
6658 return 0;
6659 }
6660
6661 return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
6662 }
6663 case Instruction::Call:
6664 return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
6665 case Instruction::ExtractValue:
6666 return TTI.getInstructionCost(U: I, CostKind: TTI::TCK_RecipThroughput);
6667 case Instruction::Alloca:
6668 // We cannot easily widen alloca to a scalable alloca, as
6669 // the result would need to be a vector of pointers.
6670 if (VF.isScalable())
6671 return InstructionCost::getInvalid();
6672 [[fallthrough]];
6673 default:
6674 // This opcode is unknown. Assume that it is the same as 'mul'.
6675 return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
6676 } // end of switch.
6677}
6678
6679void LoopVectorizationCostModel::collectValuesToIgnore() {
6680 // Ignore ephemeral values.
6681 CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);
6682
6683 SmallVector<Value *, 4> DeadInterleavePointerOps;
6684 for (BasicBlock *BB : TheLoop->blocks())
6685 for (Instruction &I : *BB) {
6686 // Find all stores to invariant variables. Since they are going to sink
6687 // outside the loop we do not need to calculate their cost.
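// For example, 'store i32 %sum.next, ptr %p' where %p is the invariant
// address of a reduction is emitted once after the loop rather than per
// iteration.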
6688 StoreInst *SI;
6689 if ((SI = dyn_cast<StoreInst>(Val: &I)) &&
6690 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand()))
6691 ValuesToIgnore.insert(Ptr: &I);
6692
6693 // For interleave groups, we only create a pointer for the start of the
6694 // interleave group. Queue up addresses of group members except the insert
6695 // position for further processing.
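// For example, for loads of A[2*i] and A[2*i+1] forming one group, only the
// insert position keeps its address computation; the other member's address
// becomes dead, and is queued here so it and anything feeding only it can be
// treated as free.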
6696 if (isAccessInterleaved(Instr: &I)) {
6697 auto *Group = getInterleavedAccessGroup(Instr: &I);
6698 if (Group->getInsertPos() == &I)
6699 continue;
6700 Value *PointerOp = getLoadStorePointerOperand(V: &I);
6701 DeadInterleavePointerOps.push_back(Elt: PointerOp);
6702 }
6703 }
6704
6705 // Mark ops feeding interleave group members as free, if they are only used
6706 // by other dead computations.
6707 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
6708 auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
6709 if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
6710 Instruction *UI = cast<Instruction>(Val: U);
6711 return !VecValuesToIgnore.contains(Ptr: U) &&
6712 (!isAccessInterleaved(Instr: UI) ||
6713 getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
6714 }))
6715 continue;
6716 VecValuesToIgnore.insert(Ptr: Op);
6717 DeadInterleavePointerOps.append(in_start: Op->op_begin(), in_end: Op->op_end());
6718 }
6719
6720 // Ignore type-promoting instructions we identified during reduction
6721 // detection.
6722 for (const auto &Reduction : Legal->getReductionVars()) {
6723 const RecurrenceDescriptor &RedDes = Reduction.second;
6724 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6725 VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end());
6726 }
6727 // Ignore type-casting instructions we identified during induction
6728 // detection.
6729 for (const auto &Induction : Legal->getInductionVars()) {
6730 const InductionDescriptor &IndDes = Induction.second;
6731 const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6732 VecValuesToIgnore.insert(I: Casts.begin(), E: Casts.end());
6733 }
6734}
6735
6736void LoopVectorizationCostModel::collectInLoopReductions() {
6737 for (const auto &Reduction : Legal->getReductionVars()) {
6738 PHINode *Phi = Reduction.first;
6739 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6740
6741 // We don't collect reductions that are type promoted (yet).
6742 if (RdxDesc.getRecurrenceType() != Phi->getType())
6743 continue;
6744
6745 // If the target would prefer this reduction to happen "in-loop", then we
6746 // want to record it as such.
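// An in-loop reduction accumulates into a scalar phi on every iteration (e.g.
// via a vector.reduce intrinsic in the loop body), instead of keeping a wide
// vector accumulator that is only reduced after the vector loop.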
6747 unsigned Opcode = RdxDesc.getOpcode();
6748 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6749 !TTI.preferInLoopReduction(Opcode, Ty: Phi->getType(),
6750 Flags: TargetTransformInfo::ReductionFlags()))
6751 continue;
6752
6753 // Check that we can correctly put the reductions into the loop, by
6754 // finding the chain of operations that leads from the phi to the loop
6755 // exit value.
6756 SmallVector<Instruction *, 4> ReductionOperations =
6757 RdxDesc.getReductionOpChain(Phi, L: TheLoop);
6758 bool InLoop = !ReductionOperations.empty();
6759
6760 if (InLoop) {
6761 InLoopReductions.insert(Ptr: Phi);
6762 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6763 Instruction *LastChain = Phi;
6764 for (auto *I : ReductionOperations) {
6765 InLoopReductionImmediateChains[I] = LastChain;
6766 LastChain = I;
6767 }
6768 }
6769 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6770 << " reduction for phi: " << *Phi << "\n");
6771 }
6772}
6773
6774VPValue *VPBuilder::createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
6775 DebugLoc DL, const Twine &Name) {
6776 assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE &&
6777 Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate");
6778 return tryInsertInstruction(
6779 VPI: new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
6780}
6781
6782// This function will select a scalable VF if the target supports scalable
6783// vectors and a fixed one otherwise.
6784// TODO: we could return a pair of values that specify the max VF and
6785// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6786// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6787// doesn't have a cost model that can choose which plan to execute if
6788// more than one is generated.
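// For example, with 128-bit fixed-width registers and a widest element type
// of 32 bits this returns a VF of 4; with scalable vectors it returns
// 'vscale x N' based on the known minimum register size.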
6789static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6790 LoopVectorizationCostModel &CM) {
6791 unsigned WidestType;
6792 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6793
6794 TargetTransformInfo::RegisterKind RegKind =
6795 TTI.enableScalableVectorization()
6796 ? TargetTransformInfo::RGK_ScalableVector
6797 : TargetTransformInfo::RGK_FixedWidthVector;
6798
6799 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6800 unsigned N = RegSize.getKnownMinValue() / WidestType;
6801 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6802}
6803
6804VectorizationFactor
6805LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
6806 ElementCount VF = UserVF;
6807 // Outer loop handling: They may require CFG and instruction level
6808 // transformations before even evaluating whether vectorization is profitable.
6809 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6810 // the vectorization pipeline.
6811 if (!OrigLoop->isInnermost()) {
6812 // If the user doesn't provide a vectorization factor, determine a
6813 // reasonable one.
6814 if (UserVF.isZero()) {
6815 VF = determineVPlanVF(TTI, CM);
6816 LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6817
6818 // Make sure we have a VF > 1 for stress testing.
6819 if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
6820 LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6821 << "overriding computed VF.\n");
6822 VF = ElementCount::getFixed(MinVal: 4);
6823 }
6824 } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
6825 !ForceTargetSupportsScalableVectors) {
6826 LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
6827 << "not supported by the target.\n");
6828 reportVectorizationFailure(
6829 DebugMsg: "Scalable vectorization requested but not supported by the target",
6830 OREMsg: "the scalable user-specified vectorization width for outer-loop "
6831 "vectorization cannot be used because the target does not support "
6832 "scalable vectors.",
6833 ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
6834 return VectorizationFactor::Disabled();
6835 }
6836 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6837 assert(isPowerOf2_32(VF.getKnownMinValue()) &&
6838 "VF needs to be a power of two");
6839 LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
6840 << "VF " << VF << " to build VPlans.\n");
6841 buildVPlans(MinVF: VF, MaxVF: VF);
6842
6843 // For VPlan build stress testing, we bail out after VPlan construction.
6844 if (VPlanBuildStressTest)
6845 return VectorizationFactor::Disabled();
6846
6847 return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
6848 }
6849
6850 LLVM_DEBUG(
6851 dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6852 "VPlan-native path.\n");
6853 return VectorizationFactor::Disabled();
6854}
6855
6856std::optional<VectorizationFactor>
6857LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6858 assert(OrigLoop->isInnermost() && "Inner loop expected.");
6859 CM.collectValuesToIgnore();
6860 CM.collectElementTypesForWidening();
6861
6862 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
6863 if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
6864 return std::nullopt;
6865
6866 // Invalidate interleave groups if all blocks of loop will be predicated.
6867 if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
6868 !useMaskedInterleavedAccesses(TTI)) {
6869 LLVM_DEBUG(
6870 dbgs()
6871 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6872 "which requires masked-interleaved support.\n");
6873 if (CM.InterleaveInfo.invalidateGroups())
6874 // Invalidating interleave groups also requires invalidating all decisions
6875 // based on them, which includes widening decisions and uniform and scalar
6876 // values.
6877 CM.invalidateCostModelingDecisions();
6878 }
6879
6880 if (CM.foldTailByMasking())
6881 Legal->prepareToFoldTailByMasking();
6882
6883 ElementCount MaxUserVF =
6884 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
6885 bool UserVFIsLegal = ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF);
6886 if (!UserVF.isZero() && UserVFIsLegal) {
6887 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
6888 "VF needs to be a power of two");
6889 // Collect the instructions (and their associated costs) that will be more
6890 // profitable to scalarize.
6891 CM.collectInLoopReductions();
6892 if (CM.selectUserVectorizationFactor(UserVF)) {
6893 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6894 buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
6895 if (!hasPlanWithVF(VF: UserVF)) {
6896 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
6897 << ".\n");
6898 return std::nullopt;
6899 }
6900
6901 LLVM_DEBUG(printPlans(dbgs()));
6902 return {{UserVF, 0, 0}};
6903 } else
6904 reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
6905 ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
6906 }
6907
6908 // Collect the Vectorization Factor Candidates.
6909 SmallVector<ElementCount> VFCandidates;
6910 for (auto VF = ElementCount::getFixed(MinVal: 1);
6911 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
6912 VFCandidates.push_back(Elt: VF);
6913 for (auto VF = ElementCount::getScalable(MinVal: 1);
6914 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
6915 VFCandidates.push_back(Elt: VF);
6916
6917 CM.collectInLoopReductions();
6918 for (const auto &VF : VFCandidates) {
6919 // Collect Uniform and Scalar instructions after vectorization with VF.
6920 CM.collectUniformsAndScalars(VF);
6921
6922 // Collect the instructions (and their associated costs) that will be more
6923 // profitable to scalarize.
6924 if (VF.isVector())
6925 CM.collectInstsToScalarize(VF);
6926 }
6927
6928 buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
6929 buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);
6930
6931 LLVM_DEBUG(printPlans(dbgs()));
6932 if (VPlans.empty())
6933 return std::nullopt;
6934 if (all_of(Range&: VPlans,
6935 P: [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }))
6936 return VectorizationFactor::Disabled();
6937
6938 // Select the optimal vectorization factor according to the legacy cost-model.
6939 // This is now only used to verify the decisions by the new VPlan-based
6940 // cost-model and will be retired once the VPlan-based cost-model is
6941 // stabilized.
6942 VectorizationFactor VF = selectVectorizationFactor();
6943 assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
        "when vectorizing, the scalar cost must be non-zero.");
6944 if (!hasPlanWithVF(VF: VF.Width)) {
6945 LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width
6946 << ".\n");
6947 return std::nullopt;
6948 }
6949 return VF;
6950}
6951
6952InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6953 ElementCount VF) const {
6954 return CM.getInstructionCost(I: UI, VF);
6955}
6956
6957bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6958 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6959 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6960 SkipCostComputation.contains(Ptr: UI);
6961}
6962
6963InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
6964 ElementCount VF) const {
6965 InstructionCost Cost = 0;
6966 LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
6967 VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
6968
6969 // Cost modeling for inductions is inaccurate in the legacy cost model
6970 // compared to the generated recipes. To match it here initially, during VPlan
6971 // cost-model bring-up, directly use the induction costs from the legacy
6972 // cost model. Note that we do this as pre-processing; the VPlan may not have
6973 // any recipes associated with the original induction increment instruction
6974 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
6975 // the cost of induction phis and increments (both those represented by
6976 // recipes and those that are not), to avoid distinguishing between them here,
6977 // and skip all recipes that represent induction phis and increments (the
6978 // former case) later on, if they exist, to avoid counting them twice.
6979 // Similarly we pre-compute the cost of any optimized truncates.
6980 // TODO: Switch to more accurate costing based on VPlan.
6981 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
6982 Instruction *IVInc = cast<Instruction>(
6983 Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
6984 SmallVector<Instruction *> IVInsts = {IV, IVInc};
6985 for (User *U : IV->users()) {
6986 auto *CI = cast<Instruction>(Val: U);
6987 if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
6988 continue;
6989 IVInsts.push_back(Elt: CI);
6990 }
6991 for (Instruction *IVInst : IVInsts) {
6992 if (!CostCtx.SkipCostComputation.insert(Ptr: IVInst).second)
6993 continue;
6994 InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
6995 LLVM_DEBUG({
6996 dbgs() << "Cost of " << InductionCost << " for VF " << VF
6997 << ": induction instruction " << *IVInst << "\n";
6998 });
6999 Cost += InductionCost;
7000 }
7001 }
7002
7003 /// Compute the cost of all exiting conditions of the loop using the legacy
7004 /// cost model. This is to match the legacy behavior, which adds the cost of
7005 /// all exit conditions. Note that this over-estimates the cost, as there will
7006 /// be a single condition to control the vector loop.
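/// For example, if two exiting blocks each branch on an 'icmp' of the
/// induction variable, both compares (and anything feeding only them) are
/// charged here, even though the vector loop will have a single latch
/// condition.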
7007 SmallVector<BasicBlock *> Exiting;
7008 CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
7009 SetVector<Instruction *> ExitInstrs;
7010 // Collect all exit conditions.
7011 for (BasicBlock *EB : Exiting) {
7012 auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator());
7013 if (!Term)
7014 continue;
7015 if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
7016 ExitInstrs.insert(X: CondI);
7017 }
7018 }
7019 // Compute the cost of all instructions only feeding the exit conditions.
7020 for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
7021 Instruction *CondI = ExitInstrs[I];
7022 if (!OrigLoop->contains(Inst: CondI) ||
7023 !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
7024 continue;
7025 Cost += CostCtx.getLegacyCost(UI: CondI, VF);
7026 for (Value *Op : CondI->operands()) {
7027 auto *OpI = dyn_cast<Instruction>(Val: Op);
7028 if (!OpI || any_of(Range: OpI->users(), P: [&ExitInstrs, this](User *U) {
7029 return OrigLoop->contains(BB: cast<Instruction>(Val: U)->getParent()) &&
7030 !ExitInstrs.contains(key: cast<Instruction>(Val: U));
7031 }))
7032 continue;
7033 ExitInstrs.insert(X: OpI);
7034 }
7035 }
7036
7037 // The legacy cost model has special logic to compute the cost of in-loop
7038 // reductions, which may be smaller than the sum of all instructions involved
7039 // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
7040 // which the legacy cost model uses to assign cost. Pre-compute their costs
7041 // for now.
7042 // TODO: Switch to costing based on VPlan once the logic has been ported.
7043 for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
7044 if (!CM.isInLoopReduction(Phi: RedPhi) &&
7045 !RecurrenceDescriptor::isAnyOfRecurrenceKind(
7046 Kind: RdxDesc.getRecurrenceKind()))
7047 continue;
7048
7049 // AnyOf reduction codegen may remove the select. To match the legacy cost
7050 // model, pre-compute the cost for AnyOf reductions here.
7051 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
7052 Kind: RdxDesc.getRecurrenceKind())) {
7053 auto *Select = cast<SelectInst>(Val: *find_if(
7054 Range: RedPhi->users(), P: [](User *U) { return isa<SelectInst>(Val: U); }));
7055 assert(!CostCtx.SkipCostComputation.contains(Select) &&
7056 "reduction op visited multiple times");
7057 CostCtx.SkipCostComputation.insert(Ptr: Select);
7058 auto ReductionCost = CostCtx.getLegacyCost(UI: Select, VF);
7059 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7060 << ":\n any-of reduction " << *Select << "\n");
7061 Cost += ReductionCost;
7062 continue;
7063 }
7064
7065 const auto &ChainOps = RdxDesc.getReductionOpChain(Phi: RedPhi, L: OrigLoop);
7066 SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
7067 ChainOps.end());
7068 // Also include the operands of instructions in the chain, as the cost-model
7069 // may mark extends as free.
7070 for (auto *ChainOp : ChainOps) {
7071 for (Value *Op : ChainOp->operands()) {
7072 if (auto *I = dyn_cast<Instruction>(Val: Op))
7073 ChainOpsAndOperands.insert(X: I);
7074 }
7075 }
7076
7077 // Pre-compute the cost for I, if it has a reduction pattern cost.
7078 for (Instruction *I : ChainOpsAndOperands) {
7079 auto ReductionCost = CM.getReductionPatternCost(
7080 I, VF, Ty: ToVectorTy(Scalar: I->getType(), EC: VF), CostKind: TTI::TCK_RecipThroughput);
7081 if (!ReductionCost)
7082 continue;
7083
7084 assert(!CostCtx.SkipCostComputation.contains(I) &&
7085 "reduction op visited multiple times");
7086 CostCtx.SkipCostComputation.insert(Ptr: I);
7087 LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
7088 << ":\n in-loop reduction " << *I << "\n");
7089 Cost += *ReductionCost;
7090 }
7091 }
7092
7093 // Pre-compute the costs for branches except for the backedge, as the number
7094 // of replicate regions in a VPlan may not directly match the number of
7095 // branches, which would lead to different decisions.
7096 // TODO: Compute cost of branches for each replicate region in the VPlan,
7097 // which is more accurate than the legacy cost model.
7098 for (BasicBlock *BB : OrigLoop->blocks()) {
7099 if (BB == OrigLoop->getLoopLatch())
7100 continue;
7101 CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
7102 auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
7103 Cost += BranchCost;
7104 }
7105 // Now compute and add the VPlan-based cost.
7106 Cost += Plan.cost(VF, Ctx&: CostCtx);
7107 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
7108 return Cost;
7109}
7110
7111VPlan &LoopVectorizationPlanner::getBestPlan() const {
7112 // If there is a single VPlan with a single VF, return it directly.
7113 VPlan &FirstPlan = *VPlans[0];
7114 if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
7115 return FirstPlan;
7116
7117 VPlan *BestPlan = &FirstPlan;
7118 ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
7119 assert(hasPlanWithVF(ScalarVF) &&
7120 "More than a single plan/VF w/o any plan having scalar VF");
7121
7122 // TODO: Compute scalar cost using VPlan-based cost model.
7123 InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
7124 VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);
7125
7126 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
7127 if (ForceVectorization) {
7128 // Ignore scalar width, because the user explicitly wants vectorization.
7129 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
7130 // evaluation.
7131 BestFactor.Cost = InstructionCost::getMax();
7132 }
7133
7134 for (auto &P : VPlans) {
7135 for (ElementCount VF : P->vectorFactors()) {
7136 if (VF.isScalar())
7137 continue;
7138 if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
7139 LLVM_DEBUG(
7140 dbgs()
7141 << "LV: Not considering vector loop of width " << VF
7142 << " because it will not generate any vector instructions.\n");
7143 continue;
7144 }
7145
7146 InstructionCost Cost = cost(Plan&: *P, VF);
7147 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
7148 if (isMoreProfitable(A: CurrentFactor, B: BestFactor)) {
7149 BestFactor = CurrentFactor;
7150 BestPlan = &*P;
7151 }
7152 }
7153 }
7154 BestPlan->setVF(BestFactor.Width);
7155 return *BestPlan;
7156}
7157
7158VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7159 assert(count_if(VPlans,
7160 [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7161 1 &&
7162 "Best VF has not a single VPlan.");
7163
7164 for (const VPlanPtr &Plan : VPlans) {
7165 if (Plan->hasVF(VF))
7166 return *Plan.get();
7167 }
7168 llvm_unreachable("No plan found!");
7169}
7170
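// Attach 'llvm.loop.unroll.runtime.disable' to \p L's loop ID metadata,
// preserving any existing operands, unless unroll-disable metadata is already
// present.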
7171static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7172 SmallVector<Metadata *, 4> MDs;
7173 // Reserve first location for self reference to the LoopID metadata node.
7174 MDs.push_back(Elt: nullptr);
7175 bool IsUnrollMetadata = false;
7176 MDNode *LoopID = L->getLoopID();
7177 if (LoopID) {
7178 // First find existing loop unrolling disable metadata.
7179 for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7180 auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I: i));
7181 if (MD) {
7182 const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
7183 IsUnrollMetadata =
7184 S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
7185 }
7186 MDs.push_back(Elt: LoopID->getOperand(I: i));
7187 }
7188 }
7189
7190 if (!IsUnrollMetadata) {
7191 // Add runtime unroll disable metadata.
7192 LLVMContext &Context = L->getHeader()->getContext();
7193 SmallVector<Metadata *, 1> DisableOperands;
7194 DisableOperands.push_back(
7195 Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
7196 MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
7197 MDs.push_back(Elt: DisableNode);
7198 MDNode *NewLoopID = MDNode::get(Context, MDs);
7199 // Set operand 0 to refer to the loop id itself.
7200 NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
7201 L->setLoopID(NewLoopID);
7202 }
7203}
7204
7205 // Check if \p RedResult is a ComputeReductionResult instruction, and if it
7206 // is, create a merge phi node for it and add it to \p ReductionResumeValues.
7207static void createAndCollectMergePhiForReduction(
7208 VPInstruction *RedResult,
7209 DenseMap<const RecurrenceDescriptor *, Value *> &ReductionResumeValues,
7210 VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock,
7211 bool VectorizingEpilogue) {
7212 if (!RedResult ||
7213 RedResult->getOpcode() != VPInstruction::ComputeReductionResult)
7214 return;
7215
7216 auto *PhiR = cast<VPReductionPHIRecipe>(Val: RedResult->getOperand(N: 0));
7217 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
7218
7219 Value *FinalValue =
7220 State.get(Def: RedResult, Instance: VPIteration(State.UF - 1, VPLane::getFirstLane()));
7221 auto *ResumePhi =
7222 dyn_cast<PHINode>(Val: PhiR->getStartValue()->getUnderlyingValue());
7223 if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
7224 Kind: RdxDesc.getRecurrenceKind())) {
7225 auto *Cmp = cast<ICmpInst>(Val: PhiR->getStartValue()->getUnderlyingValue());
7226 assert(Cmp->getPredicate() == CmpInst::ICMP_NE);
7227 assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue());
7228 ResumePhi = cast<PHINode>(Val: Cmp->getOperand(i_nocapture: 0));
7229 }
7230 assert((!VectorizingEpilogue || ResumePhi) &&
7231 "when vectorizing the epilogue loop, we need a resume phi from main "
7232 "vector loop");
7233
7234 // TODO: bc.merge.rdx should not be created here, instead it should be
7235 // modeled in VPlan.
7236 BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader();
7237 // Create a phi node that merges control-flow from the backedge-taken check
7238 // block and the middle block.
7239 auto *BCBlockPhi =
7240 PHINode::Create(Ty: FinalValue->getType(), NumReservedValues: 2, NameStr: "bc.merge.rdx",
7241 InsertBefore: LoopScalarPreHeader->getTerminator()->getIterator());
7242
7243 // If we are fixing reductions in the epilogue loop then we should already
7244 // have created a bc.merge.rdx Phi after the main vector body. Ensure that
7245 // we carry over the incoming values correctly.
7246 for (auto *Incoming : predecessors(BB: LoopScalarPreHeader)) {
7247 if (Incoming == LoopMiddleBlock)
7248 BCBlockPhi->addIncoming(V: FinalValue, BB: Incoming);
7249 else if (ResumePhi && is_contained(Range: ResumePhi->blocks(), Element: Incoming))
7250 BCBlockPhi->addIncoming(V: ResumePhi->getIncomingValueForBlock(BB: Incoming),
7251 BB: Incoming);
7252 else
7253 BCBlockPhi->addIncoming(V: RdxDesc.getRecurrenceStartValue(), BB: Incoming);
7254 }
7255
7256 auto *OrigPhi = cast<PHINode>(Val: PhiR->getUnderlyingValue());
7257 // TODO: This fixup should instead be modeled in VPlan.
7258 // Fix the scalar loop reduction variable with the incoming reduction sum
7259 // from the vector body and from the backedge value.
7260 int IncomingEdgeBlockIdx =
7261 OrigPhi->getBasicBlockIndex(BB: OrigLoop->getLoopLatch());
7262 assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
7263 // Pick the other block.
7264 int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
7265 OrigPhi->setIncomingValue(i: SelfEdgeBlockIdx, V: BCBlockPhi);
7266 Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
7267 OrigPhi->setIncomingValue(i: IncomingEdgeBlockIdx, V: LoopExitInst);
7268
7269 ReductionResumeValues[&RdxDesc] = BCBlockPhi;
7270}
7271
7272std::pair<DenseMap<const SCEV *, Value *>,
7273 DenseMap<const RecurrenceDescriptor *, Value *>>
7274LoopVectorizationPlanner::executePlan(
7275 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7276 InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
7277 const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7278 assert(BestVPlan.hasVF(BestVF) &&
7279 "Trying to execute plan with unsupported VF");
7280 assert(BestVPlan.hasUF(BestUF) &&
7281 "Trying to execute plan with unsupported UF");
7282 assert(
7283 (IsEpilogueVectorization || !ExpandedSCEVs) &&
7284 "expanded SCEVs to reuse can only be used during epilogue vectorization");
7285 (void)IsEpilogueVectorization;
7286
7287 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7288
7289 LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7290 << ", UF=" << BestUF << '\n');
7291 BestVPlan.setName("Final VPlan");
7292 LLVM_DEBUG(BestVPlan.dump());
7293
7294 // Perform the actual loop transformation.
7295 VPTransformState State(BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
7296 OrigLoop->getHeader()->getContext());
7297
7298 // 0. Generate SCEV-dependent code into the preheader, including TripCount,
7299 // before making any changes to the CFG.
7300 if (!BestVPlan.getPreheader()->empty()) {
7301 State.CFG.PrevBB = OrigLoop->getLoopPreheader();
7302 State.Builder.SetInsertPoint(OrigLoop->getLoopPreheader()->getTerminator());
7303 BestVPlan.getPreheader()->execute(State: &State);
7304 }
7305 if (!ILV.getTripCount())
7306 ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Instance: {0, 0}));
7307 else
7308 assert(IsEpilogueVectorization && "should only re-use the existing trip "
7309 "count during epilogue vectorization");
7310
7311 // 1. Set up the skeleton for vectorization, including vector pre-header and
7312 // middle block. The vector loop is created during VPlan execution.
7313 Value *CanonicalIVStartValue;
7314 std::tie(args&: State.CFG.PrevBB, args&: CanonicalIVStartValue) =
7315 ILV.createVectorizedLoopSkeleton(ExpandedSCEVs: ExpandedSCEVs ? *ExpandedSCEVs
7316 : State.ExpandedSCEVs);
7317#ifdef EXPENSIVE_CHECKS
7318 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7319#endif
7320
7321 // Only use noalias metadata when using memory checks guaranteeing no overlap
7322 // across all iterations.
7323 const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7324 std::unique_ptr<LoopVersioning> LVer = nullptr;
7325 if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7326 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7327
7328 // We currently don't use LoopVersioning for the actual loop cloning but we
7329 // still use it to add the noalias metadata.
7330 // TODO: Find a better way to re-use LoopVersioning functionality to add
7331 // metadata.
7332 LVer = std::make_unique<LoopVersioning>(
7333 args: *LAI, args: LAI->getRuntimePointerChecking()->getChecks(), args&: OrigLoop, args&: LI, args&: DT,
7334 args: PSE.getSE());
7335 State.LVer = &*LVer;
7336 State.LVer->prepareNoAliasMetadata();
7337 }
7338
7339 ILV.printDebugTracesAtStart();
7340
7341 //===------------------------------------------------===//
7342 //
7343 // Notice: any optimization or new instruction that goes
7344 // into the code below should also be implemented in
7345 // the cost-model.
7346 //
7347 //===------------------------------------------------===//
7348
7349 // 2. Copy and widen instructions from the old loop into the new loop.
7350 BestVPlan.prepareToExecute(TripCount: ILV.getTripCount(),
7351 VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: nullptr),
7352 CanonicalIVStartValue, State);
7353
7354 BestVPlan.execute(State: &State);
7355
7356 // 2.5 Collect reduction resume values.
7357 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
7358 auto *ExitVPBB =
7359 cast<VPBasicBlock>(Val: BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
7360 for (VPRecipeBase &R : *ExitVPBB) {
7361 createAndCollectMergePhiForReduction(
7362 RedResult: dyn_cast<VPInstruction>(Val: &R), ReductionResumeValues, State, OrigLoop,
7363 LoopMiddleBlock: State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue: ExpandedSCEVs);
7364 }
7365
7366 // 2.6. Maintain Loop Hints
7367 // Keep all loop hints from the original loop on the vector loop (we'll
7368 // replace the vectorizer-specific hints below).
7369 MDNode *OrigLoopID = OrigLoop->getLoopID();
7370
7371 std::optional<MDNode *> VectorizedLoopID =
7372 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
7373 LLVMLoopVectorizeFollowupVectorized});
7374
7375 VPBasicBlock *HeaderVPBB =
7376 BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7377 Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]);
7378 if (VectorizedLoopID)
7379 L->setLoopID(*VectorizedLoopID);
7380 else {
7381 // Keep all loop hints from the original loop on the vector loop (we'll
7382 // replace the vectorizer-specific hints below).
7383 if (MDNode *LID = OrigLoop->getLoopID())
7384 L->setLoopID(LID);
7385
7386 LoopVectorizeHints Hints(L, true, *ORE);
7387 Hints.setAlreadyVectorized();
7388 }
7389 TargetTransformInfo::UnrollingPreferences UP;
7390 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7391 if (!UP.UnrollVectorizedLoop || CanonicalIVStartValue)
7392 AddRuntimeUnrollDisableMetaData(L);
7393
7394 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7395 // predication, updating analyses.
7396 ILV.fixVectorizedLoop(State, Plan&: BestVPlan);
7397
7398 ILV.printDebugTracesAtEnd();
7399
7400 // 4. Adjust branch weight of the branch in the middle block.
7401 auto *MiddleTerm =
7402 cast<BranchInst>(Val: State.CFG.VPBB2IRBB[ExitVPBB]->getTerminator());
7403 if (MiddleTerm->isConditional() &&
7404 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) {
7405 // Assume that `Count % VectorTripCount` is equally distributed.
7406 unsigned TripCount = State.UF * State.VF.getKnownMinValue();
7407 assert(TripCount > 0 && "trip count should not be zero");
7408 const uint32_t Weights[] = {1, TripCount - 1};
7409 setBranchWeights(I&: *MiddleTerm, Weights, /*IsExpected=*/false);
7410 }
7411
7412 return {State.ExpandedSCEVs, ReductionResumeValues};
7413}
7414
7415#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7416void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7417 for (const auto &Plan : VPlans)
7418 if (PrintVPlansInDotFormat)
7419 Plan->printDOT(O);
7420 else
7421 Plan->print(O);
7422}
7423#endif
7424
7425//===--------------------------------------------------------------------===//
7426// EpilogueVectorizerMainLoop
7427//===--------------------------------------------------------------------===//
7428
7429/// This function is partially responsible for generating the control flow
7430/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7431std::pair<BasicBlock *, Value *>
7432EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7433 const SCEV2ValueTy &ExpandedSCEVs) {
7434 createVectorLoopSkeleton(Prefix: "");
7435
7436 // Generate the code to check the minimum iteration count of the vector
7437 // epilogue (see below).
7438 EPI.EpilogueIterationCountCheck =
7439 emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true);
7440 EPI.EpilogueIterationCountCheck->setName("iter.check");
7441
7442 // Generate the code to check any assumptions that we've made for SCEV
7443 // expressions.
7444 EPI.SCEVSafetyCheck = emitSCEVChecks(Bypass: LoopScalarPreHeader);
7445
7446 // Generate the code that checks at runtime if arrays overlap. We put the
7447 // checks into a separate block to make the more common case of few elements
7448 // faster.
7449 EPI.MemSafetyCheck = emitMemRuntimeChecks(Bypass: LoopScalarPreHeader);
7450
7451 // Generate the iteration count check for the main loop, *after* the check
7452 // for the epilogue loop, so that the path-length is shorter for the case
7453 // that goes directly through the vector epilogue. The longer-path length for
7454 // the main loop is compensated for, by the gain from vectorizing the larger
7455 // trip count. Note: the branch will get updated later on when we vectorize
7456 // the epilogue.
7457 EPI.MainLoopIterationCountCheck =
7458 emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false);
7459
7460 // Generate the induction variable.
7461 EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);
7462
7463 // Skip induction resume value creation here because they will be created in
7464 // the second pass for the scalar loop. The induction resume values for the
7465 // inductions in the epilogue loop are created before executing the plan for
7466 // the epilogue loop.
7467
7468 return {LoopVectorPreHeader, nullptr};
7469}
7470
7471void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7472 LLVM_DEBUG({
7473 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7474 << "Main Loop VF:" << EPI.MainLoopVF
7475 << ", Main Loop UF:" << EPI.MainLoopUF
7476 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7477 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7478 });
7479}
7480
7481void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7482 DEBUG_WITH_TYPE(VerboseDebug, {
7483 dbgs() << "intermediate fn:\n"
7484 << *OrigLoop->getHeader()->getParent() << "\n";
7485 });
7486}
7487
7488BasicBlock *
7489EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7490 bool ForEpilogue) {
7491 assert(Bypass && "Expected valid bypass basic block.");
7492 ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7493 unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7494 Value *Count = getTripCount();
7495 // Reuse existing vector loop preheader for TC checks.
7496 // Note that new preheader block is generated for vector loop.
7497 BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7498 IRBuilder<> Builder(TCCheckBlock->getTerminator());
7499
7500 // Generate code to check if the loop's trip count is less than VF * UF of the
7501 // main vector loop.
7502 auto P = Cost->requiresScalarEpilogue(IsVectorizing: ForEpilogue ? EPI.EpilogueVF.isVector()
7503 : VF.isVector())
7504 ? ICmpInst::ICMP_ULE
7505 : ICmpInst::ICMP_ULT;
7506
7507 Value *CheckMinIters = Builder.CreateICmp(
7508 P, LHS: Count, RHS: createStepForVF(B&: Builder, Ty: Count->getType(), VF: VFactor, Step: UFactor),
7509 Name: "min.iters.check");
7510
7511 if (!ForEpilogue)
7512 TCCheckBlock->setName("vector.main.loop.iter.check");
7513
7514 // Create new preheader for vector loop.
7515 LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
7516 DT, LI, MSSAU: nullptr, BBName: "vector.ph");
7517
7518 if (ForEpilogue) {
7519 assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7520 DT->getNode(Bypass)->getIDom()) &&
7521 "TC check is expected to dominate Bypass");
7522
7523 // Update dominator for Bypass.
7524 DT->changeImmediateDominator(BB: Bypass, NewBB: TCCheckBlock);
7525 LoopBypassBlocks.push_back(Elt: TCCheckBlock);
7526
7527 // Save the trip count so we don't have to regenerate it in the
7528 // vec.epilog.iter.check. This is safe to do because the trip count
7529 // generated here dominates the vector epilog iter check.
7530 EPI.TripCount = Count;
7531 }
7532
7533 BranchInst &BI =
7534 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
7535 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
7536 setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
7537 ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);
7538
7539 return TCCheckBlock;
7540}
7541
7542//===--------------------------------------------------------------------===//
7543// EpilogueVectorizerEpilogueLoop
7544//===--------------------------------------------------------------------===//
7545
7546/// This function is partially responsible for generating the control flow
7547/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7548std::pair<BasicBlock *, Value *>
7549EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7550 const SCEV2ValueTy &ExpandedSCEVs) {
7551 createVectorLoopSkeleton(Prefix: "vec.epilog.");
7552
7553 // Now, compare the remaining count and, if there aren't enough iterations to
7554 // execute the vectorized epilogue, skip to the scalar part.
7555 LoopVectorPreHeader->setName("vec.epilog.ph");
7556 BasicBlock *VecEpilogueIterationCountCheck =
7557 SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->begin(), DT, LI,
7558 MSSAU: nullptr, BBName: "vec.epilog.iter.check", Before: true);
7559 emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader,
7560 Insert: VecEpilogueIterationCountCheck);
7561
7562 // Adjust the control flow taking the state info from the main loop
7563 // vectorization into account.
7564 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7565 "expected this to be saved from the previous pass.");
7566 EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7567 From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader);
7568
7569 DT->changeImmediateDominator(BB: LoopVectorPreHeader,
7570 NewBB: EPI.MainLoopIterationCountCheck);
7571
7572 EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7573 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7574
7575 if (EPI.SCEVSafetyCheck)
7576 EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7577 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7578 if (EPI.MemSafetyCheck)
7579 EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7580 From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
7581
7582 DT->changeImmediateDominator(
7583 BB: VecEpilogueIterationCountCheck,
7584 NewBB: VecEpilogueIterationCountCheck->getSinglePredecessor());
7585
7586 DT->changeImmediateDominator(BB: LoopScalarPreHeader,
7587 NewBB: EPI.EpilogueIterationCountCheck);
7588 if (!Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()))
7589 // If there is an epilogue which must run, there's no edge from the
7590 // middle block to exit blocks and thus no need to update the immediate
7591 // dominator of the exit blocks.
7592 DT->changeImmediateDominator(BB: LoopExitBlock,
7593 NewBB: EPI.EpilogueIterationCountCheck);
7594
7595 // Keep track of bypass blocks, as they feed start values to the induction and
7596 // reduction phis in the scalar loop preheader.
7597 if (EPI.SCEVSafetyCheck)
7598 LoopBypassBlocks.push_back(Elt: EPI.SCEVSafetyCheck);
7599 if (EPI.MemSafetyCheck)
7600 LoopBypassBlocks.push_back(Elt: EPI.MemSafetyCheck);
7601 LoopBypassBlocks.push_back(Elt: EPI.EpilogueIterationCountCheck);
7602
7603 // The vec.epilog.iter.check block may contain Phi nodes from inductions or
7604 // reductions which merge control-flow from the latch block and the middle
7605 // block. Update the incoming values here and move the Phi into the preheader.
7606 SmallVector<PHINode *, 4> PhisInBlock;
7607 for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7608 PhisInBlock.push_back(Elt: &Phi);
7609
7610 for (PHINode *Phi : PhisInBlock) {
7611 Phi->moveBefore(MovePos: LoopVectorPreHeader->getFirstNonPHI());
7612 Phi->replaceIncomingBlockWith(
7613 Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
7614 New: VecEpilogueIterationCountCheck);
7615
7616 // If the phi doesn't have an incoming value from the
7617 // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
7618 // value and also those from other check blocks. This is needed for
7619 // reduction phis only.
7620 if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
7621 return EPI.EpilogueIterationCountCheck == IncB;
7622 }))
7623 continue;
7624 Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
7625 if (EPI.SCEVSafetyCheck)
7626 Phi->removeIncomingValue(BB: EPI.SCEVSafetyCheck);
7627 if (EPI.MemSafetyCheck)
7628 Phi->removeIncomingValue(BB: EPI.MemSafetyCheck);
7629 }
7630
7631 // Generate a resume induction for the vector epilogue and put it in the
7632 // vector epilogue preheader.
7633 Type *IdxTy = Legal->getWidestInductionType();
7634 PHINode *EPResumeVal = PHINode::Create(Ty: IdxTy, NumReservedValues: 2, NameStr: "vec.epilog.resume.val");
7635 EPResumeVal->insertBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt());
7636 EPResumeVal->addIncoming(V: EPI.VectorTripCount, BB: VecEpilogueIterationCountCheck);
7637 EPResumeVal->addIncoming(V: ConstantInt::get(Ty: IdxTy, V: 0),
7638 BB: EPI.MainLoopIterationCountCheck);
7639
7640 // Generate induction resume values. These variables save the new starting
7641 // indexes for the scalar loop. They are used to test if there are any tail
7642 // iterations left once the vector loop has completed.
7643 // Note that when the vectorized epilogue is skipped due to iteration count
7644 // check, then the resume value for the induction variable comes from
7645 // the trip count of the main vector loop, hence passing the AdditionalBypass
7646 // argument.
7647 createInductionResumeValues(ExpandedSCEVs,
7648 AdditionalBypass: {VecEpilogueIterationCountCheck,
7649 EPI.VectorTripCount} /* AdditionalBypass */);
7650
7651 return {LoopVectorPreHeader, EPResumeVal};
7652}
7653
7654BasicBlock *
7655EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7656 BasicBlock *Bypass, BasicBlock *Insert) {
7657
7658 assert(EPI.TripCount &&
7659 "Expected trip count to have been safed in the first pass.");
7660 assert(
7661 (!isa<Instruction>(EPI.TripCount) ||
7662 DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7663 "saved trip count does not dominate insertion point.");
7664 Value *TC = EPI.TripCount;
7665 IRBuilder<> Builder(Insert->getTerminator());
7666 Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining");
7667
7668 // Generate code to check if the loop's trip count is less than VF * UF of the
7669 // vector epilogue loop.
7670 auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector())
7671 ? ICmpInst::ICMP_ULE
7672 : ICmpInst::ICMP_ULT;
7673
7674 Value *CheckMinIters =
7675 Builder.CreateICmp(P, LHS: Count,
7676 RHS: createStepForVF(B&: Builder, Ty: Count->getType(),
7677 VF: EPI.EpilogueVF, Step: EPI.EpilogueUF),
7678 Name: "min.epilog.iters.check");
7679
7680 BranchInst &BI =
7681 *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
7682 if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) {
7683 unsigned MainLoopStep = UF * VF.getKnownMinValue();
7684 unsigned EpilogueLoopStep =
7685 EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
7686 // We assume the remaining `Count` is equally distributed in
7687 // [0, MainLoopStep)
7688 // So the probability for `Count < EpilogueLoopStep` should be
7689 // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
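// For example, MainLoopStep = 8 and EpilogueLoopStep = 4 gives weights
// {4, 4}, i.e. an estimated 50% chance that the epilogue vector loop is
// skipped.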
7690 unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep);
7691 const uint32_t Weights[] = {EstimatedSkipCount,
7692 MainLoopStep - EstimatedSkipCount};
7693 setBranchWeights(I&: BI, Weights, /*IsExpected=*/false);
7694 }
7695 ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI);
7696 LoopBypassBlocks.push_back(Elt: Insert);
7697 return Insert;
7698}
7699
7700void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7701 LLVM_DEBUG({
7702 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7703 << "Epilogue Loop VF:" << EPI.EpilogueVF
7704 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7705 });
7706}
7707
7708void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7709 DEBUG_WITH_TYPE(VerboseDebug, {
7710 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7711 });
7712}
7713
7714bool LoopVectorizationPlanner::getDecisionAndClampRange(
7715 const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7716 assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7717 bool PredicateAtRangeStart = Predicate(Range.Start);
7718
7719 for (ElementCount TmpVF : VFRange(Range.Start * 2, Range.End))
7720 if (Predicate(TmpVF) != PredicateAtRangeStart) {
7721 Range.End = TmpVF;
7722 break;
7723 }
7724
7725 return PredicateAtRangeStart;
7726}
7727
7728/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7729/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7730/// of VF's starting at a given VF and extending it as much as possible. Each
7731/// vectorization decision can potentially shorten this sub-range during
7732/// buildVPlan().
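/// For example, for MinVF = 1 and MaxVF = 8 this might build one VPlan
/// covering {1, 2} and another covering {4, 8}, if some decision changes at
/// VF = 4.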
7733void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7734 ElementCount MaxVF) {
7735 auto MaxVFTimes2 = MaxVF * 2;
7736 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
7737 VFRange SubRange = {VF, MaxVFTimes2};
7738 VPlans.push_back(Elt: buildVPlan(Range&: SubRange));
7739 VF = SubRange.End;
7740 }
7741}
7742
7743iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
7744VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
7745 std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
7746 if (auto *I = dyn_cast<Instruction>(Val: Op)) {
7747 if (auto *R = Ingredient2Recipe.lookup(Val: I))
7748 return R->getVPSingleValue();
7749 }
7750 return Plan.getOrAddLiveIn(V: Op);
7751 };
7752 return map_range(C&: Operands, F: Fn);
7753}
7754
7755VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
7756 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7757
7758 // Look for cached value.
7759 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7760 EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Val: Edge);
7761 if (ECEntryIt != EdgeMaskCache.end())
7762 return ECEntryIt->second;
7763
7764 VPValue *SrcMask = getBlockInMask(BB: Src);
7765
7766 // The terminator has to be a branch inst!
7767 BranchInst *BI = dyn_cast<BranchInst>(Val: Src->getTerminator());
7768 assert(BI && "Unexpected terminator found");
7769
7770 if (!BI->isConditional() || BI->getSuccessor(i: 0) == BI->getSuccessor(i: 1))
7771 return EdgeMaskCache[Edge] = SrcMask;
7772
7773 // If source is an exiting block, we know the exit edge is dynamically dead
7774 // in the vector loop, and thus we don't need to restrict the mask. Avoid
7775 // adding uses of an otherwise potentially dead instruction.
7776 if (OrigLoop->isLoopExiting(BB: Src))
7777 return EdgeMaskCache[Edge] = SrcMask;
7778
7779 VPValue *EdgeMask = getVPValueOrAddLiveIn(V: BI->getCondition(), Plan);
7780 assert(EdgeMask && "No Edge Mask found for condition");
7781
7782 if (BI->getSuccessor(i: 0) != Dst)
7783 EdgeMask = Builder.createNot(Operand: EdgeMask, DL: BI->getDebugLoc());
7784
7785 if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7786 // The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask
7787 // is false and EdgeMask is poison. Avoid that by using 'LogicalAnd'
7788 // instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7789 EdgeMask = Builder.createLogicalAnd(LHS: SrcMask, RHS: EdgeMask, DL: BI->getDebugLoc());
7790 }
7791
7792 return EdgeMaskCache[Edge] = EdgeMask;
7793}
7794
7795VPValue *VPRecipeBuilder::getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const {
7796 assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7797
7798 // Look for cached value.
7799 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7800 EdgeMaskCacheTy::const_iterator ECEntryIt = EdgeMaskCache.find(Val: Edge);
7801 assert(ECEntryIt != EdgeMaskCache.end() &&
7802 "looking up mask for edge which has not been created");
7803 return ECEntryIt->second;
7804}
7805
7806void VPRecipeBuilder::createHeaderMask() {
7807 BasicBlock *Header = OrigLoop->getHeader();
7808
7809 // When not folding the tail, use nullptr to model all-true mask.
7810 if (!CM.foldTailByMasking()) {
7811 BlockMaskCache[Header] = nullptr;
7812 return;
7813 }
7814
7815 // Introduce the early-exit compare IV <= BTC to form header block mask.
7816 // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
7817 // constructing the desired canonical IV in the header block as its first
7818 // non-phi instructions.
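// For example, with a trip count of 10 and VF 4, BTC is 9 and the mask for
// lanes {8, 9, 10, 11} of the last vector iteration is <1, 1, 0, 0>.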
7819
7820 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
7821 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
7822 auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
7823 HeaderVPBB->insert(Recipe: IV, InsertPt: NewInsertionPoint);
7824
7825 VPBuilder::InsertPointGuard Guard(Builder);
7826 Builder.setInsertPoint(TheBB: HeaderVPBB, IP: NewInsertionPoint);
7827 VPValue *BlockMask = nullptr;
7828 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
7829 BlockMask = Builder.createICmp(Pred: CmpInst::ICMP_ULE, A: IV, B: BTC);
7830 BlockMaskCache[Header] = BlockMask;
7831}
7832
7833VPValue *VPRecipeBuilder::getBlockInMask(BasicBlock *BB) const {
7834 // Return the cached value.
7835 BlockMaskCacheTy::const_iterator BCEntryIt = BlockMaskCache.find(Val: BB);
7836 assert(BCEntryIt != BlockMaskCache.end() &&
7837 "Trying to access mask for block without one.");
7838 return BCEntryIt->second;
7839}
7840
7841void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) {
7842 assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
7843 assert(BlockMaskCache.count(BB) == 0 && "Mask for block already computed");
7844 assert(OrigLoop->getHeader() != BB &&
7845 "Loop header must have cached block mask");
7846
7847 // All-one mask is modelled as no-mask following the convention for masked
7848 // load/store/gather/scatter. Initialize BlockMask to no-mask.
7849 VPValue *BlockMask = nullptr;
7850 // This is the block mask. We OR all incoming edges.
7851 for (auto *Predecessor : predecessors(BB)) {
7852 VPValue *EdgeMask = createEdgeMask(Src: Predecessor, Dst: BB);
7853 if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
7854 BlockMaskCache[BB] = EdgeMask;
7855 return;
7856 }
7857
7858 if (!BlockMask) { // BlockMask has its initialized nullptr value.
7859 BlockMask = EdgeMask;
7860 continue;
7861 }
7862
7863 BlockMask = Builder.createOr(LHS: BlockMask, RHS: EdgeMask, DL: {});
7864 }
7865
7866 BlockMaskCache[BB] = BlockMask;
7867}
7868
7869VPWidenMemoryRecipe *
7870VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
7871 VFRange &Range) {
7872 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7873 "Must be called with either a load or store");
7874
7875 auto willWiden = [&](ElementCount VF) -> bool {
7876 LoopVectorizationCostModel::InstWidening Decision =
7877 CM.getWideningDecision(I, VF);
7878 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
7879 "CM decision should be taken at this point.");
7880 if (Decision == LoopVectorizationCostModel::CM_Interleave)
7881 return true;
7882 if (CM.isScalarAfterVectorization(I, VF) ||
7883 CM.isProfitableToScalarize(I, VF))
7884 return false;
7885 return Decision != LoopVectorizationCostModel::CM_Scalarize;
7886 };
7887
7888 if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: willWiden, Range))
7889 return nullptr;
7890
7891 VPValue *Mask = nullptr;
7892 if (Legal->isMaskRequired(I))
7893 Mask = getBlockInMask(BB: I->getParent());
7894
7895 // Determine if the pointer operand of the access is either consecutive or
7896 // reverse consecutive.
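  // For example, a load of A[i] with i increasing by 1 each iteration is
  // consecutive, while a load of A[N - i] is reverse consecutive; both can be
  // widened into a single wide memory access (the latter plus a reverse).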
7897 LoopVectorizationCostModel::InstWidening Decision =
7898 CM.getWideningDecision(I, VF: Range.Start);
7899 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
7900 bool Consecutive =
7901 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
7902
7903 VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1];
7904 if (Consecutive) {
7905 auto *GEP = dyn_cast<GetElementPtrInst>(
7906 Val: Ptr->getUnderlyingValue()->stripPointerCasts());
7907 auto *VectorPtr = new VPVectorPointerRecipe(
7908 Ptr, getLoadStoreType(I), Reverse, GEP ? GEP->isInBounds() : false,
7909 I->getDebugLoc());
7910 Builder.getInsertBlock()->appendRecipe(Recipe: VectorPtr);
7911 Ptr = VectorPtr;
7912 }
7913 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I))
7914 return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
7915 I->getDebugLoc());
7916
7917 StoreInst *Store = cast<StoreInst>(Val: I);
7918 return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
7919 Reverse, I->getDebugLoc());
7920}
7921
7922 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
7923/// insert a recipe to expand the step for the induction recipe.
7924static VPWidenIntOrFpInductionRecipe *
7925createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
7926 VPValue *Start, const InductionDescriptor &IndDesc,
7927 VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
7928 assert(IndDesc.getStartValue() ==
7929 Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
7930 assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
7931 "step must be loop invariant");
7932
7933 VPValue *Step =
7934 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE);
7935 if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) {
7936 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI);
7937 }
7938 assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
7939 return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc);
7940}
7941
7942VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
7943 PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {
7944
7945 // Check if this is an integer or fp induction. If so, build the recipe that
7946 // produces its scalar and vector values.
7947 if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
7948 return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan,
7949 SE&: *PSE.getSE(), OrigLoop&: *OrigLoop);
7950
7951 // Check if this is pointer induction. If so, build the recipe for it.
7952 if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
7953 VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(),
7954 SE&: *PSE.getSE());
7955 return new VPWidenPointerInductionRecipe(
7956 Phi, Operands[0], Step, *II,
7957 LoopVectorizationPlanner::getDecisionAndClampRange(
7958 Predicate: [&](ElementCount VF) {
7959 return CM.isScalarAfterVectorization(I: Phi, VF);
7960 },
7961 Range));
7962 }
7963 return nullptr;
7964}
7965
7966VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
7967 TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
7968 // Optimize the special case where the source is a constant integer
7969 // induction variable. Notice that we can only optimize the 'trunc' case
7970 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
7971 // (c) other casts depend on pointer size.
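  // For example, an i32 trunc of an i64 induction {0,+,1} can itself be widened
  // as an i32 induction {0,+,1}, avoiding a wide i64 IV followed by truncates.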
7972
7973 // Determine whether \p K is a truncation based on an induction variable that
7974 // can be optimized.
7975 auto isOptimizableIVTruncate =
7976 [&](Instruction *K) -> std::function<bool(ElementCount)> {
7977 return [=](ElementCount VF) -> bool {
7978 return CM.isOptimizableIVTruncate(I: K, VF);
7979 };
7980 };
7981
7982 if (LoopVectorizationPlanner::getDecisionAndClampRange(
7983 Predicate: isOptimizableIVTruncate(I), Range)) {
7984
7985 auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0));
7986 const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
7987 VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue());
7988 return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(),
7989 OrigLoop&: *OrigLoop);
7990 }
7991 return nullptr;
7992}
7993
7994VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
7995 ArrayRef<VPValue *> Operands) {
7996 unsigned NumIncoming = Phi->getNumIncomingValues();
7997
7998 // We know that all PHIs in non-header blocks are converted into selects, so
7999 // we don't have to worry about the insertion order and we can just use the
8000 // builder. At this point we generate the predication tree. There may be
8001 // duplications since this is a simple recursive scan, but future
8002 // optimizations will clean it up.
8003 // TODO: At the moment the first mask is always skipped, but it would be
8004 // better to skip the most expensive mask.
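  // For example, a phi
  //   %p = phi [ %a, %if.then ], [ %b, %if.else ]
  // becomes a blend of %a and %b, where %b is guarded by the mask of the
  // %if.else edge; the first incoming value needs no mask.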
8005 SmallVector<VPValue *, 2> OperandsWithMask;
8006
8007 for (unsigned In = 0; In < NumIncoming; In++) {
8008 OperandsWithMask.push_back(Elt: Operands[In]);
8009 VPValue *EdgeMask =
8010 getEdgeMask(Src: Phi->getIncomingBlock(i: In), Dst: Phi->getParent());
8011 if (!EdgeMask) {
8012 assert(In == 0 && "Both null and non-null edge masks found");
8013 assert(all_equal(Operands) &&
8014 "Distinct incoming values with one having a full mask");
8015 break;
8016 }
8017 if (In == 0)
8018 continue;
8019 OperandsWithMask.push_back(Elt: EdgeMask);
8020 }
8021 return new VPBlendRecipe(Phi, OperandsWithMask);
8022}
8023
8024VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8025 ArrayRef<VPValue *> Operands,
8026 VFRange &Range) {
8027 bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8028 Predicate: [this, CI](ElementCount VF) {
8029 return CM.isScalarWithPredication(I: CI, VF);
8030 },
8031 Range);
8032
8033 if (IsPredicated)
8034 return nullptr;
8035
8036 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8037 if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8038 ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8039 ID == Intrinsic::pseudoprobe ||
8040 ID == Intrinsic::experimental_noalias_scope_decl))
8041 return nullptr;
8042
8043 SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size()));
8044 Ops.push_back(Elt: Operands.back());
8045
8046   // Is it beneficial to perform an intrinsic call compared to a lib call?
8047 bool ShouldUseVectorIntrinsic =
8048 ID && LoopVectorizationPlanner::getDecisionAndClampRange(
8049 Predicate: [&](ElementCount VF) -> bool {
8050 return CM.getCallWideningDecision(CI, VF).Kind ==
8051 LoopVectorizationCostModel::CM_IntrinsicCall;
8052 },
8053 Range);
8054 if (ShouldUseVectorIntrinsic)
8055 return new VPWidenCallRecipe(CI, make_range(x: Ops.begin(), y: Ops.end()), ID,
8056 CI->getDebugLoc());
8057
8058 Function *Variant = nullptr;
8059 std::optional<unsigned> MaskPos;
8060   // Is it better to call a vectorized version of the function than to
8061   // scalarize the call?
8062 auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
8063 Predicate: [&](ElementCount VF) -> bool {
8064         // The following case may be scalarized depending on the VF.
8065         // The flag shows whether we can use a plain call for the vectorized
8066         // version of the instruction.
8067
8068 // If we've found a variant at a previous VF, then stop looking. A
8069 // vectorized variant of a function expects input in a certain shape
8070 // -- basically the number of input registers, the number of lanes
8071 // per register, and whether there's a mask required.
8072 // We store a pointer to the variant in the VPWidenCallRecipe, so
8073 // once we have an appropriate variant it's only valid for that VF.
8074 // This will force a different vplan to be generated for each VF that
8075 // finds a valid variant.
8076 if (Variant)
8077 return false;
8078 LoopVectorizationCostModel::CallWideningDecision Decision =
8079 CM.getCallWideningDecision(CI, VF);
8080 if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
8081 Variant = Decision.Variant;
8082 MaskPos = Decision.MaskPos;
8083 return true;
8084 }
8085
8086 return false;
8087 },
8088 Range);
8089 if (ShouldUseVectorCall) {
8090 if (MaskPos.has_value()) {
8091 // We have 2 cases that would require a mask:
8092 // 1) The block needs to be predicated, either due to a conditional
8093 // in the scalar loop or use of an active lane mask with
8094 // tail-folding, and we use the appropriate mask for the block.
8095 // 2) No mask is required for the block, but the only available
8096 // vector variant at this VF requires a mask, so we synthesize an
8097 // all-true mask.
8098 VPValue *Mask = nullptr;
8099 if (Legal->isMaskRequired(I: CI))
8100 Mask = getBlockInMask(BB: CI->getParent());
8101 else
8102 Mask = Plan.getOrAddLiveIn(V: ConstantInt::getTrue(
8103 Ty: IntegerType::getInt1Ty(C&: Variant->getFunctionType()->getContext())));
8104
8105 Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
8106 }
8107
8108 return new VPWidenCallRecipe(CI, make_range(x: Ops.begin(), y: Ops.end()),
8109 Intrinsic::not_intrinsic, CI->getDebugLoc(),
8110 Variant);
8111 }
8112
8113 return nullptr;
8114}
8115
8116bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8117 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8118 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8119 // Instruction should be widened, unless it is scalar after vectorization,
8120 // scalarization is profitable or it is predicated.
8121 auto WillScalarize = [this, I](ElementCount VF) -> bool {
8122 return CM.isScalarAfterVectorization(I, VF) ||
8123 CM.isProfitableToScalarize(I, VF) ||
8124 CM.isScalarWithPredication(I, VF);
8125 };
8126 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
8127 Range);
8128}
8129
8130VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8131 ArrayRef<VPValue *> Operands,
8132 VPBasicBlock *VPBB) {
8133 switch (I->getOpcode()) {
8134 default:
8135 return nullptr;
8136 case Instruction::SDiv:
8137 case Instruction::UDiv:
8138 case Instruction::SRem:
8139 case Instruction::URem: {
8140 // If not provably safe, use a select to form a safe divisor before widening the
8141 // div/rem operation itself. Otherwise fall through to general handling below.
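    // For example, for a predicated 'udiv %x, %y' this emits
    //   %safe.rhs = select <block mask>, %y, 1
    // and widens 'udiv %x, %safe.rhs', so lanes that are masked off never
    // divide by a potentially zero (or poison) %y.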
8142 if (CM.isPredicatedInst(I)) {
8143 SmallVector<VPValue *> Ops(Operands.begin(), Operands.end());
8144 VPValue *Mask = getBlockInMask(BB: I->getParent());
8145 VPValue *One =
8146 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false));
8147 auto *SafeRHS = Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: I->getDebugLoc());
8148 Ops[1] = SafeRHS;
8149 return new VPWidenRecipe(*I, make_range(x: Ops.begin(), y: Ops.end()));
8150 }
8151 [[fallthrough]];
8152 }
8153 case Instruction::Add:
8154 case Instruction::And:
8155 case Instruction::AShr:
8156 case Instruction::FAdd:
8157 case Instruction::FCmp:
8158 case Instruction::FDiv:
8159 case Instruction::FMul:
8160 case Instruction::FNeg:
8161 case Instruction::FRem:
8162 case Instruction::FSub:
8163 case Instruction::ICmp:
8164 case Instruction::LShr:
8165 case Instruction::Mul:
8166 case Instruction::Or:
8167 case Instruction::Select:
8168 case Instruction::Shl:
8169 case Instruction::Sub:
8170 case Instruction::Xor:
8171 case Instruction::Freeze:
8172 return new VPWidenRecipe(*I, make_range(x: Operands.begin(), y: Operands.end()));
8173 };
8174}
8175
8176void VPRecipeBuilder::fixHeaderPhis() {
8177 BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8178 for (VPHeaderPHIRecipe *R : PhisToFix) {
8179 auto *PN = cast<PHINode>(Val: R->getUnderlyingValue());
8180 VPRecipeBase *IncR =
8181 getRecipe(I: cast<Instruction>(Val: PN->getIncomingValueForBlock(BB: OrigLatch)));
8182 R->addOperand(Operand: IncR->getVPSingleValue());
8183 }
8184}
8185
8186VPReplicateRecipe *VPRecipeBuilder::handleReplication(Instruction *I,
8187 VFRange &Range) {
8188 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8189 Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8190 Range);
8191
8192 bool IsPredicated = CM.isPredicatedInst(I);
8193
8194 // Even if the instruction is not marked as uniform, there are certain
8195 // intrinsic calls that can be effectively treated as such, so we check for
8196 // them here. Conservatively, we only do this for scalable vectors, since
8197 // for fixed-width VFs we can always fall back on full scalarization.
8198 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
8199 switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
8200 case Intrinsic::assume:
8201 case Intrinsic::lifetime_start:
8202 case Intrinsic::lifetime_end:
8203       // For scalable vectors, if one of the operands is variant then we still
8204       // want to mark the call as uniform, which will generate one instruction for
8205 // the first lane of the vector. We can't scalarize the call in the same
8206 // way as for fixed-width vectors because we don't know how many lanes
8207 // there are.
8208 //
8209 // The reasons for doing it this way for scalable vectors are:
8210 // 1. For the assume intrinsic generating the instruction for the first
8211       //    lane is still better than not generating any at all. For
8212 // example, the input may be a splat across all lanes.
8213 // 2. For the lifetime start/end intrinsics the pointer operand only
8214 // does anything useful when the input comes from a stack object,
8215 // which suggests it should always be uniform. For non-stack objects
8216 // the effect is to poison the object, which still allows us to
8217 // remove the call.
8218 IsUniform = true;
8219 break;
8220 default:
8221 break;
8222 }
8223 }
8224 VPValue *BlockInMask = nullptr;
8225 if (!IsPredicated) {
8226     // The instruction is not predicated; no mask operand is needed.
8227 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8228 } else {
8229 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8230 // Instructions marked for predication are replicated and a mask operand is
8231 // added initially. Masked replicate recipes will later be placed under an
8232 // if-then construct to prevent side-effects. Generate recipes to compute
8233 // the block mask for this region.
8234 BlockInMask = getBlockInMask(BB: I->getParent());
8235 }
8236
8237 // Note that there is some custom logic to mark some intrinsics as uniform
8238 // manually above for scalable vectors, which this assert needs to account for
8239 // as well.
8240 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8241 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
8242 "Should not predicate a uniform recipe");
8243 auto *Recipe = new VPReplicateRecipe(I, mapToVPValues(Operands: I->operands()),
8244 IsUniform, BlockInMask);
8245 return Recipe;
8246}
8247
8248VPRecipeBase *
8249VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8250 ArrayRef<VPValue *> Operands,
8251 VFRange &Range, VPBasicBlock *VPBB) {
8252 // First, check for specific widening recipes that deal with inductions, Phi
8253 // nodes, calls and memory operations.
8254 VPRecipeBase *Recipe;
8255 if (auto Phi = dyn_cast<PHINode>(Val: Instr)) {
8256 if (Phi->getParent() != OrigLoop->getHeader())
8257 return tryToBlend(Phi, Operands);
8258
8259 if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8260 return Recipe;
8261
8262 VPHeaderPHIRecipe *PhiRecipe = nullptr;
8263 assert((Legal->isReductionVariable(Phi) ||
8264 Legal->isFixedOrderRecurrence(Phi)) &&
8265 "can only widen reductions and fixed-order recurrences here");
8266 VPValue *StartV = Operands[0];
8267 if (Legal->isReductionVariable(PN: Phi)) {
8268 const RecurrenceDescriptor &RdxDesc =
8269 Legal->getReductionVars().find(Key: Phi)->second;
8270 assert(RdxDesc.getRecurrenceStartValue() ==
8271 Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8272 PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8273 CM.isInLoopReduction(Phi),
8274 CM.useOrderedReductions(RdxDesc));
8275 } else {
8276 // TODO: Currently fixed-order recurrences are modeled as chains of
8277 // first-order recurrences. If there are no users of the intermediate
8278 // recurrences in the chain, the fixed order recurrence should be modeled
8279 // directly, enabling more efficient codegen.
8280 PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8281 }
8282
8283 PhisToFix.push_back(Elt: PhiRecipe);
8284 return PhiRecipe;
8285 }
8286
8287 if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate(
8288 I: cast<TruncInst>(Val: Instr), Operands, Range)))
8289 return Recipe;
8290
8291 // All widen recipes below deal only with VF > 1.
8292 if (LoopVectorizationPlanner::getDecisionAndClampRange(
8293 Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
8294 return nullptr;
8295
8296 if (auto *CI = dyn_cast<CallInst>(Val: Instr))
8297 return tryToWidenCall(CI, Operands, Range);
8298
8299 if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr))
8300 return tryToWidenMemory(I: Instr, Operands, Range);
8301
8302 if (!shouldWiden(I: Instr, Range))
8303 return nullptr;
8304
8305 if (auto GEP = dyn_cast<GetElementPtrInst>(Val: Instr))
8306 return new VPWidenGEPRecipe(GEP,
8307 make_range(x: Operands.begin(), y: Operands.end()));
8308
8309 if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) {
8310 return new VPWidenSelectRecipe(
8311 *SI, make_range(x: Operands.begin(), y: Operands.end()));
8312 }
8313
8314 if (auto *CI = dyn_cast<CastInst>(Val: Instr)) {
8315 return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
8316 *CI);
8317 }
8318
8319 return tryToWiden(I: Instr, Operands, VPBB);
8320}
8321
8322void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8323 ElementCount MaxVF) {
8324 assert(OrigLoop->isInnermost() && "Inner loop expected.");
8325
8326 auto MaxVFTimes2 = MaxVF * 2;
8327 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
8328 VFRange SubRange = {VF, MaxVFTimes2};
8329 if (auto Plan = tryToBuildVPlanWithVPRecipes(Range&: SubRange)) {
8330 // Now optimize the initial VPlan.
8331 if (!Plan->hasVF(VF: ElementCount::getFixed(MinVal: 1)))
8332 VPlanTransforms::truncateToMinimalBitwidths(
8333 Plan&: *Plan, MinBWs: CM.getMinimalBitwidths(), Ctx&: PSE.getSE()->getContext());
8334 VPlanTransforms::optimize(Plan&: *Plan, SE&: *PSE.getSE());
8335 // TODO: try to put it close to addActiveLaneMask().
8336 // Discard the plan if it is not EVL-compatible
8337 if (CM.foldTailWithEVL() &&
8338 !VPlanTransforms::tryAddExplicitVectorLength(Plan&: *Plan))
8339 break;
8340 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8341 VPlans.push_back(Elt: std::move(Plan));
8342 }
8343 VF = SubRange.End;
8344 }
8345}
8346
8347 // Add the canonical IV and branch recipes required to control the loop.
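// The generated control skeleton is roughly:
//   vector.body:
//     %index = phi [ 0, %vector.ph ], [ %index.next, %vector.body ]
//     ...
//     %index.next = add %index, VF * UF
//     BranchOnCount %index.next, %vector.trip.count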
8349static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8350 DebugLoc DL) {
8351 Value *StartIdx = ConstantInt::get(Ty: IdxTy, V: 0);
8352 auto *StartV = Plan.getOrAddLiveIn(V: StartIdx);
8353
8354 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8355 auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8356 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8357 VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8358 Header->insert(Recipe: CanonicalIVPHI, InsertPt: Header->begin());
8359
8360 VPBuilder Builder(TopRegion->getExitingBasicBlock());
8361 // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8362 auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8363 Opcode: Instruction::Add, Operands: {CanonicalIVPHI, &Plan.getVFxUF()}, WrapFlags: {HasNUW, false}, DL,
8364 Name: "index.next");
8365 CanonicalIVPHI->addOperand(Operand: CanonicalIVIncrement);
8366
8367 // Add the BranchOnCount VPInstruction to the latch.
8368 Builder.createNaryOp(Opcode: VPInstruction::BranchOnCount,
8369 Operands: {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8370}
8371
8372// Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8373// original exit block.
8374static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB, Loop *OrigLoop,
8375 VPRecipeBuilder &Builder, VPlan &Plan) {
8376 BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8377 BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8378 // Only handle single-exit loops with unique exit blocks for now.
8379 if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8380 return;
8381
8382 // Introduce VPUsers modeling the exit values.
8383 for (PHINode &ExitPhi : ExitBB->phis()) {
8384 Value *IncomingValue =
8385 ExitPhi.getIncomingValueForBlock(BB: ExitingBB);
8386 VPValue *V = Builder.getVPValueOrAddLiveIn(V: IncomingValue, Plan);
8387 // Exit values for inductions are computed and updated outside of VPlan and
8388 // independent of induction recipes.
8389 // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
8390 // live-outs.
8391 if ((isa<VPWidenIntOrFpInductionRecipe>(Val: V) &&
8392 !cast<VPWidenIntOrFpInductionRecipe>(Val: V)->getTruncInst()) ||
8393 isa<VPWidenPointerInductionRecipe>(Val: V))
8394 continue;
8395 Plan.addLiveOut(PN: &ExitPhi, V);
8396 }
8397}
8398
8399/// Feed a resume value for every FOR from the vector loop to the scalar loop,
8400/// if middle block branches to scalar preheader, by introducing ExtractFromEnd
8401/// and ResumePhi recipes in each, respectively, and a VPLiveOut which uses the
8402/// latter and corresponds to the scalar header.
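/// For example, for a first-order recurrence with vector backedge value %vec,
/// the resume value fed to the scalar loop's recurrence phi is the last
/// element of %vec, extracted in the middle block.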
8403static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
8404 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
8405
8406   // Start by finding out if the middle block branches to the scalar preheader,
8407   // which is not a VPIRBasicBlock, unlike the exit block - the other possible
8408   // successor of the middle block.
8409 // TODO: Should be replaced by
8410 // Plan->getScalarLoopRegion()->getSinglePredecessor() in the future once the
8411 // scalar region is modeled as well.
8412 VPBasicBlock *ScalarPHVPBB = nullptr;
8413 auto *MiddleVPBB = cast<VPBasicBlock>(Val: VectorRegion->getSingleSuccessor());
8414 for (VPBlockBase *Succ : MiddleVPBB->getSuccessors()) {
8415 if (isa<VPIRBasicBlock>(Val: Succ))
8416 continue;
8417 assert(!ScalarPHVPBB && "Two candidates for ScalarPHVPBB?");
8418 ScalarPHVPBB = cast<VPBasicBlock>(Val: Succ);
8419 }
8420 if (!ScalarPHVPBB)
8421 return;
8422
8423 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
8424 VPBuilder MiddleBuilder(MiddleVPBB);
8425   // Reset the insert point so new recipes are inserted before the terminator
8426   // and, if present, before its condition.
8427 if (auto *Terminator = MiddleVPBB->getTerminator()) {
8428 auto *Condition = dyn_cast<VPInstruction>(Val: Terminator->getOperand(N: 0));
8429 assert((!Condition || Condition->getParent() == MiddleVPBB) &&
8430 "Condition expected in MiddleVPBB");
8431 MiddleBuilder.setInsertPoint(Condition ? Condition : Terminator);
8432 }
8433 VPValue *OneVPV = Plan.getOrAddLiveIn(
8434 V: ConstantInt::get(Ty: Plan.getCanonicalIV()->getScalarType(), V: 1));
8435
8436 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
8437 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
8438 if (!FOR)
8439 continue;
8440
8441 // Extract the resume value and create a new VPLiveOut for it.
8442 auto *Resume = MiddleBuilder.createNaryOp(Opcode: VPInstruction::ExtractFromEnd,
8443 Operands: {FOR->getBackedgeValue(), OneVPV},
8444 Inst: {}, Name: "vector.recur.extract");
8445 auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp(
8446 Opcode: VPInstruction::ResumePhi, Operands: {Resume, FOR->getStartValue()}, Inst: {},
8447 Name: "scalar.recur.init");
8448 Plan.addLiveOut(PN: cast<PHINode>(Val: FOR->getUnderlyingInstr()), V: ResumePhiRecipe);
8449 }
8450}
8451
8452VPlanPtr
8453LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8454
8455 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8456
8457 // ---------------------------------------------------------------------------
8458 // Build initial VPlan: Scan the body of the loop in a topological order to
8459 // visit each basic block after having visited its predecessor basic blocks.
8460 // ---------------------------------------------------------------------------
8461
8462 // Create initial VPlan skeleton, having a basic block for the pre-header
8463 // which contains SCEV expansions that need to happen before the CFG is
8464 // modified; a basic block for the vector pre-header, followed by a region for
8465 // the vector loop, followed by the middle basic block. The skeleton vector
8466 // loop region contains a header and latch basic blocks.
8467
8468 bool RequiresScalarEpilogueCheck =
8469 LoopVectorizationPlanner::getDecisionAndClampRange(
8470 Predicate: [this](ElementCount VF) {
8471 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
8472 },
8473 Range);
8474 VPlanPtr Plan = VPlan::createInitialVPlan(
8475 TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop),
8476 PSE&: *PSE.getSE(), RequiresScalarEpilogueCheck, TailFolded: CM.foldTailByMasking(),
8477 TheLoop: OrigLoop);
8478
8479   // Don't use getDecisionAndClampRange here, because we don't know the UF yet,
8480   // so it is better to be conservative here rather than split the range up
8481   // into different VPlans.
8482 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8483 bool IVUpdateMayOverflow = false;
8484 for (ElementCount VF : Range)
8485 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8486
8487 DebugLoc DL = getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction());
8488 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8489 // When not folding the tail, we know that the induction increment will not
8490 // overflow.
8491 bool HasNUW = Style == TailFoldingStyle::None;
8492 addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW, DL);
8493
8494 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
8495
8496 // ---------------------------------------------------------------------------
8497 // Pre-construction: record ingredients whose recipes we'll need to further
8498 // process after constructing the initial VPlan.
8499 // ---------------------------------------------------------------------------
8500
8501 // For each interleave group which is relevant for this (possibly trimmed)
8502 // Range, add it to the set of groups to be later applied to the VPlan and add
8503 // placeholders for its members' Recipes which we'll be replacing with a
8504 // single VPInterleaveRecipe.
8505 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8506 auto applyIG = [IG, this](ElementCount VF) -> bool {
8507 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8508 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8509 LoopVectorizationCostModel::CM_Interleave);
8510 // For scalable vectors, the only interleave factor currently supported
8511 // is 2 since we require the (de)interleave2 intrinsics instead of
8512 // shufflevectors.
8513 assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
8514 "Unsupported interleave factor for scalable vectors");
8515 return Result;
8516 };
8517 if (!getDecisionAndClampRange(Predicate: applyIG, Range))
8518 continue;
8519 InterleaveGroups.insert(Ptr: IG);
8520 };
8521
8522 // ---------------------------------------------------------------------------
8523 // Construct recipes for the instructions in the loop
8524 // ---------------------------------------------------------------------------
8525
8526 // Scan the body of the loop in a topological order to visit each basic block
8527 // after having visited its predecessor basic blocks.
8528 LoopBlocksDFS DFS(OrigLoop);
8529 DFS.perform(LI);
8530
8531 VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
8532 VPBasicBlock *VPBB = HeaderVPBB;
8533 BasicBlock *HeaderBB = OrigLoop->getHeader();
8534 bool NeedsMasks =
8535 CM.foldTailByMasking() ||
8536 any_of(Range: OrigLoop->blocks(), P: [this, HeaderBB](BasicBlock *BB) {
8537 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
8538 return Legal->blockNeedsPredication(BB) || NeedsBlends;
8539 });
8540 for (BasicBlock *BB : make_range(x: DFS.beginRPO(), y: DFS.endRPO())) {
8541 // Relevant instructions from basic block BB will be grouped into VPRecipe
8542 // ingredients and fill a new VPBasicBlock.
8543 if (VPBB != HeaderVPBB)
8544 VPBB->setName(BB->getName());
8545 Builder.setInsertPoint(VPBB);
8546
8547 if (VPBB == HeaderVPBB)
8548 RecipeBuilder.createHeaderMask();
8549 else if (NeedsMasks)
8550 RecipeBuilder.createBlockInMask(BB);
8551
8552 // Introduce each ingredient into VPlan.
8553 // TODO: Model and preserve debug intrinsics in VPlan.
8554 for (Instruction &I : drop_end(RangeOrContainer: BB->instructionsWithoutDebug(SkipPseudoOp: false))) {
8555 Instruction *Instr = &I;
8556 SmallVector<VPValue *, 4> Operands;
8557 auto *Phi = dyn_cast<PHINode>(Val: Instr);
8558 if (Phi && Phi->getParent() == HeaderBB) {
8559 Operands.push_back(Elt: Plan->getOrAddLiveIn(
8560 V: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopPreheader())));
8561 } else {
8562 auto OpRange = RecipeBuilder.mapToVPValues(Operands: Instr->operands());
8563 Operands = {OpRange.begin(), OpRange.end()};
8564 }
8565
8566       // Invariant stores inside the loop will be deleted and a single store
8567       // with the final reduction value will be added to the exit block.
8568 StoreInst *SI;
8569 if ((SI = dyn_cast<StoreInst>(Val: &I)) &&
8570 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand()))
8571 continue;
8572
8573 VPRecipeBase *Recipe =
8574 RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, Range, VPBB);
8575 if (!Recipe)
8576 Recipe = RecipeBuilder.handleReplication(I: Instr, Range);
8577
8578 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8579 if (isa<VPHeaderPHIRecipe>(Val: Recipe)) {
8580 // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In
8581 // the following cases, VPHeaderPHIRecipes may be created after non-phi
8582 // recipes and need to be moved to the phi section of HeaderVPBB:
8583 // * tail-folding (non-phi recipes computing the header mask are
8584 // introduced earlier than regular header phi recipes, and should appear
8585 // after them)
8586 // * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
8587
8588 assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
8589 CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
8590 "unexpected recipe needs moving");
8591 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8592 } else
8593 VPBB->appendRecipe(Recipe);
8594 }
8595
8596 VPBlockUtils::insertBlockAfter(NewBlock: new VPBasicBlock(), BlockPtr: VPBB);
8597 VPBB = cast<VPBasicBlock>(Val: VPBB->getSingleSuccessor());
8598 }
8599
8600 // After here, VPBB should not be used.
8601 VPBB = nullptr;
8602
8603   if (CM.requiresScalarEpilogue(Range)) {
8604     // No edge from the middle block to the unique exit block has been inserted
8605     // and there is nothing to fix from the vector loop; phis should have
8606     // incoming values from the scalar loop only.
8607 } else
8608 addUsersInExitBlock(HeaderVPBB, OrigLoop, Builder&: RecipeBuilder, Plan&: *Plan);
8609
8610 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8611 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8612 "entry block must be set to a VPRegionBlock having a non-empty entry "
8613 "VPBasicBlock");
8614 RecipeBuilder.fixHeaderPhis();
8615
8616 addLiveOutsForFirstOrderRecurrences(Plan&: *Plan);
8617
8618 // ---------------------------------------------------------------------------
8619 // Transform initial VPlan: Apply previously taken decisions, in order, to
8620 // bring the VPlan to its final state.
8621 // ---------------------------------------------------------------------------
8622
8623 // Adjust the recipes for any inloop reductions.
8624 adjustRecipesForReductions(Plan, RecipeBuilder, MinVF: Range.Start);
8625
8626 // Interleave memory: for each Interleave Group we marked earlier as relevant
8627 // for this VPlan, replace the Recipes widening its memory instructions with a
8628 // single VPInterleaveRecipe at its insertion point.
8629 for (const auto *IG : InterleaveGroups) {
8630 auto *Recipe =
8631 cast<VPWidenMemoryRecipe>(Val: RecipeBuilder.getRecipe(I: IG->getInsertPos()));
8632 SmallVector<VPValue *, 4> StoredValues;
8633 for (unsigned i = 0; i < IG->getFactor(); ++i)
8634 if (auto *SI = dyn_cast_or_null<StoreInst>(Val: IG->getMember(Index: i))) {
8635 auto *StoreR = cast<VPWidenStoreRecipe>(Val: RecipeBuilder.getRecipe(I: SI));
8636 StoredValues.push_back(Elt: StoreR->getStoredValue());
8637 }
8638
8639 bool NeedsMaskForGaps =
8640 IG->requiresScalarEpilogue() && !CM.isScalarEpilogueAllowed();
8641 assert((!NeedsMaskForGaps || useMaskedInterleavedAccesses(CM.TTI)) &&
8642 "masked interleaved groups are not allowed.");
8643 auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8644 Recipe->getMask(), NeedsMaskForGaps);
8645 VPIG->insertBefore(InsertPos: Recipe);
8646 unsigned J = 0;
8647 for (unsigned i = 0; i < IG->getFactor(); ++i)
8648 if (Instruction *Member = IG->getMember(Index: i)) {
8649 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(I: Member);
8650 if (!Member->getType()->isVoidTy()) {
8651 VPValue *OriginalV = MemberR->getVPSingleValue();
8652 OriginalV->replaceAllUsesWith(New: VPIG->getVPValue(I: J));
8653 J++;
8654 }
8655 MemberR->eraseFromParent();
8656 }
8657 }
8658
8659 for (ElementCount VF : Range)
8660 Plan->addVF(VF);
8661 Plan->setName("Initial VPlan");
8662
8663 // Replace VPValues for known constant strides guaranteed by predicate scalar
8664 // evolution.
8665 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8666 auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
8667 auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV));
8668 // Only handle constant strides for now.
8669 if (!ScevStride)
8670 continue;
8671
8672 auto *CI = Plan->getOrAddLiveIn(
8673 V: ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt()));
8674 if (VPValue *StrideVPV = Plan->getLiveIn(V: StrideV))
8675 StrideVPV->replaceAllUsesWith(New: CI);
8676
8677 // The versioned value may not be used in the loop directly but through a
8678 // sext/zext. Add new live-ins in those cases.
8679 for (Value *U : StrideV->users()) {
8680 if (!isa<SExtInst, ZExtInst>(Val: U))
8681 continue;
8682 VPValue *StrideVPV = Plan->getLiveIn(V: U);
8683 if (!StrideVPV)
8684 continue;
8685 unsigned BW = U->getType()->getScalarSizeInBits();
8686 APInt C = isa<SExtInst>(Val: U) ? ScevStride->getAPInt().sext(width: BW)
8687 : ScevStride->getAPInt().zext(width: BW);
8688 VPValue *CI = Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: U->getType(), V: C));
8689 StrideVPV->replaceAllUsesWith(New: CI);
8690 }
8691 }
8692
8693 VPlanTransforms::dropPoisonGeneratingRecipes(Plan&: *Plan, BlockNeedsPredication: [this](BasicBlock *BB) {
8694 return Legal->blockNeedsPredication(BB);
8695 });
8696
8697 // Sink users of fixed-order recurrence past the recipe defining the previous
8698 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8699 if (!VPlanTransforms::adjustFixedOrderRecurrences(Plan&: *Plan, Builder))
8700 return nullptr;
8701
8702 if (useActiveLaneMask(Style)) {
8703 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8704 // TailFoldingStyle is visible there.
8705 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8706 bool WithoutRuntimeCheck =
8707 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8708 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow,
8709 DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck);
8710 }
8711 return Plan;
8712}
8713
8714VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8715   // Outer loop handling: outer loops may require CFG and instruction-level
8716   // transformations before even evaluating whether vectorization is profitable.
8717 // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8718 // the vectorization pipeline.
8719 assert(!OrigLoop->isInnermost());
8720 assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8721
8722 // Create new empty VPlan
8723 auto Plan = VPlan::createInitialVPlan(
8724 TripCount: createTripCountSCEV(IdxTy: Legal->getWidestInductionType(), PSE, OrigLoop),
8725 PSE&: *PSE.getSE(), RequiresScalarEpilogueCheck: true, TailFolded: false, TheLoop: OrigLoop);
8726
8727 // Build hierarchical CFG
8728 VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
8729 HCFGBuilder.buildHierarchicalCFG();
8730
8731 for (ElementCount VF : Range)
8732 Plan->addVF(VF);
8733
8734 VPlanTransforms::VPInstructionsToVPRecipes(
8735 Plan,
8736 GetIntOrFpInductionDescriptor: [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(Phi: P); },
8737 SE&: *PSE.getSE(), TLI: *TLI);
8738
8739 // Remove the existing terminator of the exiting block of the top-most region.
8740 // A BranchOnCount will be added instead when adding the canonical IV recipes.
8741 auto *Term =
8742 Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
8743 Term->eraseFromParent();
8744
8745 // Tail folding is not supported for outer loops, so the induction increment
8746 // is guaranteed to not wrap.
8747 bool HasNUW = true;
8748 addCanonicalIVRecipes(Plan&: *Plan, IdxTy: Legal->getWidestInductionType(), HasNUW,
8749 DL: DebugLoc());
8750 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8751 return Plan;
8752}
8753
8754// Adjust the recipes for reductions. For in-loop reductions the chain of
8755 // instructions leading from the loop exit instr to the phi needs to be converted
8756// to reductions, with one operand being vector and the other being the scalar
8757// reduction chain. For other reductions, a select is introduced between the phi
8758// and live-out recipes when folding the tail.
8759//
8760// A ComputeReductionResult recipe is added to the middle block, also for
8761// in-loop reductions which compute their result in-loop, because generating
8762// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
8763//
8764// Adjust AnyOf reductions; replace the reduction phi for the selected value
8765// with a boolean reduction phi node to check if the condition is true in any
8766// iteration. The final value is selected by the final ComputeReductionResult.
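// For example, for an in-loop integer add reduction
//   %red = phi [ 0, %ph ], [ %red.next, %loop ]
//   %red.next = add %red, %val
// the add is replaced by a VPReductionRecipe that reduces the vector %val into
// the scalar chain value carried by %red.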
8767void LoopVectorizationPlanner::adjustRecipesForReductions(
8768 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
8769 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
8770 VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
8771   // Gather all VPReductionPHIRecipes and sort them so that intermediate stores
8772   // sunk outside of the loop keep the same order as they had in the
8773   // original loop.
8774 SmallVector<VPReductionPHIRecipe *> ReductionPHIList;
8775 for (VPRecipeBase &R : Header->phis()) {
8776 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R))
8777 ReductionPHIList.emplace_back(Args&: ReductionPhi);
8778 }
8779 bool HasIntermediateStore = false;
8780 stable_sort(Range&: ReductionPHIList,
8781 C: [this, &HasIntermediateStore](const VPReductionPHIRecipe *R1,
8782 const VPReductionPHIRecipe *R2) {
8783 auto *IS1 = R1->getRecurrenceDescriptor().IntermediateStore;
8784 auto *IS2 = R2->getRecurrenceDescriptor().IntermediateStore;
8785 HasIntermediateStore |= IS1 || IS2;
8786
8787 // If neither of the recipes has an intermediate store, keep the
8788 // order the same.
8789 if (!IS1 && !IS2)
8790 return false;
8791
8792 // If only one of the recipes has an intermediate store, then
8793 // move it towards the beginning of the list.
8794 if (IS1 && !IS2)
8795 return true;
8796
8797 if (!IS1 && IS2)
8798 return false;
8799
8800 // If both recipes have an intermediate store, then the recipe
8801 // with the later store should be processed earlier. So it
8802 // should go to the beginning of the list.
8803 return DT->dominates(Def: IS2, User: IS1);
8804 });
8805
8806 if (HasIntermediateStore && ReductionPHIList.size() > 1)
8807 for (VPRecipeBase *R : ReductionPHIList)
8808 R->moveBefore(BB&: *Header, I: Header->getFirstNonPhi());
8809
8810 for (VPRecipeBase &R : Header->phis()) {
8811 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
8812 if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
8813 continue;
8814
8815 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8816 RecurKind Kind = RdxDesc.getRecurrenceKind();
8817 assert(!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
8818 "AnyOf reductions are not allowed for in-loop reductions");
8819
8820 // Collect the chain of "link" recipes for the reduction starting at PhiR.
8821 SetVector<VPSingleDefRecipe *> Worklist;
8822 Worklist.insert(X: PhiR);
8823 for (unsigned I = 0; I != Worklist.size(); ++I) {
8824 VPSingleDefRecipe *Cur = Worklist[I];
8825 for (VPUser *U : Cur->users()) {
8826 auto *UserRecipe = dyn_cast<VPSingleDefRecipe>(Val: U);
8827 if (!UserRecipe) {
8828 assert(isa<VPLiveOut>(U) &&
8829 "U must either be a VPSingleDef or VPLiveOut");
8830 continue;
8831 }
8832 Worklist.insert(X: UserRecipe);
8833 }
8834 }
8835
8836 // Visit operation "Links" along the reduction chain top-down starting from
8837 // the phi until LoopExitValue. We keep track of the previous item
8838 // (PreviousLink) to tell which of the two operands of a Link will remain
8839     // scalar and which will be reduced. For minmax by select(cmp), Link will be
8840     // the select instruction. Blend recipes of in-loop reduction phis will
8841 // get folded to their non-phi operand, as the reduction recipe handles the
8842 // condition directly.
8843 VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
8844 for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) {
8845 Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
8846
8847 // Index of the first operand which holds a non-mask vector operand.
8848 unsigned IndexOfFirstOperand;
8849 // Recognize a call to the llvm.fmuladd intrinsic.
8850 bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
8851 VPValue *VecOp;
8852 VPBasicBlock *LinkVPBB = CurrentLink->getParent();
8853 if (IsFMulAdd) {
8854 assert(
8855 RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
8856 "Expected instruction to be a call to the llvm.fmuladd intrinsic");
8857 assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
8858 isa<VPWidenCallRecipe>(CurrentLink)) &&
8859 CurrentLink->getOperand(2) == PreviousLink &&
8860 "expected a call where the previous link is the added operand");
8861
8862 // If the instruction is a call to the llvm.fmuladd intrinsic then we
8863 // need to create an fmul recipe (multiplying the first two operands of
8864 // the fmuladd together) to use as the vector operand for the fadd
8865 // reduction.
8866 VPInstruction *FMulRecipe = new VPInstruction(
8867 Instruction::FMul,
8868 {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)},
8869 CurrentLinkI->getFastMathFlags());
8870 LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator());
8871 VecOp = FMulRecipe;
8872 } else {
8873 auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink);
8874 if (PhiR->isInLoop() && Blend) {
8875 assert(Blend->getNumIncomingValues() == 2 &&
8876 "Blend must have 2 incoming values");
8877 if (Blend->getIncomingValue(Idx: 0) == PhiR)
8878 Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1));
8879 else {
8880 assert(Blend->getIncomingValue(1) == PhiR &&
8881 "PhiR must be an operand of the blend");
8882 Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0));
8883 }
8884 continue;
8885 }
8886
8887 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
8888 if (isa<VPWidenRecipe>(Val: CurrentLink)) {
8889 assert(isa<CmpInst>(CurrentLinkI) &&
8890 "need to have the compare of the select");
8891 continue;
8892 }
8893 assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
8894 "must be a select recipe");
8895 IndexOfFirstOperand = 1;
8896 } else {
8897 assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
8898 "Expected to replace a VPWidenSC");
8899 IndexOfFirstOperand = 0;
8900 }
8901 // Note that for non-commutable operands (cmp-selects), the semantics of
8902 // the cmp-select are captured in the recurrence kind.
8903 unsigned VecOpId =
8904 CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink
8905 ? IndexOfFirstOperand + 1
8906 : IndexOfFirstOperand;
8907 VecOp = CurrentLink->getOperand(N: VecOpId);
8908 assert(VecOp != PreviousLink &&
8909 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
8910 (VecOpId - IndexOfFirstOperand)) ==
8911 PreviousLink &&
8912 "PreviousLink must be the operand other than VecOp");
8913 }
8914
8915 BasicBlock *BB = CurrentLinkI->getParent();
8916 VPValue *CondOp = nullptr;
8917 if (CM.blockNeedsPredicationForAnyReason(BB))
8918 CondOp = RecipeBuilder.getBlockInMask(BB);
8919
8920 VPReductionRecipe *RedRecipe =
8921 new VPReductionRecipe(RdxDesc, CurrentLinkI, PreviousLink, VecOp,
8922 CondOp, CM.useOrderedReductions(RdxDesc));
8923 // Append the recipe to the end of the VPBasicBlock because we need to
8924       // ensure that it comes after all of its inputs, including CondOp.
8925 // Note that this transformation may leave over dead recipes (including
8926 // CurrentLink), which will be cleaned by a later VPlan transform.
8927 LinkVPBB->appendRecipe(Recipe: RedRecipe);
8928 CurrentLink->replaceAllUsesWith(New: RedRecipe);
8929 PreviousLink = RedRecipe;
8930 }
8931 }
8932 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
8933 Builder.setInsertPoint(&*LatchVPBB->begin());
8934 VPBasicBlock *MiddleVPBB =
8935 cast<VPBasicBlock>(Val: VectorLoopRegion->getSingleSuccessor());
8936 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
8937 for (VPRecipeBase &R :
8938 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8939 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
8940 if (!PhiR)
8941 continue;
8942
8943 const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
8944 // Adjust AnyOf reductions; replace the reduction phi for the selected value
8945 // with a boolean reduction phi node to check if the condition is true in
8946 // any iteration. The final value is selected by the final
8947 // ComputeReductionResult.
8948 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(
8949 Kind: RdxDesc.getRecurrenceKind())) {
8950 auto *Select = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
8951 return isa<VPWidenSelectRecipe>(Val: U) ||
8952 (isa<VPReplicateRecipe>(Val: U) &&
8953 cast<VPReplicateRecipe>(Val: U)->getUnderlyingInstr()->getOpcode() ==
8954 Instruction::Select);
8955 }));
8956 VPValue *Cmp = Select->getOperand(N: 0);
8957 // If the compare is checking the reduction PHI node, adjust it to check
8958 // the start value.
8959 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) {
8960 for (unsigned I = 0; I != CmpR->getNumOperands(); ++I)
8961 if (CmpR->getOperand(N: I) == PhiR)
8962 CmpR->setOperand(I, New: PhiR->getStartValue());
8963 }
8964 VPBuilder::InsertPointGuard Guard(Builder);
8965 Builder.setInsertPoint(Select);
8966
8967 // If the true value of the select is the reduction phi, the new value is
8968 // selected if the negated condition is true in any iteration.
8969 if (Select->getOperand(N: 1) == PhiR)
8970 Cmp = Builder.createNot(Operand: Cmp);
8971 VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
8972 Select->getVPSingleValue()->replaceAllUsesWith(New: Or);
8973
8974 // Convert the reduction phi to operate on bools.
8975 PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: ConstantInt::getFalse(
8976 Context&: OrigLoop->getHeader()->getContext())));
8977 }
8978
8979 // If tail is folded by masking, introduce selects between the phi
8980 // and the live-out instruction of each reduction, at the beginning of the
8981 // dedicated latch block.
8982 auto *OrigExitingVPV = PhiR->getBackedgeValue();
8983 auto *NewExitingVPV = PhiR->getBackedgeValue();
8984 if (!PhiR->isInLoop() && CM.foldTailByMasking()) {
8985 VPValue *Cond = RecipeBuilder.getBlockInMask(BB: OrigLoop->getHeader());
8986 assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
8987 "reduction recipe must be defined before latch");
8988 Type *PhiTy = PhiR->getOperand(N: 0)->getLiveInIRValue()->getType();
8989 std::optional<FastMathFlags> FMFs =
8990 PhiTy->isFloatingPointTy()
8991 ? std::make_optional(t: RdxDesc.getFastMathFlags())
8992 : std::nullopt;
8993 NewExitingVPV =
8994 Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", FMFs);
8995 OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
8996 return isa<VPInstruction>(Val: &U) &&
8997 cast<VPInstruction>(Val: &U)->getOpcode() ==
8998 VPInstruction::ComputeReductionResult;
8999 });
9000 if (PreferPredicatedReductionSelect ||
9001 TTI.preferPredicatedReductionSelect(
9002 Opcode: PhiR->getRecurrenceDescriptor().getOpcode(), Ty: PhiTy,
9003 Flags: TargetTransformInfo::ReductionFlags()))
9004 PhiR->setOperand(I: 1, New: NewExitingVPV);
9005 }
9006
9007 // If the vector reduction can be performed in a smaller type, we truncate
9008 // then extend the loop exit value to enable InstCombine to evaluate the
9009 // entire expression in the smaller type.
9010 Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType();
9011 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
9012 !RecurrenceDescriptor::isAnyOfRecurrenceKind(
9013 Kind: RdxDesc.getRecurrenceKind())) {
9014 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
9015 Type *RdxTy = RdxDesc.getRecurrenceType();
9016 auto *Trunc =
9017 new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
9018 auto *Extnd =
9019 RdxDesc.isSigned()
9020 ? new VPWidenCastRecipe(Instruction::SExt, Trunc, PhiTy)
9021 : new VPWidenCastRecipe(Instruction::ZExt, Trunc, PhiTy);
9022
9023 Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe());
9024 Extnd->insertAfter(InsertPos: Trunc);
9025 if (PhiR->getOperand(N: 1) == NewExitingVPV)
9026 PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());
9027 NewExitingVPV = Extnd;
9028 }
9029
9030 // We want code in the middle block to appear to execute on the location of
9031 // the scalar loop's latch terminator because: (a) it is all compiler
9032 // generated, (b) these instructions are always executed after evaluating
9033 // the latch conditional branch, and (c) other passes may add new
9034 // predecessors which terminate on this line. This is the easiest way to
9035 // ensure we don't accidentally cause an extra step back into the loop while
9036 // debugging.
9037 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
9038
9039 // TODO: At the moment ComputeReductionResult also drives creation of the
9040 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
9041 // even for in-loop reductions, until the reduction resume value handling is
9042 // also modeled in VPlan.
9043 auto *FinalReductionResult = new VPInstruction(
9044 VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
9045 FinalReductionResult->insertBefore(BB&: *MiddleVPBB, IP);
9046 OrigExitingVPV->replaceUsesWithIf(
9047 New: FinalReductionResult,
9048 ShouldReplace: [](VPUser &User, unsigned) { return isa<VPLiveOut>(Val: &User); });
9049 }
9050
9051 VPlanTransforms::clearReductionWrapFlags(Plan&: *Plan);
9052}
9053
9054void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9055 assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9056 "Not a pointer induction according to InductionDescriptor!");
9057 assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9058 "Unexpected type.");
9059 assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
9060 "Recipe should have been replaced");
9061
9062 auto *IVR = getParent()->getPlan()->getCanonicalIV();
9063 PHINode *CanonicalIV = cast<PHINode>(Val: State.get(Def: IVR, Part: 0, /*IsScalar*/ true));
9064 Type *PhiType = IndDesc.getStep()->getType();
9065
9066 // Build a pointer phi
9067 Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9068 Type *ScStValueType = ScalarStartValue->getType();
9069 PHINode *NewPointerPhi = PHINode::Create(Ty: ScStValueType, NumReservedValues: 2, NameStr: "pointer.phi",
9070 InsertBefore: CanonicalIV->getIterator());
9071
9072 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(R: this);
9073 NewPointerPhi->addIncoming(V: ScalarStartValue, BB: VectorPH);
9074
9075 // A pointer induction, performed by using a gep
9076 BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
9077
9078 Value *ScalarStepValue = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
9079 Value *RuntimeVF = getRuntimeVF(B&: State.Builder, Ty: PhiType, VF: State.VF);
9080 Value *NumUnrolledElems =
9081 State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: State.UF));
9082 Value *InductionGEP = GetElementPtrInst::Create(
9083 PointeeType: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
9084 IdxList: State.Builder.CreateMul(LHS: ScalarStepValue, RHS: NumUnrolledElems), NameStr: "ptr.ind",
9085 InsertBefore: InductionLoc);
9086 // Add induction update using an incorrect block temporarily. The phi node
9087 // will be fixed after VPlan execution. Note that at this point the latch
9088 // block cannot be used, as it does not exist yet.
9089 // TODO: Model increment value in VPlan, by turning the recipe into a
9090 // multi-def and a subclass of VPHeaderPHIRecipe.
9091 NewPointerPhi->addIncoming(V: InductionGEP, BB: VectorPH);
9092
9093 // Create UF many actual address geps that use the pointer
9094 // phi as base and a vectorized version of the step value
9095 // (<step*0, ..., step*N>) as offset.
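  // For example, with a fixed VF of 4 and scalar step %s, part 0 uses the
  // offset vector <0, %s, 2*%s, 3*%s> and part 1 uses <4*%s, 5*%s, 6*%s, 7*%s>.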
9096 for (unsigned Part = 0; Part < State.UF; ++Part) {
9097 Type *VecPhiType = VectorType::get(ElementType: PhiType, EC: State.VF);
9098 Value *StartOffsetScalar =
9099 State.Builder.CreateMul(LHS: RuntimeVF, RHS: ConstantInt::get(Ty: PhiType, V: Part));
9100 Value *StartOffset =
9101 State.Builder.CreateVectorSplat(EC: State.VF, V: StartOffsetScalar);
9102 // Create a vector of consecutive numbers from zero to VF.
9103 StartOffset = State.Builder.CreateAdd(
9104 LHS: StartOffset, RHS: State.Builder.CreateStepVector(DstType: VecPhiType));
9105
9106 assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
9107 "scalar step must be the same across all parts");
9108 Value *GEP = State.Builder.CreateGEP(
9109 Ty: State.Builder.getInt8Ty(), Ptr: NewPointerPhi,
9110 IdxList: State.Builder.CreateMul(
9111 LHS: StartOffset,
9112 RHS: State.Builder.CreateVectorSplat(EC: State.VF, V: ScalarStepValue),
9113 Name: "vector.gep"));
9114 State.set(Def: this, V: GEP, Part);
9115 }
9116}
9117
9118void VPDerivedIVRecipe::execute(VPTransformState &State) {
9119 assert(!State.Instance && "VPDerivedIVRecipe being replicated.");
9120
9121 // Fast-math-flags propagate from the original induction instruction.
9122 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9123 if (FPBinOp)
9124 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9125
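  // Sketch of the emitted computation for an integer induction (the integer
  // path of emitTransformedIndex):
  //   offset.idx = StartValue + CanonicalIV * Step
  // FP inductions use the recorded FP binop and pointer inductions an
  // equivalent getelementptr form instead.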
9126 Value *Step = State.get(Def: getStepValue(), Instance: VPIteration(0, 0));
9127 Value *CanonicalIV = State.get(Def: getOperand(N: 1), Instance: VPIteration(0, 0));
9128 Value *DerivedIV = emitTransformedIndex(
9129 B&: State.Builder, Index: CanonicalIV, StartValue: getStartValue()->getLiveInIRValue(), Step,
9130 InductionKind: Kind, InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
9131 DerivedIV->setName("offset.idx");
9132 assert(DerivedIV != CanonicalIV && "IV didn't need transforming?");
9133
9134 State.set(Def: this, V: DerivedIV, Instance: VPIteration(0, 0));
9135}
9136
9137void VPReplicateRecipe::execute(VPTransformState &State) {
9138 Instruction *UI = getUnderlyingInstr();
9139 if (State.Instance) { // Generate a single instance.
9140 assert((State.VF.isScalar() || !isUniform()) &&
9141 "uniform recipe shouldn't be predicated");
9142 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9143 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: *State.Instance, State);
9144 // Insert scalar instance packing it into a vector.
9145 if (State.VF.isVector() && shouldPack()) {
9146 // If we're constructing lane 0, initialize to start from poison.
9147 if (State.Instance->Lane.isFirstLane()) {
9148 assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9149 Value *Poison = PoisonValue::get(
9150 T: VectorType::get(ElementType: UI->getType(), EC: State.VF));
9151 State.set(Def: this, V: Poison, Part: State.Instance->Part);
9152 }
9153 State.packScalarIntoVectorValue(Def: this, Instance: *State.Instance);
9154 }
9155 return;
9156 }
9157
9158 if (IsUniform) {
9159 // If the recipe is uniform across all parts (instead of just per VF), only
9160 // generate a single instance.
9161 if ((isa<LoadInst>(Val: UI) || isa<StoreInst>(Val: UI)) &&
9162 all_of(Range: operands(), P: [](VPValue *Op) {
9163 return Op->isDefinedOutsideVectorRegions();
9164 })) {
9165 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(0, 0), State);
9166 if (user_begin() != user_end()) {
9167 for (unsigned Part = 1; Part < State.UF; ++Part)
9168 State.set(Def: this, V: State.get(Def: this, Instance: VPIteration(0, 0)),
9169 Instance: VPIteration(Part, 0));
9170 }
9171 return;
9172 }
9173
9174 // Uniform within VL means we need to generate lane 0 only for each
9175 // unrolled copy.
9176 for (unsigned Part = 0; Part < State.UF; ++Part)
9177 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, 0), State);
9178 return;
9179 }
9180
9181 // A store of a loop varying value to a uniform address only needs the last
9182 // copy of the store.
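  // (Illustrative example: for a body like "*q = a[i];" with loop-invariant q,
  // only the value stored by the last lane of the last unrolled part is
  // observable after the loop.)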
9183 if (isa<StoreInst>(Val: UI) &&
9184 vputils::isUniformAfterVectorization(VPV: getOperand(N: 1))) {
9185 auto Lane = VPLane::getLastLaneForVF(VF: State.VF);
9186 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(State.UF - 1, Lane),
9187 State);
9188 return;
9189 }
9190
9191 // Generate scalar instances for all VF lanes of all UF parts.
9192 assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9193 const unsigned EndLane = State.VF.getKnownMinValue();
9194 for (unsigned Part = 0; Part < State.UF; ++Part)
9195 for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9196 State.ILV->scalarizeInstruction(Instr: UI, RepRecipe: this, Instance: VPIteration(Part, Lane), State);
9197}
9198
9199void VPWidenLoadRecipe::execute(VPTransformState &State) {
9200 auto *LI = cast<LoadInst>(Val: &Ingredient);
9201
9202 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
9203 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
9204 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9205 bool CreateGather = !isConsecutive();
9206
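  // Illustrative only (fixed VF=4 and i32 elements assumed): a consecutive
  // masked access below becomes
  //   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(
  //       ptr %addr, i32 4, <4 x i1> %mask, <4 x i32> poison)
  // while a non-consecutive access becomes an @llvm.masked.gather call.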
9207 auto &Builder = State.Builder;
9208 State.setDebugLocFrom(getDebugLoc());
9209 for (unsigned Part = 0; Part < State.UF; ++Part) {
9210 Value *NewLI;
9211 Value *Mask = nullptr;
9212 if (auto *VPMask = getMask()) {
      // Mask reversal is only needed when a mask is present (non-null); a
      // null mask denotes an all-one mask, and the reverse of an all-one
      // mask is still an all-one mask.
9215 Mask = State.get(Def: VPMask, Part);
9216 if (isReverse())
9217 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
9218 }
9219
9220 Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateGather);
9221 if (CreateGather) {
9222 NewLI = Builder.CreateMaskedGather(Ty: DataTy, Ptrs: Addr, Alignment, Mask, PassThru: nullptr,
9223 Name: "wide.masked.gather");
9224 } else if (Mask) {
9225 NewLI = Builder.CreateMaskedLoad(Ty: DataTy, Ptr: Addr, Alignment, Mask,
9226 PassThru: PoisonValue::get(T: DataTy),
9227 Name: "wide.masked.load");
9228 } else {
9229 NewLI = Builder.CreateAlignedLoad(Ty: DataTy, Ptr: Addr, Align: Alignment, Name: "wide.load");
9230 }
    // Add metadata to the load itself; when reversing, the recipe's result
    // is set to the reverse shuffle created below.
9232 State.addMetadata(To: NewLI, From: LI);
9233 if (Reverse)
9234 NewLI = Builder.CreateVectorReverse(V: NewLI, Name: "reverse");
9235 State.set(Def: this, V: NewLI, Part);
9236 }
9237}
9238
9239/// Use all-true mask for reverse rather than actual mask, as it avoids a
9240/// dependence w/o affecting the result.
9241static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
9242 Value *EVL, const Twine &Name) {
9243 VectorType *ValTy = cast<VectorType>(Val: Operand->getType());
9244 Value *AllTrueMask =
9245 Builder.CreateVectorSplat(EC: ValTy->getElementCount(), V: Builder.getTrue());
9246 return Builder.CreateIntrinsic(RetTy: ValTy, ID: Intrinsic::experimental_vp_reverse,
9247 Args: {Operand, AllTrueMask, EVL}, FMFSource: nullptr, Name);
9248}
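// Illustrative only (assuming a <vscale x 4 x i32> operand): this helper
// produces IR along the lines of
//   %vp.reverse = call <vscale x 4 x i32>
//       @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %v,
//           <vscale x 4 x i1> %alltrue, i32 %evl)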
9249
9250void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
9251 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9252 "explicit vector length.");
9253 auto *LI = cast<LoadInst>(Val: &Ingredient);
9254
9255 Type *ScalarDataTy = getLoadStoreType(I: &Ingredient);
9256 auto *DataTy = VectorType::get(ElementType: ScalarDataTy, EC: State.VF);
9257 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9258 bool CreateGather = !isConsecutive();
9259
9260 auto &Builder = State.Builder;
9261 State.setDebugLocFrom(getDebugLoc());
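  // Illustrative only (assuming <vscale x 4 x i32> data): the consecutive,
  // non-reversed case below emits
  //   %vp.op.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(
  //       ptr %addr, <vscale x 4 x i1> %mask, i32 %evl)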
9262 CallInst *NewLI;
9263 Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
9264 Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateGather);
9265 Value *Mask = nullptr;
9266 if (VPValue *VPMask = getMask()) {
9267 Mask = State.get(Def: VPMask, Part: 0);
9268 if (isReverse())
9269 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
9270 } else {
9271 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
9272 }
9273
9274 if (CreateGather) {
9275 NewLI =
9276 Builder.CreateIntrinsic(RetTy: DataTy, ID: Intrinsic::vp_gather, Args: {Addr, Mask, EVL},
9277 FMFSource: nullptr, Name: "wide.masked.gather");
9278 } else {
9279 VectorBuilder VBuilder(Builder);
9280 VBuilder.setEVL(EVL).setMask(Mask);
9281 NewLI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
9282 Opcode: Instruction::Load, ReturnTy: DataTy, VecOpArray: Addr, Name: "vp.op.load"));
9283 }
9284 NewLI->addParamAttr(
9285 ArgNo: 0, Attr: Attribute::getWithAlignment(Context&: NewLI->getContext(), Alignment));
9286 State.addMetadata(To: NewLI, From: LI);
9287 Instruction *Res = NewLI;
9288 if (isReverse())
9289 Res = createReverseEVL(Builder, Operand: Res, EVL, Name: "vp.reverse");
9290 State.set(Def: this, V: Res, Part: 0);
9291}
9292
9293void VPWidenStoreRecipe::execute(VPTransformState &State) {
9294 auto *SI = cast<StoreInst>(Val: &Ingredient);
9295
9296 VPValue *StoredVPValue = getStoredValue();
9297 bool CreateScatter = !isConsecutive();
9298 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9299
9300 auto &Builder = State.Builder;
9301 State.setDebugLocFrom(getDebugLoc());
9302
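  // Illustrative only (fixed VF=4 and i32 elements assumed): a reversed,
  // consecutive masked store below becomes
  //   call void @llvm.masked.store.v4i32.p0(<4 x i32> %reverse, ptr %addr,
  //       i32 4, <4 x i1> %reverse.mask)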
9303 for (unsigned Part = 0; Part < State.UF; ++Part) {
9304 Instruction *NewSI = nullptr;
9305 Value *Mask = nullptr;
9306 if (auto *VPMask = getMask()) {
      // Mask reversal is only needed when a mask is present (non-null); a
      // null mask denotes an all-one mask, and the reverse of an all-one
      // mask is still an all-one mask.
9309 Mask = State.get(Def: VPMask, Part);
9310 if (isReverse())
9311 Mask = Builder.CreateVectorReverse(V: Mask, Name: "reverse");
9312 }
9313
9314 Value *StoredVal = State.get(Def: StoredVPValue, Part);
9315 if (isReverse()) {
9316 // If we store to reverse consecutive memory locations, then we need
9317 // to reverse the order of elements in the stored value.
9318 StoredVal = Builder.CreateVectorReverse(V: StoredVal, Name: "reverse");
9319 // We don't want to update the value in the map as it might be used in
9320 // another expression. So don't call resetVectorValue(StoredVal).
9321 }
9322 Value *Addr = State.get(Def: getAddr(), Part, /*IsScalar*/ !CreateScatter);
9323 if (CreateScatter)
9324 NewSI = Builder.CreateMaskedScatter(Val: StoredVal, Ptrs: Addr, Alignment, Mask);
9325 else if (Mask)
9326 NewSI = Builder.CreateMaskedStore(Val: StoredVal, Ptr: Addr, Alignment, Mask);
9327 else
9328 NewSI = Builder.CreateAlignedStore(Val: StoredVal, Ptr: Addr, Align: Alignment);
9329 State.addMetadata(To: NewSI, From: SI);
9330 }
9331}
9332
9333void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
9334 assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with "
9335 "explicit vector length.");
9336 auto *SI = cast<StoreInst>(Val: &Ingredient);
9337
9338 VPValue *StoredValue = getStoredValue();
9339 bool CreateScatter = !isConsecutive();
9340 const Align Alignment = getLoadStoreAlignment(I: &Ingredient);
9341
9342 auto &Builder = State.Builder;
9343 State.setDebugLocFrom(getDebugLoc());
9344
9345 CallInst *NewSI = nullptr;
9346 Value *StoredVal = State.get(Def: StoredValue, Part: 0);
9347 Value *EVL = State.get(Def: getEVL(), Instance: VPIteration(0, 0));
9348 if (isReverse())
9349 StoredVal = createReverseEVL(Builder, Operand: StoredVal, EVL, Name: "vp.reverse");
9350 Value *Mask = nullptr;
9351 if (VPValue *VPMask = getMask()) {
9352 Mask = State.get(Def: VPMask, Part: 0);
9353 if (isReverse())
9354 Mask = createReverseEVL(Builder, Operand: Mask, EVL, Name: "vp.reverse.mask");
9355 } else {
9356 Mask = Builder.CreateVectorSplat(EC: State.VF, V: Builder.getTrue());
9357 }
9358 Value *Addr = State.get(Def: getAddr(), Part: 0, IsScalar: !CreateScatter);
9359 if (CreateScatter) {
9360 NewSI = Builder.CreateIntrinsic(RetTy: Type::getVoidTy(C&: EVL->getContext()),
9361 ID: Intrinsic::vp_scatter,
9362 Args: {StoredVal, Addr, Mask, EVL});
9363 } else {
9364 VectorBuilder VBuilder(Builder);
9365 VBuilder.setEVL(EVL).setMask(Mask);
9366 NewSI = cast<CallInst>(Val: VBuilder.createVectorInstruction(
9367 Opcode: Instruction::Store, ReturnTy: Type::getVoidTy(C&: EVL->getContext()),
9368 VecOpArray: {StoredVal, Addr}));
9369 }
9370 NewSI->addParamAttr(
9371 ArgNo: 1, Attr: Attribute::getWithAlignment(Context&: NewSI->getContext(), Alignment));
9372 State.addMetadata(To: NewSI, From: SI);
9373}
9374
9375// Determine how to lower the scalar epilogue, which depends on 1) optimising
9376// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9377// predication, and 4) a TTI hook that analyses whether the loop is suitable
9378// for predication.
9379static ScalarEpilogueLowering getScalarEpilogueLowering(
9380 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9381 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9382 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9383 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9384 // don't look at hints or options, and don't request a scalar epilogue.
9385 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9386 // LoopAccessInfo (due to code dependency and not being able to reliably get
9387 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9388 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9389 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9390 // back to the old way and vectorize with versioning when forced. See D81345.)
9391 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
9392 QueryType: PGSOQueryType::IRPass) &&
9393 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9394 return CM_ScalarEpilogueNotAllowedOptSize;
9395
9396 // 2) If set, obey the directives
9397 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9398 switch (PreferPredicateOverEpilogue) {
9399 case PreferPredicateTy::ScalarEpilogue:
9400 return CM_ScalarEpilogueAllowed;
9401 case PreferPredicateTy::PredicateElseScalarEpilogue:
9402 return CM_ScalarEpilogueNotNeededUsePredicate;
9403 case PreferPredicateTy::PredicateOrDontVectorize:
9404 return CM_ScalarEpilogueNotAllowedUsePredicate;
9405 };
9406 }
9407
9408 // 3) If set, obey the hints
9409 switch (Hints.getPredicate()) {
9410 case LoopVectorizeHints::FK_Enabled:
9411 return CM_ScalarEpilogueNotNeededUsePredicate;
9412 case LoopVectorizeHints::FK_Disabled:
9413 return CM_ScalarEpilogueAllowed;
9414 };
9415
9416 // 4) if the TTI hook indicates this is profitable, request predication.
9417 TailFoldingInfo TFI(TLI, &LVL, IAI);
9418 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
9419 return CM_ScalarEpilogueNotNeededUsePredicate;
9420
9421 return CM_ScalarEpilogueAllowed;
9422}
9423
9424// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows VPlan-to-VPlan
// transformations to be applied from the very beginning without modifying the
9427// input LLVM IR.
9428static bool processLoopInVPlanNativePath(
9429 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9430 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9431 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9432 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9433 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9434 LoopVectorizationRequirements &Requirements) {
9435
9436 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
9437 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9438 return false;
9439 }
9440 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9441 Function *F = L->getHeader()->getParent();
9442 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9443
9444 ScalarEpilogueLowering SEL =
9445 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI);
9446
9447 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9448 &Hints, IAI);
9449 // Use the planner for outer loop vectorization.
9450 // TODO: CM is not used at this point inside the planner. Turn CM into an
9451 // optional argument if we don't need it in the future.
9452 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9453 ORE);
9454
9455 // Get user vectorization factor.
9456 ElementCount UserVF = Hints.getWidth();
9457
9458 CM.collectElementTypesForWidening();
9459
9460 // Plan how to best vectorize, return the best VF and its cost.
9461 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9462
9463 // If we are stress testing VPlan builds, do not attempt to generate vector
9464 // code. Masked vector code generation support will follow soon.
9465 // Also, do not attempt to vectorize if no vector code will be produced.
9466 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9467 return false;
9468
9469 VPlan &BestPlan = LVP.getBestPlanFor(VF: VF.Width);
9470
9471 {
9472 bool AddBranchWeights =
9473 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
9474 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9475 F->getDataLayout(), AddBranchWeights);
9476 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9477 VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9478 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9479 << L->getHeader()->getParent()->getName() << "\"\n");
9480 LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false);
9481 }
9482
9483 reportVectorization(ORE, TheLoop: L, VF, IC: 1);
9484
9485 // Mark the loop as already vectorized to avoid vectorizing again.
9486 Hints.setAlreadyVectorized();
9487 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9488 return true;
9489}
9490
9491// Emit a remark if there are stores to floats that required a floating point
9492// extension. If the vectorized loop was generated with floating point there
9493// will be a performance penalty from the conversion overhead and the change in
9494// the vector width.
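// For example (illustrative source, not taken from a test):
//   void f(float *A, double D, int N) {
//     for (int i = 0; i < N; i++)
//       A[i] = A[i] * D; // float extended to double and truncated back
//   }
// contains an fpext feeding the chain of each float store, which is the
// mixed-precision pattern flagged here.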
9495static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9496 SmallVector<Instruction *, 4> Worklist;
9497 for (BasicBlock *BB : L->getBlocks()) {
9498 for (Instruction &Inst : *BB) {
9499 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
9500 if (S->getValueOperand()->getType()->isFloatTy())
9501 Worklist.push_back(Elt: S);
9502 }
9503 }
9504 }
9505
  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
9508 SmallPtrSet<const Instruction *, 4> Visited;
9509 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9510 while (!Worklist.empty()) {
9511 auto *I = Worklist.pop_back_val();
9512 if (!L->contains(Inst: I))
9513 continue;
9514 if (!Visited.insert(Ptr: I).second)
9515 continue;
9516
9517 // Emit a remark if the floating point store required a floating
9518 // point conversion.
9519 // TODO: More work could be done to identify the root cause such as a
9520 // constant or a function return type and point the user to it.
9521 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
9522 ORE->emit(RemarkBuilder: [&]() {
9523 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9524 I->getDebugLoc(), L->getHeader())
9525 << "floating point conversion changes vector width. "
9526 << "Mixed floating point precision requires an up/down "
9527 << "cast that will negatively impact performance.";
9528 });
9529
9530 for (Use &Op : I->operands())
9531 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
9532 Worklist.push_back(Elt: OpI);
9533 }
9534}
9535
9536static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9537 VectorizationFactor &VF,
9538 std::optional<unsigned> VScale, Loop *L,
9539 ScalarEvolution &SE,
9540 ScalarEpilogueLowering SEL) {
9541 InstructionCost CheckCost = Checks.getCost();
9542 if (!CheckCost.isValid())
9543 return false;
9544
  // When interleaving only, the scalar and vector costs will be equal, which
  // in turn would lead to a divide by 0. Fall back to the hard threshold.
9547 if (VF.Width.isScalar()) {
9548 if (CheckCost > VectorizeMemoryCheckThreshold) {
9549 LLVM_DEBUG(
9550 dbgs()
9551 << "LV: Interleaving only is not profitable due to runtime checks\n");
9552 return false;
9553 }
9554 return true;
9555 }
9556
  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
9558 uint64_t ScalarC = *VF.ScalarCost.getValue();
9559 if (ScalarC == 0)
9560 return true;
9561
9562 // First, compute the minimum iteration count required so that the vector
9563 // loop outperforms the scalar loop.
9564 // The total cost of the scalar loop is
9565 // ScalarC * TC
9566 // where
9567 // * TC is the actual trip count of the loop.
9568 // * ScalarC is the cost of a single scalar iteration.
9569 //
9570 // The total cost of the vector loop is
9571 // RtC + VecC * (TC / VF) + EpiC
9572 // where
9573 // * RtC is the cost of the generated runtime checks
9574 // * VecC is the cost of a single vector iteration.
9575 // * TC is the actual trip count of the loop
9576 // * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
9579 //
9580 // Vectorization is profitable once the total vector cost is less than the
9581 // total scalar cost:
9582 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9583 //
9584 // Now we can compute the minimum required trip count TC as
9585 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9586 //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations below use integer arithmetic with the division rounded
  // up, hence we get an upper estimate of the TC.
9590 unsigned IntVF = VF.Width.getKnownMinValue();
9591 if (VF.Width.isScalable()) {
9592 unsigned AssumedMinimumVscale = 1;
9593 if (VScale)
9594 AssumedMinimumVscale = *VScale;
9595 IntVF *= AssumedMinimumVscale;
9596 }
9597 uint64_t RtC = *CheckCost.getValue();
9598 uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
9599 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div);
9600
9601 // Second, compute a minimum iteration count so that the cost of the
9602 // runtime checks is only a fraction of the total scalar loop cost. This
9603 // adds a loop-dependent bound on the overhead incurred if the runtime
9604 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9605 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9606 // cost, compute
9607 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9608 uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC);
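  // Worked example with hypothetical costs (not taken from any target):
  // ScalarC = 4, VecC = 10, RtC = 32 and IntVF = 4 give
  //   Div    = 4 * 4 - 10       = 6
  //   MinTC1 = ceil(32 * 4 / 6) = 22
  //   MinTC2 = ceil(32 * 10 / 4) = 80
  // so the runtime-check bound MinTC2 dominates and MinTC = 80 (already a
  // multiple of VF).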
9609
9610 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9611 // epilogue is allowed, choose the next closest multiple of VF. This should
9612 // partly compensate for ignoring the epilogue cost.
9613 uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
9614 if (SEL == CM_ScalarEpilogueAllowed)
9615 MinTC = alignTo(Value: MinTC, Align: IntVF);
9616 VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);
9617
9618 LLVM_DEBUG(
9619 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9620 << VF.MinProfitableTripCount << "\n");
9621
9622 // Skip vectorization if the expected trip count is less than the minimum
9623 // required trip count.
9624 if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
9625 if (ElementCount::isKnownLT(LHS: ElementCount::getFixed(MinVal: *ExpectedTC),
9626 RHS: VF.MinProfitableTripCount)) {
9627 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9628 "trip count < minimum profitable VF ("
9629 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9630 << ")\n");
9631
9632 return false;
9633 }
9634 }
9635 return true;
9636}
9637
9638LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9639 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9640 !EnableLoopInterleaving),
9641 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9642 !EnableLoopVectorization) {}
9643
9644bool LoopVectorizePass::processLoop(Loop *L) {
9645 assert((EnableVPlanNativePath || L->isInnermost()) &&
9646 "VPlan-native path is not enabled. Only process inner loops.");
9647
9648 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9649 << L->getHeader()->getParent()->getName() << "' from "
9650 << L->getLocStr() << "\n");
9651
9652 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9653
9654 LLVM_DEBUG(
9655 dbgs() << "LV: Loop hints:"
9656 << " force="
9657 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9658 ? "disabled"
9659 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9660 ? "enabled"
9661 : "?"))
9662 << " width=" << Hints.getWidth()
9663 << " interleave=" << Hints.getInterleave() << "\n");
9664
9665 // Function containing loop
9666 Function *F = L->getHeader()->getParent();
9667
9668 // Looking at the diagnostic output is the only way to determine if a loop
9669 // was vectorized (other than looking at the IR or machine code), so it
9670 // is important to generate an optimization remark for each loop. Most of
9671 // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
9675
9676 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9677 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9678 return false;
9679 }
9680
9681 PredicatedScalarEvolution PSE(*SE, *L);
9682
9683 // Check if it is legal to vectorize the loop.
9684 LoopVectorizationRequirements Requirements;
9685 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9686 &Requirements, &Hints, DB, AC, BFI, PSI);
9687 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9688 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9689 Hints.emitRemarkWithHints();
9690 return false;
9691 }
9692
9693 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9694 // here. They may require CFG and instruction level transformations before
9695 // even evaluating whether vectorization is profitable. Since we cannot modify
9696 // the incoming IR, we need to build VPlan upfront in the vectorization
9697 // pipeline.
9698 if (!L->isInnermost())
9699 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9700 ORE, BFI, PSI, Hints, Requirements);
9701
9702 assert(L->isInnermost() && "Inner loop expected.");
9703
9704 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9705 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9706
9707 // If an override option has been passed in for interleaved accesses, use it.
9708 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9709 UseInterleaved = EnableInterleavedMemAccesses;
9710
9711 // Analyze interleaved memory accesses.
9712 if (UseInterleaved)
9713 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9714
9715 // Check the function attributes and profiles to find out if this function
9716 // should be optimized for size.
9717 ScalarEpilogueLowering SEL =
9718 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI);
9719
9720 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9721 // count by optimizing for size, to minimize overheads.
9722 auto ExpectedTC = getSmallBestKnownTC(SE&: *SE, L);
9723 if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9724 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9725 << "This loop is worth vectorizing only if no scalar "
9726 << "iteration overheads are incurred.");
9727 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9728 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9729 else {
9730 if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
9731 LLVM_DEBUG(dbgs() << "\n");
9732 // Predicate tail-folded loops are efficient even when the loop
9733 // iteration count is low. However, setting the epilogue policy to
9734 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9735 // with runtime checks. It's more effective to let
9736 // `areRuntimeChecksProfitable` determine if vectorization is beneficial
9737 // for the loop.
9738 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9739 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9740 } else {
9741 LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
9742 "small to consider vectorizing.\n");
        reportVectorizationFailure(
            "The trip count is below the minimal threshold value.",
            "loop trip count is too low, avoiding vectorization",
            "LowTripCount", ORE, L);
9747 Hints.emitRemarkWithHints();
9748 return false;
9749 }
9750 }
9751 }
9752
9753 // Check the function attributes to see if implicit floats or vectors are
9754 // allowed.
9755 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
9756 reportVectorizationFailure(
9757 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
9758 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
9759 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
9760 Hints.emitRemarkWithHints();
9761 return false;
9762 }
9763
9764 // Check if the target supports potentially unsafe FP vectorization.
9765 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9766 // for the target we're vectorizing for, to make sure none of the
9767 // additional fp-math flags can help.
9768 if (Hints.isPotentiallyUnsafe() &&
9769 TTI->isFPVectorizationPotentiallyUnsafe()) {
9770 reportVectorizationFailure(
9771 DebugMsg: "Potentially unsafe FP op prevents vectorization",
9772 OREMsg: "loop not vectorized due to unsafe FP support.",
9773 ORETag: "UnsafeFP", ORE, TheLoop: L);
9774 Hints.emitRemarkWithHints();
9775 return false;
9776 }
9777
9778 bool AllowOrderedReductions;
9779 // If the flag is set, use that instead and override the TTI behaviour.
9780 if (ForceOrderedReductions.getNumOccurrences() > 0)
9781 AllowOrderedReductions = ForceOrderedReductions;
9782 else
9783 AllowOrderedReductions = TTI->enableOrderedReductions();
9784 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
9785 ORE->emit(RemarkBuilder: [&]() {
9786 auto *ExactFPMathInst = Requirements.getExactFPInst();
9787 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9788 ExactFPMathInst->getDebugLoc(),
9789 ExactFPMathInst->getParent())
9790 << "loop not vectorized: cannot prove it is safe to reorder "
9791 "floating-point operations";
9792 });
9793 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9794 "reorder floating-point operations\n");
9795 Hints.emitRemarkWithHints();
9796 return false;
9797 }
9798
9799 // Use the cost model.
9800 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9801 F, &Hints, IAI);
9802 // Use the planner for vectorization.
9803 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9804 ORE);
9805
9806 // Get user vectorization factor and interleave count.
9807 ElementCount UserVF = Hints.getWidth();
9808 unsigned UserIC = Hints.getInterleave();
9809
9810 // Plan how to best vectorize, return the best VF and its cost.
9811 std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9812
9813 VectorizationFactor VF = VectorizationFactor::Disabled();
9814 unsigned IC = 1;
9815
9816 bool AddBranchWeights =
9817 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
9818 GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9819 F->getDataLayout(), AddBranchWeights);
9820 if (MaybeVF) {
9821 VF = *MaybeVF;
9822 // Select the interleave count.
9823 IC = CM.selectInterleaveCount(VF: VF.Width, LoopCost: VF.Cost);
9824
9825 unsigned SelectedIC = std::max(a: IC, b: UserIC);
9826 // Optimistically generate runtime checks if they are needed. Drop them if
9827 // they turn out to not be profitable.
9828 if (VF.Width.isVector() || SelectedIC > 1)
9829 Checks.Create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC);
9830
9831 // Check if it is profitable to vectorize with runtime checks.
9832 bool ForceVectorization =
9833 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9834 if (!ForceVectorization &&
9835 !areRuntimeChecksProfitable(Checks, VF, VScale: getVScaleForTuning(L, TTI: *TTI), L,
9836 SE&: *PSE.getSE(), SEL)) {
9837 ORE->emit(RemarkBuilder: [&]() {
9838 return OptimizationRemarkAnalysisAliasing(
9839 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9840 L->getHeader())
9841 << "loop not vectorized: cannot prove it is safe to reorder "
9842 "memory operations";
9843 });
9844 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9845 Hints.emitRemarkWithHints();
9846 return false;
9847 }
9848 }
9849
9850 // Identify the diagnostic messages that should be produced.
9851 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9852 bool VectorizeLoop = true, InterleaveLoop = true;
9853 if (VF.Width.isScalar()) {
9854 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9855 VecDiagMsg = std::make_pair(
9856 x: "VectorizationNotBeneficial",
9857 y: "the cost-model indicates that vectorization is not beneficial");
9858 VectorizeLoop = false;
9859 }
9860
9861 if (!MaybeVF && UserIC > 1) {
9862 // Tell the user interleaving was avoided up-front, despite being explicitly
9863 // requested.
9864 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9865 "interleaving should be avoided up front\n");
9866 IntDiagMsg = std::make_pair(
9867 x: "InterleavingAvoided",
9868 y: "Ignoring UserIC, because interleaving was avoided up front");
9869 InterleaveLoop = false;
9870 } else if (IC == 1 && UserIC <= 1) {
9871 // Tell the user interleaving is not beneficial.
9872 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9873 IntDiagMsg = std::make_pair(
9874 x: "InterleavingNotBeneficial",
9875 y: "the cost-model indicates that interleaving is not beneficial");
9876 InterleaveLoop = false;
9877 if (UserIC == 1) {
9878 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9879 IntDiagMsg.second +=
9880 " and is explicitly disabled or interleave count is set to 1";
9881 }
9882 } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly
    // disabled.
9884 LLVM_DEBUG(
9885 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
9886 IntDiagMsg = std::make_pair(
9887 x: "InterleavingBeneficialButDisabled",
9888 y: "the cost-model indicates that interleaving is beneficial "
9889 "but is explicitly disabled or interleave count is set to 1");
9890 InterleaveLoop = false;
9891 }
9892
9893 // Override IC if user provided an interleave count.
9894 IC = UserIC > 0 ? UserIC : IC;
9895
9896 // Emit diagnostic messages, if any.
9897 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9898 if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9900 ORE->emit(RemarkBuilder: [&]() {
9901 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9902 L->getStartLoc(), L->getHeader())
9903 << VecDiagMsg.second;
9904 });
9905 ORE->emit(RemarkBuilder: [&]() {
9906 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9907 L->getStartLoc(), L->getHeader())
9908 << IntDiagMsg.second;
9909 });
9910 return false;
9911 } else if (!VectorizeLoop && InterleaveLoop) {
9912 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9913 ORE->emit(RemarkBuilder: [&]() {
9914 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9915 L->getStartLoc(), L->getHeader())
9916 << VecDiagMsg.second;
9917 });
9918 } else if (VectorizeLoop && !InterleaveLoop) {
9919 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9920 << ") in " << L->getLocStr() << '\n');
9921 ORE->emit(RemarkBuilder: [&]() {
9922 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9923 L->getStartLoc(), L->getHeader())
9924 << IntDiagMsg.second;
9925 });
9926 } else if (VectorizeLoop && InterleaveLoop) {
9927 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9928 << ") in " << L->getLocStr() << '\n');
9929 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9930 }
9931
9932 bool DisableRuntimeUnroll = false;
9933 MDNode *OrigLoopID = L->getLoopID();
9934 {
9935 using namespace ore;
9936 if (!VectorizeLoop) {
9937 assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not beneficial to vectorize the loop, then
      // interleave it.
9940 InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
9941 &CM, BFI, PSI, Checks);
9942
9943 VPlan &BestPlan =
9944 UseLegacyCostModel ? LVP.getBestPlanFor(VF: VF.Width) : LVP.getBestPlan();
9945 assert((UseLegacyCostModel || BestPlan.hasScalarVFOnly()) &&
9946 "VPlan cost model and legacy cost model disagreed");
9947 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: Unroller, DT, IsEpilogueVectorization: false);
9948
9949 ORE->emit(RemarkBuilder: [&]() {
9950 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9951 L->getHeader())
9952 << "interleaved loop (interleaved count: "
9953 << NV("InterleaveCount", IC) << ")";
9954 });
9955 } else {
9956 // If we decided that it is *legal* to vectorize the loop, then do it.
9957
9958 // Consider vectorizing the epilogue too if it's profitable.
9959 VectorizationFactor EpilogueVF =
9960 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
9961 if (EpilogueVF.Width.isVector()) {
9962
9963 // The first pass vectorizes the main loop and creates a scalar epilogue
9964 // to be vectorized by executing the plan (potentially with a different
9965 // factor) again shortly afterwards.
9966 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
9967 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
9968 EPI, &LVL, &CM, BFI, PSI, Checks);
9969
9970 std::unique_ptr<VPlan> BestMainPlan(
9971 LVP.getBestPlanFor(VF: EPI.MainLoopVF).duplicate());
9972 const auto &[ExpandedSCEVs, ReductionResumeValues] = LVP.executePlan(
9973 BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF, BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, IsEpilogueVectorization: true);
9974 ++LoopsVectorized;
9975
9976 // Second pass vectorizes the epilogue and adjusts the control flow
9977 // edges from the first pass.
9978 EPI.MainLoopVF = EPI.EpilogueVF;
9979 EPI.MainLoopUF = EPI.EpilogueUF;
9980 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
9981 ORE, EPI, &LVL, &CM, BFI, PSI,
9982 Checks);
9983
9984 VPlan &BestEpiPlan = LVP.getBestPlanFor(VF: EPI.EpilogueVF);
9985 VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
9986 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9987 Header->setName("vec.epilog.vector.body");
9988
        // Re-use the trip count and steps expanded for the main loop, as
        // skeleton creation needs them as values that dominate both the
        // scalar and vector epilogue loops.
9992 // TODO: This is a workaround needed for epilogue vectorization and it
9993 // should be removed once induction resume value creation is done
9994 // directly in VPlan.
9995 EpilogILV.setTripCount(MainILV.getTripCount());
9996 for (auto &R : make_early_inc_range(Range&: *BestEpiPlan.getPreheader())) {
9997 auto *ExpandR = cast<VPExpandSCEVRecipe>(Val: &R);
9998 auto *ExpandedVal = BestEpiPlan.getOrAddLiveIn(
9999 V: ExpandedSCEVs.find(Val: ExpandR->getSCEV())->second);
10000 ExpandR->replaceAllUsesWith(New: ExpandedVal);
10001 if (BestEpiPlan.getTripCount() == ExpandR)
10002 BestEpiPlan.resetTripCount(NewTripCount: ExpandedVal);
10003 ExpandR->eraseFromParent();
10004 }
10005
10006 // Ensure that the start values for any VPWidenIntOrFpInductionRecipe,
10007 // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated
10008 // before vectorizing the epilogue loop.
10009 for (VPRecipeBase &R : Header->phis()) {
10010 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
10011 continue;
10012
10013 Value *ResumeV = nullptr;
10014 // TODO: Move setting of resume values to prepareToExecute.
10015 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
10016 const RecurrenceDescriptor &RdxDesc =
10017 ReductionPhi->getRecurrenceDescriptor();
10018 RecurKind RK = RdxDesc.getRecurrenceKind();
10019 ResumeV = ReductionResumeValues.find(Val: &RdxDesc)->second;
10020 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) {
10021 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
10022 // start value; compare the final value from the main vector loop
10023 // to the start value.
10024 IRBuilder<> Builder(
10025 cast<Instruction>(Val: ResumeV)->getParent()->getFirstNonPHI());
10026 ResumeV = Builder.CreateICmpNE(LHS: ResumeV,
10027 RHS: RdxDesc.getRecurrenceStartValue());
10028 }
10029 } else {
10030 // Create induction resume values for both widened pointer and
10031 // integer/fp inductions and update the start value of the induction
10032 // recipes to use the resume value.
10033 PHINode *IndPhi = nullptr;
10034 const InductionDescriptor *ID;
10035 if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(Val: &R)) {
10036 IndPhi = cast<PHINode>(Val: Ind->getUnderlyingValue());
10037 ID = &Ind->getInductionDescriptor();
10038 } else {
10039 auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(Val: &R);
10040 IndPhi = WidenInd->getPHINode();
10041 ID = &WidenInd->getInductionDescriptor();
10042 }
10043
10044 ResumeV = MainILV.createInductionResumeValue(
10045 OrigPhi: IndPhi, II: *ID, Step: getExpandedStep(ID: *ID, ExpandedSCEVs),
10046 BypassBlocks: {EPI.MainLoopIterationCountCheck});
10047 }
10048 assert(ResumeV && "Must have a resume value");
10049 VPValue *StartVal = BestEpiPlan.getOrAddLiveIn(V: ResumeV);
10050 cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
10051 }
10052
10053 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10054 "DT not preserved correctly");
10055 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV,
10056 DT, IsEpilogueVectorization: true, ExpandedSCEVs: &ExpandedSCEVs);
10057 ++LoopsEpilogueVectorized;
10058
10059 if (!MainILV.areSafetyChecksAdded())
10060 DisableRuntimeUnroll = true;
10061 } else {
10062 ElementCount Width = VF.Width;
10063 VPlan &BestPlan =
10064 UseLegacyCostModel ? LVP.getBestPlanFor(VF: Width) : LVP.getBestPlan();
10065 if (!UseLegacyCostModel) {
10066 assert(size(BestPlan.vectorFactors()) == 1 &&
10067 "Plan should have a single VF");
10068 Width = *BestPlan.vectorFactors().begin();
10069 LLVM_DEBUG(dbgs()
10070 << "VF picked by VPlan cost model: " << Width << "\n");
10071 assert(VF.Width == Width &&
10072 "VPlan cost model and legacy cost model disagreed");
10073 }
10074 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, Width,
10075 VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10076 PSI, Checks);
10077 LVP.executePlan(BestVF: Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, IsEpilogueVectorization: false);
10078 ++LoopsVectorized;
10079
10080 // Add metadata to disable runtime unrolling a scalar loop when there
10081 // are no runtime checks about strides and memory. A scalar loop that is
10082 // rarely used is not worth unrolling.
10083 if (!LB.areSafetyChecksAdded())
10084 DisableRuntimeUnroll = true;
10085 }
10086 // Report the vectorization decision.
10087 reportVectorization(ORE, TheLoop: L, VF, IC);
10088 }
10089
10090 if (ORE->allowExtraAnalysis(LV_NAME))
10091 checkMixedPrecision(L, ORE);
10092 }
10093
10094 std::optional<MDNode *> RemainderLoopID =
10095 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
10096 LLVMLoopVectorizeFollowupEpilogue});
10097 if (RemainderLoopID) {
10098 L->setLoopID(*RemainderLoopID);
10099 } else {
10100 if (DisableRuntimeUnroll)
10101 AddRuntimeUnrollDisableMetaData(L);
10102
10103 // Mark the loop as already vectorized to avoid vectorizing again.
10104 Hints.setAlreadyVectorized();
10105 }
10106
10107 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10108 return true;
10109}
10110
10111LoopVectorizeResult LoopVectorizePass::runImpl(
10112 Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10113 DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
10114 DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
10115 OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10116 SE = &SE_;
10117 LI = &LI_;
10118 TTI = &TTI_;
10119 DT = &DT_;
10120 BFI = BFI_;
10121 TLI = TLI_;
10122 AC = &AC_;
10123 LAIs = &LAIs_;
10124 DB = &DB_;
10125 ORE = &ORE_;
10126 PSI = PSI_;
10127
10128 // Don't attempt if
10129 // 1. the target claims to have no vector registers, and
10130 // 2. interleaving won't help ILP.
10131 //
10132 // The second condition is necessary because, even if the target has no
10133 // vector registers, loop vectorization may still enable scalar
10134 // interleaving.
10135 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
10136 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
10137 return LoopVectorizeResult(false, false);
10138
10139 bool Changed = false, CFGChanged = false;
10140
10141 // The vectorizer requires loops to be in simplified form.
10142 // Since simplification may add new inner loops, it has to run before the
10143 // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10145 // vectorized.
10146 for (const auto &L : *LI)
10147 Changed |= CFGChanged |=
10148 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
10149
10150 // Build up a worklist of inner-loops to vectorize. This is necessary as
10151 // the act of vectorizing or partially unrolling a loop creates new loops
10152 // and can invalidate iterators across the loops.
10153 SmallVector<Loop *, 8> Worklist;
10154
10155 for (Loop *L : *LI)
10156 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
10157
10158 LoopsAnalyzed += Worklist.size();
10159
10160 // Now walk the identified inner loops.
10161 while (!Worklist.empty()) {
10162 Loop *L = Worklist.pop_back_val();
10163
10164 // For the inner loops we actually process, form LCSSA to simplify the
10165 // transform.
10166 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
10167
10168 Changed |= CFGChanged |= processLoop(L);
10169
10170 if (Changed) {
10171 LAIs->clear();
10172
10173#ifndef NDEBUG
10174 if (VerifySCEV)
10175 SE->verify();
10176#endif
10177 }
10178 }
10179
10180 // Process each loop nest in the function.
10181 return LoopVectorizeResult(Changed, CFGChanged);
10182}
10183
10184PreservedAnalyses LoopVectorizePass::run(Function &F,
10185 FunctionAnalysisManager &AM) {
10186 auto &LI = AM.getResult<LoopAnalysis>(IR&: F);
10187 // There are no loops in the function. Return before computing other expensive
10188 // analyses.
10189 if (LI.empty())
10190 return PreservedAnalyses::all();
10191 auto &SE = AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
10192 auto &TTI = AM.getResult<TargetIRAnalysis>(IR&: F);
10193 auto &DT = AM.getResult<DominatorTreeAnalysis>(IR&: F);
10194 auto &TLI = AM.getResult<TargetLibraryAnalysis>(IR&: F);
10195 auto &AC = AM.getResult<AssumptionAnalysis>(IR&: F);
10196 auto &DB = AM.getResult<DemandedBitsAnalysis>(IR&: F);
10197 auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
10198
10199 LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(IR&: F);
10200 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
10201 ProfileSummaryInfo *PSI =
10202 MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
10203 BlockFrequencyInfo *BFI = nullptr;
10204 if (PSI && PSI->hasProfileSummary())
10205 BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F);
10206 LoopVectorizeResult Result =
10207 runImpl(F, SE_&: SE, LI_&: LI, TTI_&: TTI, DT_&: DT, BFI_: BFI, TLI_: &TLI, DB_&: DB, AC_&: AC, LAIs_&: LAIs, ORE_&: ORE, PSI_: PSI);
10208 if (!Result.MadeAnyChange)
10209 return PreservedAnalyses::all();
10210 PreservedAnalyses PA;
10211
10212 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
10213 for (auto &BB : F)
10214 RemoveRedundantDbgInstrs(BB: &BB);
10215 }
10216
10217 PA.preserve<LoopAnalysis>();
10218 PA.preserve<DominatorTreeAnalysis>();
10219 PA.preserve<ScalarEvolutionAnalysis>();
10220 PA.preserve<LoopAccessAnalysis>();
10221
10222 if (Result.MadeCFGChange) {
10223 // Making CFG changes likely means a loop got vectorized. Indicate that
10224 // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10226 // be run if runtime checks have been added.
10227 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
10228 PA.preserve<ShouldRunExtraVectorPasses>();
10229 } else {
10230 PA.preserveSet<CFGAnalyses>();
10231 }
10232 return PA;
10233}
10234
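// For example (output sketch; assumes the defaults and the pass being
// registered as "loop-vectorize"), this prints:
//   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>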
10235void LoopVectorizePass::printPipeline(
10236 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10237 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10238 OS, MapClassName2PassName);
10239
10240 OS << '<';
10241 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10242 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10243 OS << '>';
10244}
10245