1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
97#include "llvm/Analysis/TargetLibraryInfo.h"
98#include "llvm/Analysis/TargetTransformInfo.h"
99#include "llvm/Analysis/ValueTracking.h"
100#include "llvm/Analysis/VectorUtils.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/DiagnosticInfo.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
117#include "llvm/IR/IntrinsicInst.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/ProfDataUtils.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
131#include "llvm/Support/CommandLine.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/NativeFormatting.h"
137#include "llvm/Support/raw_ostream.h"
138#include "llvm/Transforms/Utils/BasicBlockUtils.h"
139#include "llvm/Transforms/Utils/InjectTLIMappings.h"
140#include "llvm/Transforms/Utils/Local.h"
141#include "llvm/Transforms/Utils/LoopSimplify.h"
142#include "llvm/Transforms/Utils/LoopUtils.h"
143#include "llvm/Transforms/Utils/LoopVersioning.h"
144#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
145#include "llvm/Transforms/Utils/SizeOpts.h"
146#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
159using namespace llvm;
160using namespace SCEVPatternMatch;
161using namespace LoopVectorizationUtils;
162
163#define LV_NAME "loop-vectorize"
164#define DEBUG_TYPE LV_NAME
165
166#ifndef NDEBUG
167const char VerboseDebug[] = DEBUG_TYPE "-verbose";
168#endif
169
170STATISTIC(LoopsVectorized, "Number of loops vectorized");
171STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
172STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
173STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
174STATISTIC(LoopsPartialAliasVectorized,
175 "Number of partial aliasing loops vectorized");
176
177static cl::opt<bool> EnableEpilogueVectorization(
178 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
179 cl::desc("Enable vectorization of epilogue loops."));
180
181static cl::opt<unsigned> EpilogueVectorizationForceVF(
182 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
183 cl::desc("When epilogue vectorization is enabled, and a value greater than "
184 "1 is specified, forces the given VF for all applicable epilogue "
185 "loops."));
186
187static cl::opt<unsigned> EpilogueVectorizationMinVF(
188 "epilogue-vectorization-minimum-VF", cl::Hidden,
189 cl::desc("Only loops with vectorization factor equal to or larger than "
190 "the specified value are considered for epilogue vectorization."));
191
192/// Loops with a known constant trip count below this number are vectorized only
193/// if no scalar iteration overheads are incurred.
194static cl::opt<unsigned> TinyTripCountVectorThreshold(
195 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
196 cl::desc("Loops with a constant trip count that is smaller than this "
197 "value are vectorized only if no scalar iteration overheads "
198 "are incurred."));
199
200static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
201 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
202 cl::desc("The maximum allowed number of runtime memory checks"));
203
204static cl::opt<bool> ForcePartialAliasingVectorization(
205 "force-partial-aliasing-vectorization", cl::init(Val: false), cl::Hidden,
206 cl::desc("Replace pointer diff checks with alias masks."));
207
/// Values accepted by the tail-folding-policy options. The vectorizer will
/// attempt to fold the tail-loop into the vector loop (main/epilogue loops) and
/// predicate the instructions accordingly. If tail-folding fails, the fallback
/// strategy depends on the selected value.
enum class TailFoldingPolicyTy {
  /// Don't tail-fold loops.
  None = 0,
  /// Prefer tail-folding, otherwise create an epilogue when appropriate.
  PreferFoldTail,
  /// Always tail-fold; don't attempt vectorization if tail-folding fails.
  MustFoldTail
};
214
215static cl::opt<TailFoldingPolicyTy> TailFoldingPolicy(
216 "tail-folding-policy", cl::init(Val: TailFoldingPolicyTy::None), cl::Hidden,
217 cl::desc("Tail-folding preferences over creating an epilogue loop."),
218 cl::values(
219 clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail",
220 "Don't tail-fold loops."),
221 clEnumValN(TailFoldingPolicyTy::PreferFoldTail, "prefer-fold-tail",
222 "prefer tail-folding, otherwise create an epilogue when "
223 "appropriate."),
224 clEnumValN(TailFoldingPolicyTy::MustFoldTail, "must-fold-tail",
225 "always tail-fold, don't attempt vectorization if "
226 "tail-folding fails.")));
227
228static cl::opt<TailFoldingPolicyTy> EpilogueTailFoldingPolicy(
229 "epilogue-tail-folding-policy", cl::Hidden,
230 cl::desc(
231 "Epilogue-tail-folding preferences over creating an epilogue loop."),
232 cl::values(
233 clEnumValN(TailFoldingPolicyTy::None, "dont-fold-tail",
234 "Don't tail-fold loops."),
235 clEnumValN(TailFoldingPolicyTy::PreferFoldTail, "prefer-fold-tail",
236 "prefer tail-folding, otherwise create an epilogue when "
237 "appropriate.")));
238
239static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
240 "force-tail-folding-style", cl::desc("Force the tail folding style"),
241 cl::init(Val: TailFoldingStyle::None),
242 cl::values(
243 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
244 clEnumValN(
245 TailFoldingStyle::Data, "data",
246 "Create lane mask for data only, using active.lane.mask intrinsic"),
247 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
248 "data-without-lane-mask",
249 "Create lane mask with compare/stepvector"),
250 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
251 "Create lane mask using active.lane.mask intrinsic, and use "
252 "it for both data and control flow"),
253 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
254 "Use predicated EVL instructions for tail folding. If EVL "
255 "is unsupported, fallback to data-without-lane-mask.")));
256
257cl::opt<bool> llvm::EnableWideActiveLaneMask(
258 "enable-wide-lane-mask", cl::init(Val: false), cl::Hidden,
259 cl::desc("Enable use of wide lane masks when used for control flow in "
260 "tail-folded loops"));
261
262static cl::opt<bool> EnableInterleavedMemAccesses(
263 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
264 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
265
266/// An interleave-group may need masking if it resides in a block that needs
267/// predication, or in order to mask away gaps.
268static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
269 "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
270 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
271
272static cl::opt<unsigned> ForceTargetNumScalarRegs(
273 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
274 cl::desc("A flag that overrides the target's number of scalar registers."));
275
276static cl::opt<unsigned> ForceTargetNumVectorRegs(
277 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
278 cl::desc("A flag that overrides the target's number of vector registers."));
279
280static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
281 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
282 cl::desc("A flag that overrides the target's max interleave factor for "
283 "scalar loops."));
284
285static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
286 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
287 cl::desc("A flag that overrides the target's max interleave factor for "
288 "vectorized loops."));
289
290cl::opt<unsigned> llvm::ForceTargetInstructionCost(
291 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
292 cl::desc("A flag that overrides the target's expected cost for "
293 "an instruction to a single constant value. Mostly "
294 "useful for getting consistent testing."));
295
296static cl::opt<unsigned> SmallLoopCost(
297 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
298 cl::desc(
299 "The cost of a loop that is considered 'small' by the interleaver."));
300
301static cl::opt<bool> LoopVectorizeWithBlockFrequency(
302 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
303 cl::desc("Enable the use of the block frequency analysis to access PGO "
304 "heuristics minimizing code growth in cold regions and being more "
305 "aggressive in hot regions."));
306
307// Runtime interleave loops for load/store throughput.
308static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
309 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
310 cl::desc(
311 "Enable runtime interleaving until load/store ports are saturated"));
312
313/// The number of stores in a loop that are allowed to need predication.
314cl::opt<unsigned> NumberOfStoresToPredicate(
315 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
316 cl::desc("Max number of stores to be predicated behind an if."));
317
318// TODO: Move size-based thresholds out of legality checking, make cost based
319// decisions instead of hard thresholds.
320static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
321 "vectorize-scev-check-threshold", cl::init(Val: 16), cl::Hidden,
322 cl::desc("The maximum number of SCEV checks allowed."));
323
324static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
325 "pragma-vectorize-scev-check-threshold", cl::init(Val: 128), cl::Hidden,
326 cl::desc("The maximum number of SCEV checks allowed with a "
327 "vectorize(enable) pragma"));
328
329static cl::opt<bool> EnableIndVarRegisterHeur(
330 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
331 cl::desc("Count the induction variable only once when interleaving"));
332
333static cl::opt<unsigned> MaxNestedScalarReductionIC(
334 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
335 cl::desc("The maximum interleave count to use when interleaving a scalar "
336 "reduction in a nested loop."));
337
338static cl::opt<bool> ForceOrderedReductions(
339 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
340 cl::desc("Enable the vectorisation of loops with in-order (strict) "
341 "FP reductions"));
342
343static cl::opt<bool> PreferPredicatedReductionSelect(
344 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
345 cl::desc(
346 "Prefer predicating a reduction operation over an after loop select."));
347
348cl::opt<bool> llvm::EnableVPlanNativePath(
349 "enable-vplan-native-path", cl::Hidden,
350 cl::desc("Enable VPlan-native vectorization path with "
351 "support for outer loop vectorization."));
352
353cl::opt<bool>
354 llvm::VerifyEachVPlan("vplan-verify-each",
355#ifdef EXPENSIVE_CHECKS
356 cl::init(true),
357#else
358 cl::init(Val: false),
359#endif
360 cl::Hidden,
361 cl::desc("Verify VPlans after VPlan transforms."));
362
363#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
364cl::opt<bool> llvm::VPlanPrintBeforeAll(
365 "vplan-print-before-all", cl::init(false), cl::Hidden,
366 cl::desc("Print VPlans before all VPlan transformations."));
367
368cl::opt<bool> llvm::VPlanPrintAfterAll(
369 "vplan-print-after-all", cl::init(false), cl::Hidden,
370 cl::desc("Print VPlans after all VPlan transformations."));
371
372cl::list<std::string> llvm::VPlanPrintBeforePasses(
373 "vplan-print-before", cl::Hidden,
374 cl::desc("Print VPlans before specified VPlan transformations (regexp)."));
375
376cl::list<std::string> llvm::VPlanPrintAfterPasses(
377 "vplan-print-after", cl::Hidden,
378 cl::desc("Print VPlans after specified VPlan transformations (regexp)."));
379
380cl::opt<bool> llvm::VPlanPrintVectorRegionScope(
381 "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
382 cl::desc("Limit VPlan printing to vector loop region in "
383 "`-vplan-print-after*` if the plan has one."));
384#endif
385
386// This flag enables the stress testing of the VPlan H-CFG construction in the
387// VPlan-native vectorization path. It must be used in conjuction with
388// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
389// verification of the H-CFGs built.
390cl::opt<bool> VPlanBuildOuterloopStressTest(
391 "vplan-build-outerloop-stress-test", cl::init(Val: false), cl::Hidden,
392 cl::desc(
393 "Build VPlan for every supported loop nest in the function and bail "
394 "out right after the build (stress test the VPlan H-CFG construction "
395 "in the VPlan-native vectorization path)."));
396
397cl::opt<bool> llvm::EnableLoopInterleaving(
398 "interleave-loops", cl::init(Val: true), cl::Hidden,
399 cl::desc("Enable loop interleaving in Loop vectorization passes"));
400cl::opt<bool> llvm::EnableLoopVectorization(
401 "vectorize-loops", cl::init(Val: true), cl::Hidden,
402 cl::desc("Run the Loop vectorization passes"));
403
404static cl::opt<cl::boolOrDefault>
405 ForceMaskedDivRem("force-widen-divrem-via-masked-intrinsic", cl::Hidden,
406 cl::desc("Override cost based masked intrinsic widening "
407 "for div/rem instructions"));
408
409static cl::opt<bool> EnableEarlyExitVectorization(
410 "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
411 cl::desc(
412 "Enable vectorization of early exit loops with uncountable exits."));
413
414static cl::opt<bool> EnableEarlyExitVectorizationWithSideEffects(
415 "enable-early-exit-vectorization-with-side-effects", cl::init(Val: false),
416 cl::Hidden,
417 cl::desc("Enable vectorization of early exit loops with uncountable exits "
418 "and side effects"));
419
420// Likelyhood of bypassing the vectorized loop because there are zero trips left
421// after prolog. See `emitIterationCountCheck`.
422static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
423
424/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
425/// ElementCount to include loops whose trip count is a function of vscale.
426static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
427 const Loop *L) {
428 if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
429 return ElementCount::getFixed(MinVal: ExpectedTC);
430
431 const SCEV *BTC = SE->getBackedgeTakenCount(L);
432 if (isa<SCEVCouldNotCompute>(Val: BTC))
433 return ElementCount::getFixed(MinVal: 0);
434
435 const SCEV *ExitCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
436 if (isa<SCEVVScale>(Val: ExitCount))
437 return ElementCount::getScalable(MinVal: 1);
438
439 const APInt *Scale;
440 if (match(S: ExitCount, P: m_scev_Mul(Op0: m_scev_APInt(C&: Scale), Op1: m_SCEVVScale())))
441 if (cast<SCEVMulExpr>(Val: ExitCount)->hasNoUnsignedWrap())
442 if (Scale->getActiveBits() <= 32)
443 return ElementCount::getScalable(MinVal: Scale->getZExtValue());
444
445 return ElementCount::getFixed(MinVal: 0);
446}
447
448/// Get the maximum trip count for \p L from the SCEV unsigned range, excluding
449/// zero from the range. Only valid when not folding the tail, as the minimum
450/// iteration count check guards against a zero trip count. Returns 0 if
451/// unknown.
452static unsigned getMaxTCFromNonZeroRange(PredicatedScalarEvolution &PSE,
453 Loop *L) {
454 const SCEV *BTC = PSE.getBackedgeTakenCount();
455 if (isa<SCEVCouldNotCompute>(Val: BTC))
456 return 0;
457 ScalarEvolution *SE = PSE.getSE();
458 const SCEV *TripCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
459 ConstantRange TCRange = SE->getUnsignedRange(S: TripCount);
460 APInt MaxTCFromRange = TCRange.getUnsignedMax();
461 if (!MaxTCFromRange.isZero() && MaxTCFromRange.getActiveBits() <= 32)
462 return MaxTCFromRange.getZExtValue();
463 return 0;
464}
465
466/// Returns "best known" trip count, which is either a valid positive trip count
467/// or std::nullopt when an estimate cannot be made (including when the trip
468/// count would overflow), for the specified loop \p L as defined by the
469/// following procedure:
470/// 1) Returns exact trip count if it is known.
471/// 2) Returns expected trip count according to profile data if any.
472/// 3) Returns upper bound estimate if known, if \p CanUseConstantMax, and
473/// if \p ComputeUpperBoundOnly is false.
474/// 4) Returns the maximum trip count from the SCEV range excluding zero,
475/// if \p CanUseConstantMax and \p CanExcludeZeroTrips.
476/// 5) Returns std::nullopt if all of the above failed.
477static std::optional<ElementCount> getSmallBestKnownTC(
478 PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax = true,
479 bool CanExcludeZeroTrips = false, bool ComputeUpperBoundOnly = false) {
480 // Check if exact trip count is known.
481 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
482 return ExpectedTC;
483
484 // Check if there is an expected trip count available from profile data.
485 if (LoopVectorizeWithBlockFrequency && !ComputeUpperBoundOnly)
486 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
487 return ElementCount::getFixed(MinVal: *EstimatedTC);
488
489 if (!CanUseConstantMax)
490 return std::nullopt;
491
492 // Check if upper bound estimate is known.
493 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
494 return ElementCount::getFixed(MinVal: ExpectedTC);
495
496 // Get the maximum trip count from the SCEV range excluding zero. This is
497 // only safe when not folding the tail, as the minimum iteration count check
498 // prevents entering the vector loop with a zero trip count.
499 if (CanUseConstantMax && CanExcludeZeroTrips)
500 if (unsigned RefinedTC = getMaxTCFromNonZeroRange(PSE, L))
501 return ElementCount::getFixed(MinVal: RefinedTC);
502
503 return std::nullopt;
504}
505
506namespace {
507// Forward declare GeneratedRTChecks.
508class GeneratedRTChecks;
509
510using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
511} // namespace
512
513namespace llvm {
514
515AnalysisKey ShouldRunExtraVectorPasses::Key;
516
517/// InnerLoopVectorizer vectorizes loops which contain only one basic
518/// block to a specified vectorization factor (VF).
519/// This class performs the widening of scalars into vectors, or multiple
520/// scalars. This class also implements the following features:
521/// * It inserts an epilogue loop for handling loops that don't have iteration
522/// counts that are known to be a multiple of the vectorization factor.
523/// * It handles the code generation for reduction variables.
524/// * Scalarization (implementation using scalars) of un-vectorizable
525/// instructions.
526/// InnerLoopVectorizer does not perform any vectorization-legality
527/// checks, and relies on the caller to check for the different legality
528/// aspects. The InnerLoopVectorizer relies on the
529/// LoopVectorizationLegality class to provide information about the induction
530/// and reduction variables that were found to a given vectorization factor.
531class InnerLoopVectorizer {
532public:
533 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
534 LoopInfo *LI, DominatorTree *DT,
535 const TargetTransformInfo *TTI, AssumptionCache *AC,
536 ElementCount VecWidth, unsigned UnrollFactor,
537 LoopVectorizationCostModel *CM,
538 GeneratedRTChecks &RTChecks, VPlan &Plan)
539 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
540 VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
541 Cost(CM), RTChecks(RTChecks), Plan(Plan),
542 VectorPHVPBB(cast<VPBasicBlock>(
543 Val: Plan.getVectorLoopRegion()->getSinglePredecessor())) {}
544
545 virtual ~InnerLoopVectorizer() = default;
546
547 /// Creates a basic block for the scalar preheader. Both
548 /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
549 /// the method to create additional blocks and checks needed for epilogue
550 /// vectorization.
551 virtual BasicBlock *createVectorizedLoopSkeleton();
552
553 /// Fix the vectorized code, taking care of header phi's, and more.
554 void fixVectorizedLoop(VPTransformState &State);
555
556protected:
557 friend class LoopVectorizationPlanner;
558
559 /// Create and return a new IR basic block for the scalar preheader whose name
560 /// is prefixed with \p Prefix.
561 BasicBlock *createScalarPreheader(StringRef Prefix);
562
563 /// Allow subclasses to override and print debug traces before/after vplan
564 /// execution, when trace information is requested.
565 virtual void printDebugTracesAtStart() {}
566 virtual void printDebugTracesAtEnd() {}
567
568 /// The original loop.
569 Loop *OrigLoop;
570
571 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
572 /// dynamic knowledge to simplify SCEV expressions and converts them to a
573 /// more usable form.
574 PredicatedScalarEvolution &PSE;
575
576 /// Loop Info.
577 LoopInfo *LI;
578
579 /// Dominator Tree.
580 DominatorTree *DT;
581
582 /// Target Transform Info.
583 const TargetTransformInfo *TTI;
584
585 /// Assumption Cache.
586 AssumptionCache *AC;
587
588 /// The vectorization SIMD factor to use. Each vector will have this many
589 /// vector elements.
590 ElementCount VF;
591
592 /// The vectorization unroll factor to use. Each scalar is vectorized to this
593 /// many different vector instructions.
594 unsigned UF;
595
596 /// The builder that we use
597 IRBuilder<> Builder;
598
599 // --- Vectorization state ---
600
601 /// The profitablity analysis.
602 LoopVectorizationCostModel *Cost;
603
604 /// Structure to hold information about generated runtime checks, responsible
605 /// for cleaning the checks, if vectorization turns out unprofitable.
606 GeneratedRTChecks &RTChecks;
607
608 VPlan &Plan;
609
610 /// The vector preheader block of \p Plan, used as target for check blocks
611 /// introduced during skeleton creation.
612 VPBasicBlock *VectorPHVPBB;
613};
614
615/// Encapsulate information regarding vectorization of a loop and its epilogue.
616/// This information is meant to be updated and used across two stages of
617/// epilogue vectorization.
618struct EpilogueLoopVectorizationInfo {
619 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
620 unsigned MainLoopUF = 0;
621 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
622 unsigned EpilogueUF = 0;
623 BasicBlock *MainLoopIterationCountCheck = nullptr;
624 BasicBlock *EpilogueIterationCountCheck = nullptr;
625 Value *VectorTripCount = nullptr;
626 VPlan &EpiloguePlan;
627
628 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
629 ElementCount EVF, unsigned EUF,
630 VPlan &EpiloguePlan)
631 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
632 EpiloguePlan(EpiloguePlan) {
633 assert(EUF == 1 &&
634 "A high UF for the epilogue loop is likely not beneficial.");
635 }
636};
637
638/// An extension of the inner loop vectorizer that creates a skeleton for a
639/// vectorized loop that has its epilogue (residual) also vectorized.
640/// The idea is to run the vplan on a given loop twice, firstly to setup the
641/// skeleton and vectorize the main loop, and secondly to complete the skeleton
642/// from the first step and vectorize the epilogue. This is achieved by
643/// deriving two concrete strategy classes from this base class and invoking
644/// them in succession from the loop vectorizer planner.
645class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
646public:
647 InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
648 LoopInfo *LI, DominatorTree *DT,
649 const TargetTransformInfo *TTI,
650 AssumptionCache *AC,
651 EpilogueLoopVectorizationInfo &EPI,
652 LoopVectorizationCostModel *CM,
653 GeneratedRTChecks &Checks, VPlan &Plan,
654 ElementCount VecWidth, unsigned UnrollFactor)
655 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
656 UnrollFactor, CM, Checks, Plan),
657 EPI(EPI) {}
658
659 /// Holds and updates state information required to vectorize the main loop
660 /// and its epilogue in two separate passes. This setup helps us avoid
661 /// regenerating and recomputing runtime safety checks. It also helps us to
662 /// shorten the iteration-count-check path length for the cases where the
663 /// iteration count of the loop is so small that the main vector loop is
664 /// completely skipped.
665 EpilogueLoopVectorizationInfo &EPI;
666};
667
668/// A specialized derived class of inner loop vectorizer that performs
669/// vectorization of *main* loops in the process of vectorizing loops and their
670/// epilogues.
671class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
672public:
673 EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
674 LoopInfo *LI, DominatorTree *DT,
675 const TargetTransformInfo *TTI,
676 AssumptionCache *AC,
677 EpilogueLoopVectorizationInfo &EPI,
678 LoopVectorizationCostModel *CM,
679 GeneratedRTChecks &Check, VPlan &Plan)
680 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
681 Check, Plan, EPI.MainLoopVF,
682 EPI.MainLoopUF) {}
683
684protected:
685 void printDebugTracesAtStart() override;
686 void printDebugTracesAtEnd() override;
687};
688
689// A specialized derived class of inner loop vectorizer that performs
690// vectorization of *epilogue* loops in the process of vectorizing loops and
691// their epilogues.
692class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
693public:
694 EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
695 LoopInfo *LI, DominatorTree *DT,
696 const TargetTransformInfo *TTI,
697 AssumptionCache *AC,
698 EpilogueLoopVectorizationInfo &EPI,
699 LoopVectorizationCostModel *CM,
700 GeneratedRTChecks &Checks, VPlan &Plan)
701 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
702 Checks, Plan, EPI.EpilogueVF,
703 EPI.EpilogueUF) {}
704 /// Implements the interface for creating a vectorized skeleton using the
705 /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
706 BasicBlock *createVectorizedLoopSkeleton() final;
707
708protected:
709 void printDebugTracesAtStart() override;
710 void printDebugTracesAtEnd() override;
711};
712} // end namespace llvm
713
714/// Look for a meaningful debug location on the instruction or its operands.
715static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
716 if (!I)
717 return DebugLoc::getUnknown();
718
719 DebugLoc Empty;
720 if (I->getDebugLoc() != Empty)
721 return I->getDebugLoc();
722
723 for (Use &Op : I->operands()) {
724 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
725 if (OpInst->getDebugLoc() != Empty)
726 return OpInst->getDebugLoc();
727 }
728
729 return I->getDebugLoc();
730}
731
732namespace llvm {
733
734/// Return the runtime value for VF.
735Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
736 return B.CreateElementCount(Ty, EC: VF);
737}
738
739} // end namespace llvm
740
741namespace llvm {
742
/// Hints from the loop vectorization cost model on how the epilogue/tail loop
/// should be lowered.
enum EpilogueLowering {
  /// The default: allowing epilogues.
  CM_EpilogueAllowed,

  /// Vectorization with OptForSize: don't allow epilogues.
  CM_EpilogueNotAllowedOptSize,

  /// A special case of vectorization with OptForSize: loops with a very small
  /// trip count are considered for vectorization under OptForSize, thereby
  /// making sure the cost of their loop body is dominant, free of runtime
  /// guards and scalar iteration overheads.
  CM_EpilogueNotAllowedLowTripLoop,

  /// Loop hint indicating an epilogue is undesired, apply tail folding.
  CM_EpilogueNotNeededFoldTail,

  /// Directive indicating we must either fold the epilogue/tail or not
  /// vectorize.
  CM_EpilogueNotAllowedFoldTail
};
765
/// Tri-state describing whether a decision about alias masking has been taken
/// and, if so, which one.
enum class AliasMaskingStatus {
  NotDecided,
  Disabled,
  Enabled
};
767
768/// LoopVectorizationCostModel - estimates the expected speedups due to
769/// vectorization.
770/// In many cases vectorization is not profitable. This can happen because of
771/// a number of reasons. In this class we mainly attempt to predict the
772/// expected speedup/slowdowns due to the supported instruction set. We use the
773/// TargetTransformInfo to query the different backends for the cost of
774/// different operations.
775class LoopVectorizationCostModel {
776 friend class LoopVectorizationPlanner;
777
778public:
779 LoopVectorizationCostModel(EpilogueLowering SEL, Loop *L,
780 PredicatedScalarEvolution &PSE, LoopInfo *LI,
781 LoopVectorizationLegality *Legal,
782 const TargetTransformInfo &TTI,
783 const TargetLibraryInfo *TLI, AssumptionCache *AC,
784 OptimizationRemarkEmitter *ORE,
785 std::function<BlockFrequencyInfo &()> GetBFI,
786 const Function *F, const LoopVectorizeHints *Hints,
787 InterleavedAccessInfo &IAI,
788 VFSelectionContext &Config)
789 : Config(Config), EpilogueLoweringStatus(SEL), TheLoop(L), PSE(PSE),
790 LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), AC(AC), ORE(ORE),
791 GetBFI(GetBFI), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
792
793 /// \return An upper bound for the vectorization factors (both fixed and
794 /// scalable). If the factors are 0, vectorization and interleaving should be
795 /// avoided up front.
796 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
797
798 /// Memory access instruction may be vectorized in more than one way.
799 /// Form of instruction after vectorization depends on cost.
800 /// This function takes cost-based decisions for Load/Store instructions
801 /// and collects them in a map. This decisions map is used for building
802 /// the lists of loop-uniform and loop-scalar instructions.
803 /// The calculated cost is saved with widening decision in order to
804 /// avoid redundant calculations.
805 void setCostBasedWideningDecision(ElementCount VF);
806
807 /// Collect values we want to ignore in the cost model.
808 void collectValuesToIgnore();
809
810 /// \returns True if it is more profitable to scalarize instruction \p I for
811 /// vectorization factor \p VF.
812 bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
813 assert(VF.isVector() &&
814 "Profitable to scalarize relevant only for VF > 1.");
815 assert(
816 TheLoop->isInnermost() &&
817 "cost-model should not be used for outer loops (in VPlan-native path)");
818
819 auto Scalars = InstsToScalarize.find(Key: VF);
820 assert(Scalars != InstsToScalarize.end() &&
821 "VF not yet analyzed for scalarization profitability");
822 return Scalars->second.contains(Key: I);
823 }
824
825 /// Returns true if \p I is known to be uniform after vectorization.
826 bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
827 assert(
828 TheLoop->isInnermost() &&
829 "cost-model should not be used for outer loops (in VPlan-native path)");
830
831 // If VF is scalar, then all instructions are trivially uniform.
832 if (VF.isScalar())
833 return true;
834
835 // Pseudo probes must be duplicated per vector lane so that the
836 // profiled loop trip count is not undercounted.
837 if (isa<PseudoProbeInst>(Val: I))
838 return false;
839
840 auto UniformsPerVF = Uniforms.find(Val: VF);
841 assert(UniformsPerVF != Uniforms.end() &&
842 "VF not yet analyzed for uniformity");
843 return UniformsPerVF->second.count(Ptr: I);
844 }
845
846 /// Returns true if \p I is known to be scalar after vectorization.
847 bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
848 assert(
849 TheLoop->isInnermost() &&
850 "cost-model should not be used for outer loops (in VPlan-native path)");
851 if (VF.isScalar())
852 return true;
853
854 auto ScalarsPerVF = Scalars.find(Val: VF);
855 assert(ScalarsPerVF != Scalars.end() &&
856 "Scalar values are not calculated for VF");
857 return ScalarsPerVF->second.count(Ptr: I);
858 }
859
860 /// \returns True if instruction \p I can be truncated to a smaller bitwidth
861 /// for vectorization factor \p VF.
862 bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
863 const auto &MinBWs = Config.getMinimalBitwidths();
864 // Truncs must truncate at most to their destination type.
865 if (isa_and_nonnull<TruncInst>(Val: I) && MinBWs.contains(Key: I) &&
866 I->getType()->getScalarSizeInBits() < MinBWs.lookup(Key: I))
867 return false;
868 return VF.isVector() && MinBWs.contains(Key: I) &&
869 !isProfitableToScalarize(I, VF) &&
870 !isScalarAfterVectorization(I, VF);
871 }
872
873 /// Decision that was taken during cost calculation for memory instruction.
874 enum InstWidening {
875 CM_Unknown,
876 CM_Widen, // For consecutive accesses with stride +1.
877 CM_Widen_Reverse, // For consecutive accesses with stride -1.
878 CM_Interleave,
879 CM_GatherScatter,
880 CM_Scalarize,
881 /// A widening decision that has been invalidated after replacing the
882 /// corresponding recipe during VPlan transforms.
883 /// TODO: Remove once the legacy exit cost computation is retired.
884 CM_InvalidatedDecision
885 };
886
887 /// Save vectorization decision \p W and \p Cost taken by the cost model for
888 /// instruction \p I and vector width \p VF.
889 void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
890 InstructionCost Cost) {
891 assert(VF.isVector() && "Expected VF >=2");
892 WideningDecisions[{I, VF}] = {W, Cost};
893 }
894
895 /// Save vectorization decision \p W and \p Cost taken by the cost model for
896 /// interleaving group \p Grp and vector width \p VF.
897 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
898 ElementCount VF, InstWidening W,
899 InstructionCost Cost) {
900 assert(VF.isVector() && "Expected VF >=2");
901 /// Broadcast this decicion to all instructions inside the group.
902 /// When interleaving, the cost will only be assigned one instruction, the
903 /// insert position. For other cases, add the appropriate fraction of the
904 /// total cost to each instruction. This ensures accurate costs are used,
905 /// even if the insert position instruction is not used.
906 InstructionCost InsertPosCost = Cost;
907 InstructionCost OtherMemberCost = 0;
908 if (W != CM_Interleave)
909 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
910 ;
911 for (auto *I : Grp->members()) {
912 if (Grp->getInsertPos() == I)
913 WideningDecisions[{I, VF}] = {W, InsertPosCost};
914 else
915 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
916 }
917 }
918
919 /// Return the cost model decision for the given instruction \p I and vector
920 /// width \p VF. Return CM_Unknown if this instruction did not pass
921 /// through the cost modeling.
922 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
923 assert(VF.isVector() && "Expected VF to be a vector VF");
924 assert(
925 TheLoop->isInnermost() &&
926 "cost-model should not be used for outer loops (in VPlan-native path)");
927
928 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
929 auto Itr = WideningDecisions.find(Val: InstOnVF);
930 if (Itr == WideningDecisions.end())
931 return CM_Unknown;
932 return Itr->second.first;
933 }
934
935 /// Return the vectorization cost for the given instruction \p I and vector
936 /// width \p VF.
937 InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
938 assert(VF.isVector() && "Expected VF >=2");
939 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
940 assert(WideningDecisions.contains(InstOnVF) &&
941 "The cost is not calculated");
942 return WideningDecisions[InstOnVF].second;
943 }
944
945 /// Return True if instruction \p I is an optimizable truncate whose operand
946 /// is an induction variable. Such a truncate will be removed by adding a new
947 /// induction variable with the destination type.
948 bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
949 // If the instruction is not a truncate, return false.
950 auto *Trunc = dyn_cast<TruncInst>(Val: I);
951 if (!Trunc)
952 return false;
953
954 // Get the source and destination types of the truncate.
955 Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
956 Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);
957
958 // If the truncate is free for the given types, return false. Replacing a
959 // free truncate with an induction variable would add an induction variable
960 // update instruction to each iteration of the loop. We exclude from this
961 // check the primary induction variable since it will need an update
962 // instruction regardless.
963 Value *Op = Trunc->getOperand(i_nocapture: 0);
964 if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
965 return false;
966
967 // If the truncated value is not an induction variable, return false.
968 return Legal->isInductionPhi(V: Op);
969 }
970
971 /// Collects the instructions to scalarize for each predicated instruction in
972 /// the loop.
973 void collectInstsToScalarize(ElementCount VF);
974
975 /// Collect values that will not be widened, including Uniforms, Scalars, and
976 /// Instructions to Scalarize for the given \p VF.
977 /// The sets depend on CM decision for Load/Store instructions
978 /// that may be vectorized as interleave, gather-scatter or scalarized.
979 /// Also make a decision on what to do about call instructions in the loop
980 /// at that VF -- scalarize, call a known vector routine, or call a
981 /// vector intrinsic.
982 void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
983 // Do the analysis once.
984 if (VF.isScalar() || Uniforms.contains(Val: VF))
985 return;
986 setCostBasedWideningDecision(VF);
987 collectLoopUniforms(VF);
988 collectLoopScalars(VF);
989 collectInstsToScalarize(VF);
990 }
991
992 /// Given costs for both strategies, return true if the scalar predication
993 /// lowering should be used for div/rem. This incorporates an override
994 /// option so it is not simply a cost comparison.
995 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
996 InstructionCost MaskedCost) const {
997 switch (ForceMaskedDivRem) {
998 case cl::boolOrDefault::BOU_UNSET:
999 return ScalarCost < MaskedCost;
1000 case cl::boolOrDefault::BOU_TRUE:
1001 return false;
1002 case cl::boolOrDefault::BOU_FALSE:
1003 return true;
1004 }
1005 llvm_unreachable("impossible case value");
1006 }
1007
1008 /// Returns true if \p I is an instruction which requires predication and
1009 /// for which our chosen predication strategy is scalarization (i.e. we
1010 /// don't have an alternate strategy such as masking available).
1011 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1012 bool isScalarWithPredication(Instruction *I, ElementCount VF);
1013
  /// Wrapper function for LoopVectorizationLegality::isMaskRequired that
  /// passes the instruction \p I and whether the tail is folded.
1016 bool isMaskRequired(Instruction *I) const;
1017
1018 /// Returns true if \p I is an instruction that needs to be predicated
1019 /// at runtime. The result is independent of the predication mechanism.
1020 /// Superset of instructions that return true for isScalarWithPredication.
1021 bool isPredicatedInst(Instruction *I) const;
1022
1023 /// A helper function that returns how much we should divide the cost of a
1024 /// predicated block by. Typically this is the reciprocal of the block
1025 /// probability, i.e. if we return X we are assuming the predicated block will
1026 /// execute once for every X iterations of the loop header so the block should
1027 /// only contribute 1/X of its cost to the total cost calculation, but when
1028 /// optimizing for code size it will just be 1 as code size costs don't depend
1029 /// on execution probabilities.
1030 ///
1031 /// Note that if a block wasn't originally predicated but was predicated due
1032 /// to tail folding, the divisor will still be 1 because it will execute for
1033 /// every iteration of the loop header.
1034 inline uint64_t
1035 getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1036 const BasicBlock *BB);
1037
1038 /// Returns true if an artificially high cost for emulated masked memrefs
1039 /// should be used.
1040 bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1041
1042 /// Return the costs for our two available strategies for lowering a
1043 /// div/rem operation which requires speculating at least one lane.
1044 /// First result is for scalarization (will be invalid for scalable
1045 /// vectors); second is for the masked intrinsic strategy.
1046 std::pair<InstructionCost, InstructionCost>
1047 getDivRemSpeculationCost(Instruction *I, ElementCount VF);
1048
1049 /// Returns true if \p I is a memory instruction with consecutive memory
1050 /// access that can be widened.
1051 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1052
1053 /// Returns true if \p I is a memory instruction in an interleaved-group
1054 /// of memory accesses that can be vectorized with wide vector loads/stores
1055 /// and shuffles.
1056 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1057
1058 /// Check if \p Instr belongs to any interleaved access group.
1059 bool isAccessInterleaved(Instruction *Instr) const {
1060 return InterleaveInfo.isInterleaved(Instr);
1061 }
1062
1063 /// Get the interleaved access group that \p Instr belongs to.
1064 const InterleaveGroup<Instruction> *
1065 getInterleavedAccessGroup(Instruction *Instr) const {
1066 return InterleaveInfo.getInterleaveGroup(Instr);
1067 }
1068
1069 /// Returns true if we're required to use a scalar epilogue for at least
1070 /// the final iteration of the original loop.
1071 bool requiresScalarEpilogue(bool IsVectorizing) const {
1072 if (!isEpilogueAllowed()) {
1073 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1074 return false;
1075 }
1076 // If we might exit from anywhere but the latch and early exit vectorization
1077 // is disabled, we must run the exiting iteration in scalar form.
1078 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
1079 !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
1080 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
1081 "from latch block\n");
1082 return true;
1083 }
1084 if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1085 LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1086 "interleaved group requires scalar epilogue\n");
1087 return true;
1088 }
1089 LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1090 return false;
1091 }
1092
1093 /// Returns true if an epilogue is allowed (e.g., not prevented by
1094 /// optsize or a loop hint annotation).
1095 bool isEpilogueAllowed() const {
1096 return EpilogueLoweringStatus == CM_EpilogueAllowed;
1097 }
1098
1099 /// Returns true if tail-folding is preferred over an epilogue.
1100 bool preferTailFoldedLoop() const {
1101 return EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail ||
1102 EpilogueLoweringStatus == CM_EpilogueNotAllowedFoldTail;
1103 }
1104
1105 /// Returns the TailFoldingStyle that is best for the current loop.
1106 TailFoldingStyle getTailFoldingStyle() const {
1107 return ChosenTailFoldingStyle;
1108 }
1109
1110 /// Selects and saves TailFoldingStyle.
1111 /// \param IsScalableVF true if scalable vector factors enabled.
1112 /// \param UserIC User specific interleave count.
1113 void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC) {
1114 assert(ChosenTailFoldingStyle == TailFoldingStyle::None &&
1115 "Tail folding must not be selected yet.");
1116 if (!Legal->canFoldTailByMasking()) {
1117 ChosenTailFoldingStyle = TailFoldingStyle::None;
1118 return;
1119 }
1120
1121 // Default to TTI preference, but allow command line override.
1122 ChosenTailFoldingStyle = TTI.getPreferredTailFoldingStyle();
1123 if (ForceTailFoldingStyle.getNumOccurrences())
1124 ChosenTailFoldingStyle = ForceTailFoldingStyle.getValue();
1125
1126 if (ChosenTailFoldingStyle != TailFoldingStyle::DataWithEVL)
1127 return;
1128 // Override EVL styles if needed.
1129 // FIXME: Investigate opportunity for fixed vector factor.
1130 bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
1131 TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
1132 if (EVLIsLegal)
1133 return;
1134 // If for some reason EVL mode is unsupported, fallback to an epilogue
1135 // if it's allowed, or DataWithoutLaneMask otherwise.
1136 if (EpilogueLoweringStatus == CM_EpilogueAllowed ||
1137 EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail)
1138 ChosenTailFoldingStyle = TailFoldingStyle::None;
1139 else
1140 ChosenTailFoldingStyle = TailFoldingStyle::DataWithoutLaneMask;
1141
1142 LLVM_DEBUG(
1143 dbgs() << "LV: Preference for VP intrinsics indicated. Will "
1144 "not try to generate VP Intrinsics "
1145 << (UserIC > 1
1146 ? "since interleave count specified is greater than 1.\n"
1147 : "due to non-interleaving reasons.\n"));
1148 }
1149
1150 /// Returns true if all loop blocks should be masked to fold tail loop.
1151 bool foldTailByMasking() const {
1152 return getTailFoldingStyle() != TailFoldingStyle::None;
1153 }
1154
1155 void tryToEnablePartialAliasMasking() {
1156 assert(foldTailByMasking() && "Expected tail folding to be enabled!");
1157 assert(!foldTailWithEVL() &&
1158 "Did not expect to enable alias masking with EVL!");
1159 assert(PartialAliasMaskingStatus == AliasMaskingStatus::NotDecided);
1160
1161 // Assume we fail to enable alias masking (in case we early exit).
1162 PartialAliasMaskingStatus = AliasMaskingStatus::Disabled;
1163
1164 // Note: FixedOrderRecurrences are not supported yet as we cannot handle
1165 // the required `splice.right` with the alias-mask.
1166 if (!ForcePartialAliasingVectorization ||
1167 !Legal->getFixedOrderRecurrences().empty())
1168 return;
1169
1170 const RuntimePointerChecking *Checks = Legal->getRuntimePointerChecking();
1171 if (!Checks)
1172 return;
1173
1174 auto DiffChecks = Checks->getDiffChecks();
1175 if (!DiffChecks || DiffChecks->empty())
1176 return;
1177
1178 [[maybe_unused]] auto HasPointerArgs = [](CallBase *CB) {
1179 return any_of(Range: CB->args(), P: [](Value const *Arg) {
1180 return Arg->getType()->isPointerTy();
1181 });
1182 };
1183
1184 for (BasicBlock *BB : TheLoop->blocks()) {
1185 for (Instruction &I : *BB) {
1186 if (!isa<LoadInst, StoreInst>(Val: I)) {
1187 [[maybe_unused]] auto *Call = dyn_cast<CallInst>(Val: &I);
1188 assert(
1189 (!I.mayReadOrWriteMemory() || (Call && !HasPointerArgs(Call))) &&
1190 "Skipped unexpected memory access");
1191 continue;
1192 }
1193
1194 Type *ScalarTy = getLoadStoreType(I: &I);
1195 Value *Ptr = getLoadStorePointerOperand(V: &I);
1196
1197 // Currently, we can't handle alias masking in reverse. Reversing the
1198 // alias mask is not correct (or necessary). When combined with
1199 // tail-folding the active lane mask should only be reversed where the
1200 // alias-mask is true.
1201 if (Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr) == -1)
1202 return;
1203 }
1204 }
1205
1206 PartialAliasMaskingStatus = AliasMaskingStatus::Enabled;
1207 }
1208
1209 /// Returns true if all loop blocks should have partial aliases masked.
1210 bool maskPartialAliasing() const {
1211 return PartialAliasMaskingStatus == AliasMaskingStatus::Enabled;
1212 }
1213
1214 /// Returns true if the use of wide lane masks is requested and the loop is
1215 /// using tail-folding with a lane mask for control flow.
1216 bool useWideActiveLaneMask() const {
1217 if (!EnableWideActiveLaneMask)
1218 return false;
1219
1220 return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow;
1221 }
1222
1223 /// Returns true if the instructions in this block requires predication
1224 /// for any reason, e.g. because tail folding now requires a predicate
1225 /// or because the block in the original loop was predicated.
1226 bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1227 return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1228 }
1229
1230 /// Returns true if VP intrinsics with explicit vector length support should
1231 /// be generated in the tail folded loop.
1232 bool foldTailWithEVL() const {
1233 return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
1234 }
1235
1236 /// Returns true if the predicated reduction select should be used to set the
1237 /// incoming value for the reduction phi.
1238 bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const {
1239 // Force to use predicated reduction select since the EVL of the
1240 // second-to-last iteration might not be VF*UF.
1241 if (foldTailWithEVL())
1242 return true;
1243
1244 // Force a predicated select with alias-masking to avoid propagating poison
1245 // values to the header phi for lanes outside the alias-mask.
1246 if (maskPartialAliasing())
1247 return true;
1248
1249 // Note: For FindLast recurrences we prefer a predicated select to simplify
1250 // matching in handleFindLastReductions(), rather than handle multiple
1251 // cases.
1252 if (RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RecurrenceKind))
1253 return true;
1254
1255 return PreferPredicatedReductionSelect ||
1256 TTI.preferPredicatedReductionSelect();
1257 }
1258
1259 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1260 /// with factor VF. Return the cost of the instruction, including
1261 /// scalarization overhead if it's needed.
1262 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1263
1264 /// Estimate cost of a call instruction CI if it were vectorized with factor
1265 /// VF. Return the cost of the instruction, including scalarization overhead
1266 /// if it's needed.
1267 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1268
1269 /// Invalidates decisions already taken by the cost model.
1270 void invalidateCostModelingDecisions() {
1271 WideningDecisions.clear();
1272 Uniforms.clear();
1273 Scalars.clear();
1274 }
1275
1276 /// Returns the expected execution cost. The unit of the cost does
1277 /// not matter because we use the 'cost' units to compare different
1278 /// vector widths. The cost that is returned is *not* normalized by
1279 /// the factor width.
1280 InstructionCost expectedCost(ElementCount VF);
1281
1282 /// Returns true if epilogue vectorization is considered profitable, and
1283 /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is an additional scaling factor (presumably the interleave count)
  /// applied to VF before comparing to EpilogueVectorizationMinVF.
1287 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1288 const unsigned IC) const;
1289
1290 /// Returns the execution time cost of an instruction for a given vector
1291 /// width. Vector width of one means scalar.
1292 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1293
1294 /// Return the cost of instructions in an inloop reduction pattern, if I is
1295 /// part of that pattern.
1296 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1297 ElementCount VF,
1298 Type *VectorTy) const;
1299
1300 /// Returns true if \p Op should be considered invariant and if it is
1301 /// trivially hoistable.
1302 bool shouldConsiderInvariant(Value *Op);
1303
1304 /// Returns true if \p I has been forced to be scalarized at \p VF.
1305 bool isForcedScalar(Instruction *I, ElementCount VF) const {
1306 auto FS = ForcedScalars.find(Val: VF);
1307 return FS != ForcedScalars.end() && FS->second.contains(Ptr: I);
1308 }
1309
1310private:
1311 unsigned NumPredStores = 0;
1312
1313 /// VF selection state independent of cost-modeling decisions.
1314 VFSelectionContext &Config;
1315
1316 /// Wrapper around LoopVectorizationLegality::isUniform() that takes into
1317 /// account if alias-masking is enabled. We consider the VF to be unknown when
1318 /// alias masking.
1319 bool isUniform(Value *V, ElementCount VF) const {
1320 // With alias-masking our runtime VF is [2, VF] (and not necessarily a
1321 // power-of-two). Something that is uniform for VF may not be for the full
1322 // range.
1323 assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided &&
1324 "alias-mask status must be decided already");
1325 return Legal->isUniform(V, VF: PartialAliasMaskingStatus ==
1326 AliasMaskingStatus::Disabled
1327 ? std::optional(VF)
1328 : std::nullopt);
1329 }
1330
1331 /// Wrapper around LoopVectorizationLegality::isUniformMemOp() that takes into
1332 /// account if alias-masking is enabled. We consider the VF to be unknown when
1333 /// alias masking.
1334 bool isUniformMemOp(Instruction &I, ElementCount VF) const {
1335 assert(PartialAliasMaskingStatus != AliasMaskingStatus::NotDecided &&
1336 "alias-mask status must be decided already");
1337 return Legal->isUniformMemOp(I, VF: PartialAliasMaskingStatus ==
1338 AliasMaskingStatus::Disabled
1339 ? std::optional(VF)
1340 : std::nullopt);
1341 }
1342
1343 /// Calculate vectorization cost of memory instruction \p I.
1344 InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1345
1346 /// The cost computation for scalarized memory instruction.
1347 InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1348
1349 /// The cost computation for interleaving group of memory instructions.
1350 InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1351
1352 /// The cost computation for Gather/Scatter instruction.
1353 InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1354
1355 /// The cost computation for widening instruction \p I with consecutive
1356 /// memory access.
1357 InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1358
1359 /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1360 /// Load: scalar load + broadcast.
1361 /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1362 /// element)
1363 InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1364
1365 /// Estimate the overhead of scalarizing an instruction. This is a
1366 /// convenience wrapper for the type-based getScalarizationOverhead API.
1367 InstructionCost getScalarizationOverhead(Instruction *I,
1368 ElementCount VF) const;
1369
1370 /// A type representing the costs for instructions if they were to be
1371 /// scalarized rather than vectorized. The entries are Instruction-Cost
1372 /// pairs.
1373 using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;
1374
  /// Per-VF sets of the BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1377 DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1378 PredicatedBBsAfterVectorization;
1379
1380 /// Records whether it is allowed to have the original scalar loop execute at
1381 /// least once. This may be needed as a fallback loop in case runtime
1382 /// aliasing/dependence checks fail, or to handle the tail/remainder
1383 /// iterations when the trip count is unknown or doesn't divide by the VF,
1384 /// or as a peel-loop to handle gaps in interleave-groups.
1385 /// Under optsize and when the trip count is very small we don't allow any
1386 /// iterations to execute in the scalar loop.
1387 EpilogueLowering EpilogueLoweringStatus = CM_EpilogueAllowed;
1388
1389 /// Control finally chosen tail folding style.
1390 TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;
1391
1392 /// If partial alias masking is enabled/disabled or not decided.
1393 AliasMaskingStatus PartialAliasMaskingStatus = AliasMaskingStatus::NotDecided;
1394
1395 /// A map holding scalar costs for different vectorization factors. The
1396 /// presence of a cost for an instruction in the mapping indicates that the
1397 /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostsTy pairs.
1399 MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;
1400
1401 /// Holds the instructions known to be uniform after vectorization.
1402 /// The data is collected per VF.
1403 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1404
1405 /// Holds the instructions known to be scalar after vectorization.
1406 /// The data is collected per VF.
1407 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1408
1409 /// Holds the instructions (address computations) that are forced to be
1410 /// scalarized.
1411 DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1412
1413 /// Returns the expected difference in cost from scalarizing the expression
1414 /// feeding a predicated instruction \p PredInst. The instructions to
1415 /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1416 /// non-negative return value implies the expression will be scalarized.
1417 /// Currently, only single-use chains are considered for scalarization.
1418 InstructionCost computePredInstDiscount(Instruction *PredInst,
1419 ScalarCostsTy &ScalarCosts,
1420 ElementCount VF);
1421
1422 /// Collect the instructions that are uniform after vectorization. An
1423 /// instruction is uniform if we represent it with a single scalar value in
1424 /// the vectorized loop corresponding to each vector iteration. Examples of
1425 /// uniform instructions include pointer operands of consecutive or
1426 /// interleaved memory accesses. Note that although uniformity implies an
1427 /// instruction will be scalar, the reverse is not true. In general, a
1428 /// scalarized instruction will be represented by VF scalar values in the
1429 /// vectorized loop, each corresponding to an iteration of the original
1430 /// scalar loop.
1431 void collectLoopUniforms(ElementCount VF);
1432
1433 /// Collect the instructions that are scalar after vectorization. An
1434 /// instruction is scalar if it is known to be uniform or will be scalarized
1435 /// during vectorization. collectLoopScalars should only add non-uniform nodes
1436 /// to the list if they are used by a load/store instruction that is marked as
1437 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1438 /// VF values in the vectorized loop, each corresponding to an iteration of
1439 /// the original scalar loop.
1440 void collectLoopScalars(ElementCount VF);
1441
1442 /// Keeps cost model vectorization decision and cost for instructions.
1443 /// Right now it is used for memory instructions only.
1444 using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1445 std::pair<InstWidening, InstructionCost>>;
1446
1447 DecisionList WideningDecisions;
1448
1449 /// Returns true if \p V is expected to be vectorized and it needs to be
1450 /// extracted.
1451 bool needsExtract(Value *V, ElementCount VF) const {
1452 Instruction *I = dyn_cast<Instruction>(Val: V);
1453 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1454 TheLoop->isLoopInvariant(V: I) ||
1455 getWideningDecision(I, VF) == CM_Scalarize)
1456 return false;
1457
1458 // Assume we can vectorize V (and hence we need extraction) if the
1459 // scalars are not computed yet. This can happen, because it is called
1460 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1461 // the scalars are collected. That should be a safe assumption in most
1462 // cases, because we check if the operands have vectorizable types
1463 // beforehand in LoopVectorizationLegality.
1464 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1465 };
1466
1467 /// Returns a range containing only operands needing to be extracted.
1468 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1469 ElementCount VF) const {
1470
1471 SmallPtrSet<const Value *, 4> UniqueOperands;
1472 SmallVector<Value *, 4> Res;
1473 for (Value *Op : Ops) {
1474 if (isa<Constant>(Val: Op) || !UniqueOperands.insert(Ptr: Op).second ||
1475 !needsExtract(V: Op, VF))
1476 continue;
1477 Res.push_back(Elt: Op);
1478 }
1479 return Res;
1480 }
1481
1482public:
1483 /// The loop that we evaluate.
1484 Loop *TheLoop;
1485
1486 /// Predicated scalar evolution analysis.
1487 PredicatedScalarEvolution &PSE;
1488
1489 /// Loop Info analysis.
1490 LoopInfo *LI;
1491
1492 /// Vectorization legality.
1493 LoopVectorizationLegality *Legal;
1494
1495 /// Vector target information.
1496 const TargetTransformInfo &TTI;
1497
1498 /// Target Library Info.
1499 const TargetLibraryInfo *TLI;
1500
1501 /// Assumption cache.
1502 AssumptionCache *AC;
1503
1504 /// Interface to emit optimization remarks.
1505 OptimizationRemarkEmitter *ORE;
1506
1507 /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
1508 /// unless necessary, e.g. when the loop isn't legal to vectorize or when
1509 /// there is no predication.
1510 std::function<BlockFrequencyInfo &()> GetBFI;
1511 /// The BlockFrequencyInfo returned from GetBFI.
1512 BlockFrequencyInfo *BFI = nullptr;
1513 /// Returns the BlockFrequencyInfo for the function if cached, otherwise
1514 /// fetches it via GetBFI. Avoids an indirect call to the std::function.
1515 BlockFrequencyInfo &getBFI() {
1516 if (!BFI)
1517 BFI = &GetBFI();
1518 return *BFI;
1519 }
1520
1521 const Function *TheFunction;
1522
1523 /// Loop Vectorize Hint.
1524 const LoopVectorizeHints *Hints;
1525
1526 /// The interleave access information contains groups of interleaved accesses
1527 /// with the same stride and close to each other.
1528 InterleavedAccessInfo &InterleaveInfo;
1529
1530 /// Values to ignore in the cost model.
1531 SmallPtrSet<const Value *, 16> ValuesToIgnore;
1532
1533 /// Values to ignore in the cost model when VF > 1.
1534 SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1535};
1536} // end namespace llvm
1537
1538namespace {
1539/// Helper struct to manage generating runtime checks for vectorization.
1540///
1541/// The runtime checks are created up-front in temporary blocks to allow better
1542/// estimating the cost and un-linked from the existing IR. After deciding to
1543/// vectorize, the checks are moved back. If deciding not to vectorize, the
1544/// temporary blocks are completely removed.
1545class GeneratedRTChecks {
1546 /// Basic block which contains the generated SCEV checks, if any.
1547 BasicBlock *SCEVCheckBlock = nullptr;
1548
1549 /// The value representing the result of the generated SCEV checks. If it is
1550 /// nullptr no SCEV checks have been generated.
1551 Value *SCEVCheckCond = nullptr;
1552
1553 /// Basic block which contains the generated memory runtime checks, if any.
1554 BasicBlock *MemCheckBlock = nullptr;
1555
1556 /// The value representing the result of the generated memory runtime checks.
1557 /// If it is nullptr no memory runtime checks have been generated.
1558 Value *MemRuntimeCheckCond = nullptr;
1559
1560 DominatorTree *DT;
1561 LoopInfo *LI;
1562 TargetTransformInfo *TTI;
1563
1564 SCEVExpander SCEVExp;
1565 SCEVExpander MemCheckExp;
1566
1567 bool CostTooHigh = false;
1568
1569 Loop *OuterLoop = nullptr;
1570
1571 PredicatedScalarEvolution &PSE;
1572
1573 /// The kind of cost that we are calculating
1574 TTI::TargetCostKind CostKind;
1575
1576 /// True if the loop is alias-masked (which allows us to omit diff checks).
1577 bool LoopUsesPartialAliasMasking = false;
1578
1579public:
1580 GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
1581 LoopInfo *LI, TargetTransformInfo *TTI,
1582 TTI::TargetCostKind CostKind,
1583 bool LoopUsesPartialAliasMasking)
1584 : DT(DT), LI(LI), TTI(TTI),
1585 SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1586 MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
1587 PSE(PSE), CostKind(CostKind),
1588 LoopUsesPartialAliasMasking(LoopUsesPartialAliasMasking) {}
1589
1590 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1591 /// accurately estimate the cost of the runtime checks. The blocks are
1592 /// un-linked from the IR and are added back during vector code generation. If
1593 /// there is no vector code generation, the check blocks are removed
1594 /// completely.
1595 void create(Loop *L, const LoopAccessInfo &LAI,
1596 const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
1597 OptimizationRemarkEmitter &ORE) {
1598
1599 // Hard cutoff to limit compile-time increase in case a very large number of
1600 // runtime checks needs to be generated.
1601 // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1602 // profile info.
1603 CostTooHigh =
1604 LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1605 if (CostTooHigh) {
1606 // Mark runtime checks as never succeeding when they exceed the threshold.
1607 MemRuntimeCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
1608 SCEVCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
1609 ORE.emit(RemarkBuilder: [&]() {
1610 return OptimizationRemarkAnalysisAliasing(
1611 DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
1612 L->getHeader())
1613 << "loop not vectorized: too many memory checks needed";
1614 });
1615 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
1616 return;
1617 }
1618
1619 BasicBlock *LoopHeader = L->getHeader();
1620 BasicBlock *Preheader = L->getLoopPreheader();
1621
1622 // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1623 // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1624 // may be used by SCEVExpander. The blocks will be un-linked from their
1625 // predecessors and removed from LI & DT at the end of the function.
1626 if (!UnionPred.isAlwaysTrue()) {
1627 SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
1628 MSSAU: nullptr, BBName: "vector.scevcheck");
1629
1630 SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1631 Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
1632 if (isa<Constant>(Val: SCEVCheckCond)) {
1633 // Clean up directly after expanding the predicate to a constant, to
1634 // avoid further expansions re-using anything left over from SCEVExp.
1635 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1636 SCEVCleaner.cleanup();
1637 }
1638 }
1639
1640 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1641 // TODO: We need to estimate the cost of alias-masking in
1642 // GeneratedRTChecks::getCost(). We can't check the MemCheckBlock as the
1643 // alias-mask is generated later in VPlan.
1644 if (RtPtrChecking.Need && !LoopUsesPartialAliasMasking) {
1645 auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1646 MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
1647 BBName: "vector.memcheck");
1648
1649 auto DiffChecks = RtPtrChecking.getDiffChecks();
1650 if (DiffChecks) {
1651 Value *RuntimeVF = nullptr;
1652 MemRuntimeCheckCond = addDiffRuntimeChecks(
1653 Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
1654 GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1655 if (!RuntimeVF)
1656 RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
1657 return RuntimeVF;
1658 },
1659 IC);
1660 } else {
1661 MemRuntimeCheckCond = addRuntimeChecks(
1662 Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
1663 Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
1664 }
1665 assert(MemRuntimeCheckCond &&
1666 "no RT checks generated although RtPtrChecking "
1667 "claimed checks are required");
1668 }
1669
1670 SCEVExp.eraseDeadInstructions(Root: SCEVCheckCond);
1671
1672 if (!MemCheckBlock && !SCEVCheckBlock)
1673 return;
1674
1675 // Unhook the temporary block with the checks, update various places
1676 // accordingly.
1677 if (SCEVCheckBlock)
1678 SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
1679 if (MemCheckBlock)
1680 MemCheckBlock->replaceAllUsesWith(V: Preheader);
1681
1682 if (SCEVCheckBlock) {
1683 SCEVCheckBlock->getTerminator()->moveBefore(
1684 InsertPos: Preheader->getTerminator()->getIterator());
1685 auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1686 UI->setDebugLoc(DebugLoc::getTemporary());
1687 Preheader->getTerminator()->eraseFromParent();
1688 }
1689 if (MemCheckBlock) {
1690 MemCheckBlock->getTerminator()->moveBefore(
1691 InsertPos: Preheader->getTerminator()->getIterator());
1692 auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1693 UI->setDebugLoc(DebugLoc::getTemporary());
1694 Preheader->getTerminator()->eraseFromParent();
1695 }
1696
1697 DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
1698 if (MemCheckBlock) {
1699 DT->eraseNode(BB: MemCheckBlock);
1700 LI->removeBlock(BB: MemCheckBlock);
1701 }
1702 if (SCEVCheckBlock) {
1703 DT->eraseNode(BB: SCEVCheckBlock);
1704 LI->removeBlock(BB: SCEVCheckBlock);
1705 }
1706
1707 // Outer loop is used as part of the later cost calculations.
1708 OuterLoop = L->getParentLoop();
1709 }
1710
1711 InstructionCost getCost() {
1712 if (SCEVCheckBlock || MemCheckBlock)
1713 LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1714
1715 if (CostTooHigh) {
1716 InstructionCost Cost;
1717 Cost.setInvalid();
1718 LLVM_DEBUG(dbgs() << " number of checks exceeded threshold\n");
1719 return Cost;
1720 }
1721
1722 InstructionCost RTCheckCost = 0;
1723 if (SCEVCheckBlock)
1724 for (Instruction &I : *SCEVCheckBlock) {
1725 if (SCEVCheckBlock->getTerminator() == &I)
1726 continue;
1727 InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
1728 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1729 RTCheckCost += C;
1730 }
1731 if (MemCheckBlock) {
1732 InstructionCost MemCheckCost = 0;
1733 for (Instruction &I : *MemCheckBlock) {
1734 if (MemCheckBlock->getTerminator() == &I)
1735 continue;
1736 InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
1737 LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
1738 MemCheckCost += C;
1739 }
1740
1741 // If the runtime memory checks are being created inside an outer loop
1742 // we should find out if these checks are outer loop invariant. If so,
1743 // the checks will likely be hoisted out and so the effective cost will
1744 // reduce according to the outer loop trip count.
1745 if (OuterLoop) {
1746 ScalarEvolution *SE = MemCheckExp.getSE();
1747 // TODO: If profitable, we could refine this further by analysing every
1748 // individual memory check, since there could be a mixture of loop
1749 // variant and invariant checks that mean the final condition is
1750 // variant.
1751 const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
1752 if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
1753 // It seems reasonable to assume that we can reduce the effective
1754 // cost of the checks even when we know nothing about the trip
1755 // count. Assume that the outer loop executes at least twice.
1756 unsigned BestTripCount = 2;
1757
1758 // Get the best known TC estimate.
1759 if (auto EstimatedTC = getSmallBestKnownTC(
1760 PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
1761 if (EstimatedTC->isFixed())
1762 BestTripCount = EstimatedTC->getFixedValue();
1763
1764 InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;
1765
1766 // Let's ensure the cost is always at least 1.
1767 NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
1768 b: (InstructionCost::CostType)1);
1769
1770 if (BestTripCount > 1)
1771 LLVM_DEBUG(dbgs()
1772 << "We expect runtime memory checks to be hoisted "
1773 << "out of the outer loop. Cost reduced from "
1774 << MemCheckCost << " to " << NewMemCheckCost << '\n');
1775
1776 MemCheckCost = NewMemCheckCost;
1777 }
1778 }
1779
1780 RTCheckCost += MemCheckCost;
1781 }
1782
1783 if (SCEVCheckBlock || MemCheckBlock)
1784 LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1785 << "\n");
1786
1787 return RTCheckCost;
1788 }
1789
1790 /// Remove the created SCEV & memory runtime check blocks & instructions, if
1791 /// unused.
1792 ~GeneratedRTChecks() {
1793 SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1794 SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1795 bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
1796 bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
1797 if (SCEVChecksUsed)
1798 SCEVCleaner.markResultUsed();
1799
1800 if (MemChecksUsed) {
1801 MemCheckCleaner.markResultUsed();
1802 } else {
1803 auto &SE = *MemCheckExp.getSE();
1804 // Memory runtime check generation creates compares that use expanded
1805 // values. Remove them before running the SCEVExpanderCleaners.
1806 for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
1807 if (MemCheckExp.isInsertedInstruction(I: &I))
1808 continue;
1809 SE.forgetValue(V: &I);
1810 I.eraseFromParent();
1811 }
1812 }
1813 MemCheckCleaner.cleanup();
1814 SCEVCleaner.cleanup();
1815
1816 if (!SCEVChecksUsed)
1817 SCEVCheckBlock->eraseFromParent();
1818 if (!MemChecksUsed)
1819 MemCheckBlock->eraseFromParent();
1820 }
1821
1822 /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
1823 /// outside VPlan.
1824 std::pair<Value *, BasicBlock *> getSCEVChecks() const {
1825 using namespace llvm::PatternMatch;
1826 if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
1827 return {nullptr, nullptr};
1828
1829 return {SCEVCheckCond, SCEVCheckBlock};
1830 }
1831
1832 /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
1833 /// outside VPlan.
1834 std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
1835 using namespace llvm::PatternMatch;
1836 if (MemRuntimeCheckCond && match(V: MemRuntimeCheckCond, P: m_ZeroInt()))
1837 return {nullptr, nullptr};
1838 return {MemRuntimeCheckCond, MemCheckBlock};
1839 }
1840
1841 /// Return true if any runtime checks have been added
1842 bool hasChecks() const {
1843 return getSCEVChecks().first || getMemRuntimeChecks().first;
1844 }
1845};
1846} // namespace
1847
1848static bool useActiveLaneMask(TailFoldingStyle Style) {
1849 return Style == TailFoldingStyle::Data ||
1850 Style == TailFoldingStyle::DataAndControlFlow;
1851}
1852
1853static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
1854 return Style == TailFoldingStyle::DataAndControlFlow;
1855}
1856
1857// Return true if \p OuterLp is an outer loop annotated with hints for explicit
1858// vectorization. The loop needs to be annotated with #pragma omp simd
1859// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1860// vector length information is not provided, vectorization is not considered
1861// explicit. Interleave hints are not allowed either. These limitations will be
1862// relaxed in the future.
1863// Please, note that we are currently forced to abuse the pragma 'clang
1864// vectorize' semantics. This pragma provides *auto-vectorization hints*
1865// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1866// provides *explicit vectorization hints* (LV can bypass legal checks and
1867// assume that vectorization is legal). However, both hints are implemented
1868// using the same metadata (llvm.loop.vectorize, processed by
1869// LoopVectorizeHints). This will be fixed in the future when the native IR
1870// representation for pragma 'omp simd' is introduced.
1871static bool isExplicitVecOuterLoop(Loop *OuterLp,
1872 OptimizationRemarkEmitter *ORE) {
1873 assert(!OuterLp->isInnermost() && "This is not an outer loop");
1874 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1875
1876 // Only outer loops with an explicit vectorization hint are supported.
1877 // Unannotated outer loops are ignored.
1878 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1879 return false;
1880
1881 Function *Fn = OuterLp->getHeader()->getParent();
1882 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
1883 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
1884 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1885 return false;
1886 }
1887
1888 if (Hints.getInterleave() > 1) {
1889 // TODO: Interleave support is future work.
1890 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1891 "outer loops.\n");
1892 Hints.emitRemarkWithHints();
1893 return false;
1894 }
1895
1896 return true;
1897}
1898
1899static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1900 OptimizationRemarkEmitter *ORE,
1901 SmallVectorImpl<Loop *> &V) {
1902 // Collect inner loops and outer loops without irreducible control flow. For
1903 // now, only collect outer loops that have explicit vectorization hints. If we
1904 // are stress testing the VPlan H-CFG construction, we collect the outermost
1905 // loop of every loop nest.
1906 if (L.isInnermost() || VPlanBuildOuterloopStressTest ||
1907 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
1908 LoopBlocksRPO RPOT(&L);
1909 RPOT.perform(LI);
1910 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
1911 V.push_back(Elt: &L);
1912 // TODO: Collect inner loops inside marked outer loops in case
1913 // vectorization fails for the outer loop. Do not invoke
1914 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1915 // already known to be reducible. We can use an inherited attribute for
1916 // that.
1917 return;
1918 }
1919 }
1920 for (Loop *InnerL : L)
1921 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
1922}
1923
1924//===----------------------------------------------------------------------===//
1925// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1926// LoopVectorizationCostModel and LoopVectorizationPlanner.
1927//===----------------------------------------------------------------------===//
1928
1929/// For the given VF and UF and maximum trip count computed for the loop, return
1930/// whether the induction variable might overflow in the vectorized loop. If not,
1931/// then we know a runtime overflow check always evaluates to false and can be
1932/// removed.
1933static bool isIndvarOverflowCheckKnownFalse(
1934 const LoopVectorizationCostModel *Cost,
1935 ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
1936 // Always be conservative if we don't know the exact unroll factor.
1937 unsigned MaxUF = UF ? *UF
1938 : std::max(a: Cost->TTI.getMaxInterleaveFactor(VF, HasUnorderedReductions: false),
1939 b: Cost->TTI.getMaxInterleaveFactor(VF, HasUnorderedReductions: true));
1940
1941 IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
1942 APInt MaxUIntTripCount = IdxTy->getMask();
1943
1944 // We know the runtime overflow check is known false iff the (max) trip-count
1945 // is known and (max) trip-count + (VF * UF) does not overflow in the type of
1946 // the vector loop induction variable.
1947 if (std::optional<ElementCount> TC = getSmallBestKnownTC(
1948 PSE&: Cost->PSE, L: Cost->TheLoop,
1949 /*CanUseConstantMax=*/true, /*CanExcludeZeroTrips=*/false,
1950 /*ComputeUpperBoundOnly=*/true)) {
1951 unsigned MaxVF = VF.getKnownMinValue();
1952 unsigned MaxTC = TC->getKnownMinValue();
1953 if (VF.isScalable() || TC->isScalable()) {
1954 std::optional<unsigned> MaxVScale =
1955 getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
1956 if (!MaxVScale)
1957 return false;
1958 if (VF.isScalable())
1959 MaxVF *= *MaxVScale;
1960 if (TC->isScalable()) {
1961 bool Overflow;
1962 MaxTC = SaturatingMultiply(X: MaxTC, Y: *MaxVScale, ResultOverflowed: &Overflow);
1963 if (Overflow)
1964 return false;
1965 }
1966 }
1967
1968 return (MaxUIntTripCount - MaxTC).ugt(RHS: MaxVF * MaxUF);
1969 }
1970
1971 return false;
1972}
1973
1974// Return whether we allow using masked interleave-groups (for dealing with
1975// strided loads/stores that reside in predicated blocks, or for dealing
1976// with gaps).
1977static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
1978 // If an override option has been passed in for interleaved accesses, use it.
1979 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
1980 return EnableMaskedInterleavedMemAccesses;
1981
1982 return TTI.enableMaskedInterleavedAccessVectorization();
1983}
1984
1985/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
1986/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
1987/// predecessors and successors of VPBB, if any, are rewired to the new
1988/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
1989static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
1990 BasicBlock *IRBB,
1991 VPlan *Plan = nullptr) {
1992 if (!Plan)
1993 Plan = VPBB->getPlan();
1994 VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
1995 auto IP = IRVPBB->begin();
1996 for (auto &R : make_early_inc_range(Range: VPBB->phis()))
1997 R.moveBefore(BB&: *IRVPBB, I: IP);
1998
1999 for (auto &R :
2000 make_early_inc_range(Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end())))
2001 R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());
2002
2003 VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
2004 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
2005 return IRVPBB;
2006}
2007
2008BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
2009 BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
2010 assert(VectorPH && "Invalid loop structure");
2011 assert((OrigLoop->getUniqueLatchExitBlock() ||
2012 Cost->requiresScalarEpilogue(VF.isVector())) &&
2013 "loops not exiting via the latch without required epilogue?");
2014
2015 // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
2016 // wrapping the newly created scalar preheader here at the moment, because the
2017 // Plan's scalar preheader may be unreachable at this point. Instead it is
2018 // replaced in executePlan.
2019 return SplitBlock(Old: VectorPH, SplitPt: VectorPH->getTerminator(), DT, LI, MSSAU: nullptr,
2020 BBName: Twine(Prefix) + "scalar.ph");
2021}
2022
2023/// Knowing that loop \p L executes a single vector iteration, add instructions
2024/// that will get simplified and thus should not have any cost to \p
2025/// InstsToIgnore.
2026static void addFullyUnrolledInstructionsToIgnore(
2027 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2028 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2029 auto *Cmp = L->getLatchCmpInst();
2030 if (Cmp)
2031 InstsToIgnore.insert(Ptr: Cmp);
2032 for (const auto &KV : IL) {
2033 // Extract the key by hand so that it can be used in the lambda below. Note
2034 // that captured structured bindings are a C++20 extension.
2035 const PHINode *IV = KV.first;
2036
2037 // Get next iteration value of the induction variable.
2038 Instruction *IVInst =
2039 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2040 if (all_of(Range: IVInst->users(),
2041 P: [&](const User *U) { return U == IV || U == Cmp; }))
2042 InstsToIgnore.insert(Ptr: IVInst);
2043 }
2044}
2045
2046BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2047 // Create a new IR basic block for the scalar preheader.
2048 BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
2049 return ScalarPH->getSinglePredecessor();
2050}
2051
2052namespace {
2053
2054struct CSEDenseMapInfo {
2055 static bool canHandle(const Instruction *I) {
2056 return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
2057 isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
2058 }
2059
2060 static unsigned getHashValue(const Instruction *I) {
2061 assert(canHandle(I) && "Unknown instruction!");
2062 return hash_combine(args: I->getOpcode(),
2063 args: hash_combine_range(R: I->operand_values()));
2064 }
2065
2066 static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
2067 return LHS->isIdenticalTo(I: RHS);
2068 }
2069};
2070
2071} // end anonymous namespace
2072
2073/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2074/// removal, in favor of the VPlan-based one.
2075static void legacyCSE(BasicBlock *BB) {
2076 // Perform simple cse.
2077 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2078 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2079 if (!CSEDenseMapInfo::canHandle(I: &In))
2080 continue;
2081
2082 // Check if we can replace this instruction with any of the
2083 // visited instructions.
2084 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2085 In.replaceAllUsesWith(V);
2086 In.eraseFromParent();
2087 continue;
2088 }
2089
2090 CSEMap[&In] = &In;
2091 }
2092}
2093
2094/// This function attempts to return a value that represents the ElementCount
2095/// at runtime. For fixed-width VFs we know this precisely at compile
2096/// time, but for scalable VFs we calculate it based on an estimate of the
2097/// vscale value.
2098static unsigned estimateElementCount(ElementCount VF,
2099 std::optional<unsigned> VScale) {
2100 unsigned EstimatedVF = VF.getKnownMinValue();
2101 if (VF.isScalable())
2102 if (VScale)
2103 EstimatedVF *= *VScale;
2104 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2105 return EstimatedVF;
2106}
2107
2108/// Returns the vector library variant function of \p CI usable at \p VF,
2109/// respecting \p MaskRequired, or nullptr if none is found: a mapping with
2110/// matching VF, masked if required, whose vector function is declared in the
2111/// module.
2112static Function *getVectorLibraryVariantFor(const CallInst &CI, ElementCount VF,
2113 bool MaskRequired,
2114 const TargetLibraryInfo *TLI) {
2115 if (!TLI || CI.isNoBuiltin())
2116 return nullptr;
2117 for (const VFInfo &Info : VFDatabase::getMappings(CI))
2118 if (Info.Shape.VF == VF && (!MaskRequired || Info.isMasked()))
2119 if (Function *F = CI.getModule()->getFunction(Name: Info.VectorName))
2120 return F;
2121 return nullptr;
2122}
2123
2124/// Returns true iff \p CI has a library vector variant usable at \p VF.
2125static bool hasVectorLibraryVariantFor(const CallInst &CI, ElementCount VF,
2126 bool MaskRequired,
2127 const TargetLibraryInfo *TLI) {
2128 return getVectorLibraryVariantFor(CI, VF, MaskRequired, TLI) != nullptr;
2129}
2130
2131InstructionCost
2132LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
2133 ElementCount VF) const {
2134 Type *RetTy = CI->getType();
2135 SmallVector<Type *, 4> Tys;
2136 for (auto &ArgOp : CI->args())
2137 Tys.push_back(Elt: ArgOp->getType());
2138
2139 InstructionCost ScalarCallCost = TTI.getCallInstrCost(
2140 F: CI->getCalledFunction(), RetTy, Tys, CostKind: Config.CostKind);
2141
2142 // Cost of the scalar call (scalar VF) or its scalarization (vector VF). The
2143 // scalarization cost is only meaningful for fixed VFs.
2144 InstructionCost Cost = VF.isScalable()
2145 ? InstructionCost::getInvalid()
2146 : ScalarCallCost * VF.getKnownMinValue() +
2147 getScalarizationOverhead(I: CI, VF);
2148
2149 // The call may be vectorized at this VF, via a vector intrinsic or a vector
2150 // library variant.
2151 if (getVectorIntrinsicIDForCall(CI, TLI))
2152 Cost = std::min(a: Cost, b: getVectorIntrinsicCost(CI, VF));
2153
2154 if (Function *Variant =
2155 getVectorLibraryVariantFor(CI: *CI, VF, MaskRequired: isMaskRequired(I: CI), TLI))
2156 Cost = std::min(a: Cost,
2157 b: TTI.getCallInstrCost(
2158 /*F=*/nullptr, RetTy: Variant->getReturnType(),
2159 Tys: Variant->getFunctionType()->params(), CostKind: Config.CostKind));
2160
2161 return Cost;
2162}
2163
2164static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2165 if (VF.isScalar() || !canVectorizeTy(Ty))
2166 return Ty;
2167 return toVectorizedTy(Ty, EC: VF);
2168}
2169
2170InstructionCost
2171LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2172 ElementCount VF) const {
2173 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2174 assert(ID && "Expected intrinsic call!");
2175 Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
2176 FastMathFlags FMF;
2177 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
2178 FMF = FPMO->getFastMathFlags();
2179
2180 SmallVector<const Value *> Arguments(CI->args());
2181 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2182 SmallVector<Type *> ParamTys;
2183 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
2184 result: std::back_inserter(x&: ParamTys),
2185 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2186
2187 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2188 dyn_cast<IntrinsicInst>(Val: CI),
2189 InstructionCost::getInvalid());
2190 return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind: Config.CostKind);
2191}
2192
2193void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2194 // Don't apply optimizations below when no (vector) loop remains, as they all
2195 // require one at the moment.
2196 VPBasicBlock *HeaderVPBB =
2197 vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
2198 if (!HeaderVPBB)
2199 return;
2200
2201 BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];
2202
2203 // Remove redundant induction instructions.
2204 legacyCSE(BB: HeaderBB);
2205}
2206
2207void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
2208 // We should not collect Scalars more than once per VF. Right now, this
2209 // function is called from collectUniformsAndScalars(), which already does
2210 // this check. Collecting Scalars for VF=1 does not make any sense.
2211 assert(VF.isVector() && !Scalars.contains(VF) &&
2212 "This function should not be visited twice for the same VF");
2213
2214 // This avoids any chances of creating a REPLICATE recipe during planning
2215 // since that would result in generation of scalarized code during execution,
2216 // which is not supported for scalable vectors.
2217 if (VF.isScalable()) {
2218 Scalars[VF].insert_range(R&: Uniforms[VF]);
2219 return;
2220 }
2221
2222 SmallSetVector<Instruction *, 8> Worklist;
2223
2224 // These sets are used to seed the analysis with pointers used by memory
2225 // accesses that will remain scalar.
2226 SmallSetVector<Instruction *, 8> ScalarPtrs;
2227 SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
2228 auto *Latch = TheLoop->getLoopLatch();
2229
2230 // A helper that returns true if the use of Ptr by MemAccess will be scalar.
2231 // The pointer operands of loads and stores will be scalar as long as the
2232 // memory access is not a gather or scatter operation. The value operand of a
2233 // store will remain scalar if the store is scalarized.
2234 auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
2235 InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
2236 assert(WideningDecision != CM_Unknown &&
2237 "Widening decision should be ready at this moment");
2238 if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
2239 if (Ptr == Store->getValueOperand())
2240 return WideningDecision == CM_Scalarize;
2241 assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
2242 "Ptr is neither a value or pointer operand");
2243 return WideningDecision != CM_GatherScatter;
2244 };
2245
2246 // A helper that returns true if the given value is a getelementptr
2247 // instruction contained in the loop.
2248 auto IsLoopVaryingGEP = [&](Value *V) {
2249 return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
2250 };
2251
2252 // A helper that evaluates a memory access's use of a pointer. If the use will
2253 // be a scalar use and the pointer is only used by memory accesses, we place
2254 // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
2255 // PossibleNonScalarPtrs.
2256 auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
2257 // We only care about bitcast and getelementptr instructions contained in
2258 // the loop.
2259 if (!IsLoopVaryingGEP(Ptr))
2260 return;
2261
2262 // If the pointer has already been identified as scalar (e.g., if it was
2263 // also identified as uniform), there's nothing to do.
2264 auto *I = cast<Instruction>(Val: Ptr);
2265 if (Worklist.count(key: I))
2266 return;
2267
2268 // If the use of the pointer will be a scalar use, and all users of the
2269 // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
2270 // place the pointer in PossibleNonScalarPtrs.
2271 if (IsScalarUse(MemAccess, Ptr) &&
2272 all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
2273 ScalarPtrs.insert(X: I);
2274 else
2275 PossibleNonScalarPtrs.insert(Ptr: I);
2276 };
2277
2278 // We seed the scalars analysis with three classes of instructions: (1)
2279 // instructions marked uniform-after-vectorization and (2) bitcast,
2280 // getelementptr and (pointer) phi instructions used by memory accesses
2281 // requiring a scalar use.
2282 //
2283 // (1) Add to the worklist all instructions that have been identified as
2284 // uniform-after-vectorization.
2285 Worklist.insert_range(R&: Uniforms[VF]);
2286
2287 // (2) Add to the worklist all bitcast and getelementptr instructions used by
2288 // memory accesses requiring a scalar use. The pointer operands of loads and
2289 // stores will be scalar unless the operation is a gather or scatter.
2290 // The value operand of a store will remain scalar if the store is scalarized.
2291 for (auto *BB : TheLoop->blocks())
2292 for (auto &I : *BB) {
2293 if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
2294 EvaluatePtrUse(Load, Load->getPointerOperand());
2295 } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
2296 EvaluatePtrUse(Store, Store->getPointerOperand());
2297 EvaluatePtrUse(Store, Store->getValueOperand());
2298 }
2299 }
2300 for (auto *I : ScalarPtrs)
2301 if (!PossibleNonScalarPtrs.count(Ptr: I)) {
2302 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
2303 Worklist.insert(X: I);
2304 }
2305
2306 // Insert the forced scalars.
2307 // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
2308 // induction variable when the PHI user is scalarized.
2309 auto ForcedScalar = ForcedScalars.find(Val: VF);
2310 if (ForcedScalar != ForcedScalars.end())
2311 for (auto *I : ForcedScalar->second) {
2312 LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
2313 Worklist.insert(X: I);
2314 }
2315
2316 // Expand the worklist by looking through any bitcasts and getelementptr
2317 // instructions we've already identified as scalar. This is similar to the
2318 // expansion step in collectLoopUniforms(); however, here we're only
2319 // expanding to include additional bitcasts and getelementptr instructions.
2320 unsigned Idx = 0;
2321 while (Idx != Worklist.size()) {
2322 Instruction *Dst = Worklist[Idx++];
2323 if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
2324 continue;
2325 auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
2326 if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
2327 auto *J = cast<Instruction>(Val: U);
2328 return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
2329 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
2330 IsScalarUse(J, Src));
2331 })) {
2332 Worklist.insert(X: Src);
2333 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
2334 }
2335 }
2336
2337 // An induction variable will remain scalar if all users of the induction
2338 // variable and induction variable update remain scalar.
2339 for (const auto &Induction : Legal->getInductionVars()) {
2340 auto *Ind = Induction.first;
2341 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
2342
2343 // If tail-folding is applied, the primary induction variable will be used
2344 // to feed a vector compare.
2345 if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
2346 continue;
2347
2348 // Returns true if \p Indvar is a pointer induction that is used directly by
2349 // load/store instruction \p I.
2350 auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
2351 Instruction *I) {
2352 return Induction.second.getKind() ==
2353 InductionDescriptor::IK_PtrInduction &&
2354 (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
2355 Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
2356 };
2357
2358 // Determine if all users of the induction variable are scalar after
2359 // vectorization.
2360 bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
2361 auto *I = cast<Instruction>(Val: U);
2362 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
2363 IsDirectLoadStoreFromPtrIndvar(Ind, I);
2364 });
2365 if (!ScalarInd)
2366 continue;
2367
2368 // If the induction variable update is a fixed-order recurrence, neither the
2369 // induction variable or its update should be marked scalar after
2370 // vectorization.
2371 auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
2372 if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
2373 continue;
2374
2375 // Determine if all users of the induction variable update instruction are
2376 // scalar after vectorization.
2377 bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
2378 auto *I = cast<Instruction>(Val: U);
2379 return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
2380 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
2381 });
2382 if (!ScalarIndUpdate)
2383 continue;
2384
2385 // The induction variable and its update instruction will remain scalar.
2386 Worklist.insert(X: Ind);
2387 Worklist.insert(X: IndUpdate);
2388 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
2389 LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
2390 << "\n");
2391 }
2392
2393 Scalars[VF].insert_range(R&: Worklist);
2394}
2395
2396bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
2397 ElementCount VF) {
2398 if (!isPredicatedInst(I))
2399 return false;
2400
2401 // Do we have a non-scalar lowering for this predicated
2402 // instruction? No - it is scalar with predication.
2403 switch(I->getOpcode()) {
2404 default:
2405 return true;
2406 case Instruction::Call: {
2407 if (VF.isScalar())
2408 return true;
2409 auto *CI = cast<CallInst>(Val: I);
2410 // A vector intrinsic or library variant lowering avoids scalarization.
2411 return !getVectorIntrinsicIDForCall(CI, TLI) &&
2412 !hasVectorLibraryVariantFor(CI: *CI, VF, MaskRequired: isMaskRequired(I: CI), TLI);
2413 }
2414 case Instruction::Load:
2415 case Instruction::Store: {
2416 bool IsConsecutive = Legal->isConsecutivePtr(AccessTy: getLoadStoreType(I),
2417 Ptr: getLoadStorePointerOperand(V: I));
2418 return !(IsConsecutive && Config.isLegalMaskedLoadOrStore(I, VF)) &&
2419 !Config.isLegalGatherOrScatter(V: I, VF);
2420 }
2421 case Instruction::UDiv:
2422 case Instruction::SDiv:
2423 case Instruction::SRem:
2424 case Instruction::URem: {
2425 // We have the option to use the llvm.masked.udiv intrinsics to avoid
2426 // predication. The cost based decision here will always select the masked
2427 // intrinsics for scalable vectors as scalarization isn't legal.
2428 const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF);
2429 return isDivRemScalarWithPredication(ScalarCost, MaskedCost);
2430 }
2431 }
2432}
2433
2434bool LoopVectorizationCostModel::isMaskRequired(Instruction *I) const {
2435 return Legal->isMaskRequired(I, TailFolded: foldTailByMasking());
2436}
2437
2438// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2439bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
2440 // TODO: We can use the loop-preheader as context point here and get
2441 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2442 if (isSafeToSpeculativelyExecute(I) ||
2443 (isa<LoadInst, StoreInst, CallInst>(Val: I) && !isMaskRequired(I)) ||
2444 isa<UncondBrInst, CondBrInst, SwitchInst, PHINode, AllocaInst>(Val: I))
2445 return false;
2446
2447 // If the instruction was executed conditionally in the original scalar loop,
2448 // predication is needed with a mask whose lanes are all possibly inactive.
2449 if (Legal->blockNeedsPredication(BB: I->getParent()))
2450 return true;
2451
2452 // If we're not folding the tail by masking and not vectorizing a loop with
2453 // uncountable exits and side effects, predication is unnecessary.
2454 if (!foldTailByMasking() && !Legal->hasUncountableExitWithSideEffects())
2455 return false;
2456
2457 // All that remain are instructions with side-effects originally executed in
2458 // the loop unconditionally, but now execute under a tail-fold mask (only)
2459 // having at least one active lane (the first). If the side-effects of the
2460 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2461 // - it will cause the same side-effects as when masked.
2462 switch(I->getOpcode()) {
2463 default:
2464 llvm_unreachable(
2465 "instruction should have been considered by earlier checks");
2466 case Instruction::Call:
2467 // Side-effects of a Call are assumed to be non-invariant, needing a
2468 // (fold-tail) mask.
2469 assert(isMaskRequired(I) &&
2470 "should have returned earlier for calls not needing a mask");
2471 return true;
2472 case Instruction::Load:
2473 // If the address is loop invariant no predication is needed.
2474 return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
2475 case Instruction::Store: {
2476 // For stores, we need to prove both speculation safety (which follows from
2477 // the same argument as loads), but also must prove the value being stored
2478 // is correct. The easiest form of the later is to require that all values
2479 // stored are the same.
2480 return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
2481 TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
2482 }
2483 case Instruction::UDiv:
2484 case Instruction::URem:
2485 // If the divisor is loop-invariant no predication is needed.
2486 return !Legal->isInvariant(V: I->getOperand(i: 1));
2487 case Instruction::SDiv:
2488 case Instruction::SRem:
2489 // Conservative for now, since masked-off lanes may be poison and could
2490 // trigger signed overflow.
2491 return true;
2492 }
2493}
2494
2495uint64_t LoopVectorizationCostModel::getPredBlockCostDivisor(
2496 TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
2497 if (CostKind == TTI::TCK_CodeSize)
2498 return 1;
2499 // If the block wasn't originally predicated then return early to avoid
2500 // computing BlockFrequencyInfo unnecessarily.
2501 if (!Legal->blockNeedsPredication(BB))
2502 return 1;
2503
2504 uint64_t HeaderFreq =
2505 getBFI().getBlockFreq(BB: TheLoop->getHeader()).getFrequency();
2506 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2507 assert(HeaderFreq >= BBFreq &&
2508 "Header has smaller block freq than dominated BB?");
2509 return std::round(x: (double)HeaderFreq / BBFreq);
2510}
2511
2512static Intrinsic::ID getMaskedDivRemIntrinsic(unsigned Opcode) {
2513 switch (Opcode) {
2514 case Instruction::UDiv:
2515 return Intrinsic::masked_udiv;
2516 case Instruction::SDiv:
2517 return Intrinsic::masked_sdiv;
2518 case Instruction::URem:
2519 return Intrinsic::masked_urem;
2520 case Instruction::SRem:
2521 return Intrinsic::masked_srem;
2522 default:
2523 llvm_unreachable("Unexpected opcode");
2524 }
2525}
2526
2527std::pair<InstructionCost, InstructionCost>
2528LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
2529 ElementCount VF) {
2530 assert(I->getOpcode() == Instruction::UDiv ||
2531 I->getOpcode() == Instruction::SDiv ||
2532 I->getOpcode() == Instruction::SRem ||
2533 I->getOpcode() == Instruction::URem);
2534 assert(!isSafeToSpeculativelyExecute(I));
2535
2536 // Scalarization isn't legal for scalable vector types
2537 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
2538 if (!VF.isScalable()) {
2539 // Get the scalarization cost and scale this amount by the probability of
2540 // executing the predicated block. If the instruction is not predicated,
2541 // we fall through to the next case.
2542 ScalarizationCost = 0;
2543
2544 // These instructions have a non-void type, so account for the phi nodes
2545 // that we will create. This cost is likely to be zero. The phi node
2546 // cost, if any, should be scaled by the block probability because it
2547 // models a copy at the end of each predicated block.
2548 ScalarizationCost += VF.getFixedValue() *
2549 TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Config.CostKind);
2550
2551 // The cost of the non-predicated instruction.
2552 ScalarizationCost +=
2553 VF.getFixedValue() * TTI.getArithmeticInstrCost(
2554 Opcode: I->getOpcode(), Ty: I->getType(), CostKind: Config.CostKind);
2555
2556 // The cost of insertelement and extractelement instructions needed for
2557 // scalarization.
2558 ScalarizationCost += getScalarizationOverhead(I, VF);
2559
2560 // Scale the cost by the probability of executing the predicated blocks.
2561 // This assumes the predicated block for each vector lane is equally
2562 // likely.
2563 ScalarizationCost =
2564 ScalarizationCost /
2565 getPredBlockCostDivisor(CostKind: Config.CostKind, BB: I->getParent());
2566 }
2567
2568 auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);
2569 auto *MaskTy = toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF);
2570 IntrinsicCostAttributes ICA(getMaskedDivRemIntrinsic(Opcode: I->getOpcode()), VecTy,
2571 {VecTy, VecTy, MaskTy});
2572 InstructionCost MaskedCost = TTI.getIntrinsicInstrCost(ICA, CostKind: Config.CostKind);
2573 return {ScalarizationCost, MaskedCost};
2574}
2575
2576bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
2577 Instruction *I, ElementCount VF) const {
2578 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
2579 assert(getWideningDecision(I, VF) == CM_Unknown &&
2580 "Decision should not be set yet.");
2581 auto *Group = getInterleavedAccessGroup(Instr: I);
2582 assert(Group && "Must have a group.");
2583 unsigned InterleaveFactor = Group->getFactor();
2584
2585 // If the instruction's allocated size doesn't equal its type size, it
2586 // requires padding and will be scalarized.
2587 auto &DL = I->getDataLayout();
2588 auto *ScalarTy = getLoadStoreType(I);
2589 if (hasIrregularType(Ty: ScalarTy, DL))
2590 return false;
2591
2592 // For scalable vectors, the interleave factors must be <= 8 since we require
2593 // the (de)interleaveN intrinsics instead of shufflevectors.
2594 if (VF.isScalable() && InterleaveFactor > 8)
2595 return false;
2596
2597 // If the group involves a non-integral pointer, we may not be able to
2598 // losslessly cast all values to a common type.
2599 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
2600 for (Instruction *Member : Group->members()) {
2601 auto *MemberTy = getLoadStoreType(I: Member);
2602 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
2603 // Don't coerce non-integral pointers to integers or vice versa.
2604 if (MemberNI != ScalarNI)
2605 // TODO: Consider adding special nullptr value case here
2606 return false;
2607 if (MemberNI && ScalarNI &&
2608 ScalarTy->getPointerAddressSpace() !=
2609 MemberTy->getPointerAddressSpace())
2610 return false;
2611 }
2612
2613 // Check if masking is required.
2614 // A Group may need masking for one of two reasons: it resides in a block that
2615 // needs predication, or it was decided to use masking to deal with gaps
2616 // (either a gap at the end of a load-access that may result in a speculative
2617 // load, or any gaps in a store-access).
2618 bool PredicatedAccessRequiresMasking =
2619 blockNeedsPredicationForAnyReason(BB: I->getParent()) && isMaskRequired(I);
2620 bool LoadAccessWithGapsRequiresEpilogMasking =
2621 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
2622 !isEpilogueAllowed();
2623 bool StoreAccessWithGapsRequiresMasking =
2624 isa<StoreInst>(Val: I) && !Group->isFull();
2625 if (!PredicatedAccessRequiresMasking &&
2626 !LoadAccessWithGapsRequiresEpilogMasking &&
2627 !StoreAccessWithGapsRequiresMasking)
2628 return true;
2629
2630 // If masked interleaving is required, we expect that the user/target had
2631 // enabled it, because otherwise it either wouldn't have been created or
2632 // it should have been invalidated by the CostModel.
2633 assert(useMaskedInterleavedAccesses(TTI) &&
2634 "Masked interleave-groups for predicated accesses are not enabled.");
2635
2636 if (Group->isReverse())
2637 return false;
2638
2639 // TODO: Support interleaved access that requires a gap mask for scalable VFs.
2640 bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
2641 StoreAccessWithGapsRequiresMasking;
2642 if (VF.isScalable() && NeedsMaskForGaps)
2643 return false;
2644
2645 return Config.isLegalMaskedLoadOrStore(I, VF);
2646}
2647
2648bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
2649 Instruction *I, ElementCount VF) {
2650 // Get and ensure we have a valid memory instruction.
2651 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
2652
2653 auto *Ptr = getLoadStorePointerOperand(V: I);
2654 auto *ScalarTy = getLoadStoreType(I);
2655
2656 // In order to be widened, the pointer should be consecutive, first of all.
2657 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
2658 return false;
2659
2660 // If the instruction is a store located in a predicated block, it will be
2661 // scalarized.
2662 if (isScalarWithPredication(I, VF))
2663 return false;
2664
2665 // If the instruction's allocated size doesn't equal it's type size, it
2666 // requires padding and will be scalarized.
2667 auto &DL = I->getDataLayout();
2668 if (hasIrregularType(Ty: ScalarTy, DL))
2669 return false;
2670
2671 return true;
2672}
2673
2674void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
2675 // We should not collect Uniforms more than once per VF. Right now,
2676 // this function is called from collectUniformsAndScalars(), which
2677 // already does this check. Collecting Uniforms for VF=1 does not make any
2678 // sense.
2679
2680 assert(VF.isVector() && !Uniforms.contains(VF) &&
2681 "This function should not be visited twice for the same VF");
2682
2683 // Visit the list of Uniforms. If we find no uniform value, we won't
2684 // analyze again. Uniforms.count(VF) will return 1.
2685 Uniforms[VF].clear();
2686
2687 // Now we know that the loop is vectorizable!
2688 // Collect instructions inside the loop that will remain uniform after
2689 // vectorization.
2690
2691 // Global values, params and instructions outside of current loop are out of
2692 // scope.
2693 auto IsOutOfScope = [&](Value *V) -> bool {
2694 Instruction *I = dyn_cast<Instruction>(Val: V);
2695 return (!I || !TheLoop->contains(Inst: I));
2696 };
2697
2698 // Worklist containing uniform instructions demanding lane 0.
2699 SetVector<Instruction *> Worklist;
2700
2701 // Add uniform instructions demanding lane 0 to the worklist. Instructions
2702 // that require predication must not be considered uniform after
2703 // vectorization, because that would create an erroneous replicating region
2704 // where only a single instance out of VF should be formed.
2705 auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
2706 if (IsOutOfScope(I)) {
2707 LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
2708 << *I << "\n");
2709 return;
2710 }
2711 if (isPredicatedInst(I)) {
2712 LLVM_DEBUG(
2713 dbgs() << "LV: Found not uniform due to requiring predication: " << *I
2714 << "\n");
2715 return;
2716 }
2717 LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
2718 Worklist.insert(X: I);
2719 };
2720
2721 // Start with the conditional branches exiting the loop. If the branch
2722 // condition is an instruction contained in the loop that is only used by the
2723 // branch, it is uniform. Note conditions from uncountable early exits are not
2724 // uniform.
2725 SmallVector<BasicBlock *> Exiting;
2726 TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
2727 for (BasicBlock *E : Exiting) {
2728 if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
2729 continue;
2730 auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
2731 if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
2732 AddToWorklistIfAllowed(Cmp);
2733 }
2734
2735 auto PrevVF = VF.divideCoefficientBy(RHS: 2);
2736 // Return true if all lanes perform the same memory operation, and we can
2737 // thus choose to execute only one.
2738 auto IsUniformMemOpUse = [&](Instruction *I) {
2739 // If the value was already known to not be uniform for the previous
2740 // (smaller VF), it cannot be uniform for the larger VF.
2741 if (PrevVF.isVector()) {
2742 auto Iter = Uniforms.find(Val: PrevVF);
2743 if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
2744 return false;
2745 }
2746 if (!isUniformMemOp(I&: *I, VF))
2747 return false;
2748 if (isa<LoadInst>(Val: I))
2749 // Loading the same address always produces the same result - at least
2750 // assuming aliasing and ordering which have already been checked.
2751 return true;
2752 // Storing the same value on every iteration.
2753 return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
2754 };
2755
2756 auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
2757 InstWidening WideningDecision = getWideningDecision(I, VF);
2758 assert(WideningDecision != CM_Unknown &&
2759 "Widening decision should be ready at this moment");
2760
2761 if (IsUniformMemOpUse(I))
2762 return true;
2763
2764 return (WideningDecision == CM_Widen ||
2765 WideningDecision == CM_Widen_Reverse ||
2766 WideningDecision == CM_Interleave);
2767 };
2768
2769 // Returns true if Ptr is the pointer operand of a memory access instruction
2770 // I, I is known to not require scalarization, and the pointer is not also
2771 // stored.
2772 auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
2773 if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
2774 return false;
2775 return getLoadStorePointerOperand(V: I) == Ptr &&
2776 (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
2777 };
2778
2779 // Holds a list of values which are known to have at least one uniform use.
2780 // Note that there may be other uses which aren't uniform. A "uniform use"
2781 // here is something which only demands lane 0 of the unrolled iterations;
2782 // it does not imply that all lanes produce the same value (e.g. this is not
2783 // the usual meaning of uniform)
2784 SetVector<Value *> HasUniformUse;
2785
2786 // Scan the loop for instructions which are either a) known to have only
2787 // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
2788 for (auto *BB : TheLoop->blocks())
2789 for (auto &I : *BB) {
2790 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
2791 switch (II->getIntrinsicID()) {
2792 case Intrinsic::sideeffect:
2793 case Intrinsic::experimental_noalias_scope_decl:
2794 case Intrinsic::assume:
2795 case Intrinsic::lifetime_start:
2796 case Intrinsic::lifetime_end:
2797 if (TheLoop->hasLoopInvariantOperands(I: &I))
2798 AddToWorklistIfAllowed(&I);
2799 break;
2800 default:
2801 break;
2802 }
2803 }
2804
2805 if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
2806 if (IsOutOfScope(EVI->getAggregateOperand())) {
2807 AddToWorklistIfAllowed(EVI);
2808 continue;
2809 }
2810 // Only ExtractValue instructions where the aggregate value comes from a
2811 // call are allowed to be non-uniform.
2812 assert(isa<CallInst>(EVI->getAggregateOperand()) &&
2813 "Expected aggregate value to be call return value");
2814 }
2815
2816 // If there's no pointer operand, there's nothing to do.
2817 auto *Ptr = getLoadStorePointerOperand(V: &I);
2818 if (!Ptr)
2819 continue;
2820
2821 // If the pointer can be proven to be uniform, always add it to the
2822 // worklist.
2823 if (isa<Instruction>(Val: Ptr) && isUniform(V: Ptr, VF))
2824 AddToWorklistIfAllowed(cast<Instruction>(Val: Ptr));
2825
2826 if (IsUniformMemOpUse(&I))
2827 AddToWorklistIfAllowed(&I);
2828
2829 if (IsVectorizedMemAccessUse(&I, Ptr))
2830 HasUniformUse.insert(X: Ptr);
2831 }
2832
2833 // Add to the worklist any operands which have *only* uniform (e.g. lane 0
2834 // demanding) users. Since loops are assumed to be in LCSSA form, this
2835 // disallows uses outside the loop as well.
2836 for (auto *V : HasUniformUse) {
2837 if (IsOutOfScope(V))
2838 continue;
2839 auto *I = cast<Instruction>(Val: V);
2840 bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
2841 auto *UI = cast<Instruction>(Val: U);
2842 return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
2843 });
2844 if (UsersAreMemAccesses)
2845 AddToWorklistIfAllowed(I);
2846 }
2847
2848 // Expand Worklist in topological order: whenever a new instruction
2849 // is added , its users should be already inside Worklist. It ensures
2850 // a uniform instruction will only be used by uniform instructions.
2851 unsigned Idx = 0;
2852 while (Idx != Worklist.size()) {
2853 Instruction *I = Worklist[Idx++];
2854
2855 for (auto *OV : I->operand_values()) {
2856 // isOutOfScope operands cannot be uniform instructions.
2857 if (IsOutOfScope(OV))
2858 continue;
2859 // First order recurrence Phi's should typically be considered
2860 // non-uniform.
2861 auto *OP = dyn_cast<PHINode>(Val: OV);
2862 if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
2863 continue;
2864 // If all the users of the operand are uniform, then add the
2865 // operand into the uniform worklist.
2866 auto *OI = cast<Instruction>(Val: OV);
2867 if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
2868 auto *J = cast<Instruction>(Val: U);
2869 return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
2870 }))
2871 AddToWorklistIfAllowed(OI);
2872 }
2873 }
2874
2875 // For an instruction to be added into Worklist above, all its users inside
2876 // the loop should also be in Worklist. However, this condition cannot be
2877 // true for phi nodes that form a cyclic dependence. We must process phi
2878 // nodes separately. An induction variable will remain uniform if all users
2879 // of the induction variable and induction variable update remain uniform.
2880 // The code below handles both pointer and non-pointer induction variables.
2881 BasicBlock *Latch = TheLoop->getLoopLatch();
2882 for (const auto &Induction : Legal->getInductionVars()) {
2883 auto *Ind = Induction.first;
2884 auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));
2885
2886 // Determine if all users of the induction variable are uniform after
2887 // vectorization.
2888 bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
2889 auto *I = cast<Instruction>(Val: U);
2890 return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
2891 IsVectorizedMemAccessUse(I, Ind);
2892 });
2893 if (!UniformInd)
2894 continue;
2895
2896 // Determine if all users of the induction variable update instruction are
2897 // uniform after vectorization.
2898 bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
2899 auto *I = cast<Instruction>(Val: U);
2900 return I == Ind || Worklist.count(key: I) ||
2901 IsVectorizedMemAccessUse(I, IndUpdate);
2902 });
2903 if (!UniformIndUpdate)
2904 continue;
2905
2906 // The induction variable and its update instruction will remain uniform.
2907 AddToWorklistIfAllowed(Ind);
2908 AddToWorklistIfAllowed(IndUpdate);
2909 }
2910
2911 Uniforms[VF].insert_range(R&: Worklist);
2912}
2913
2914FixedScalableVFPair
2915LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
2916 // Make sure once we return PartialAliasMaskingStatus is not "NotDecided".
2917 scope_exit EnsureAliasMaskingStatusIsDecidedOnReturn([this] {
2918 if (PartialAliasMaskingStatus == AliasMaskingStatus::NotDecided)
2919 PartialAliasMaskingStatus = AliasMaskingStatus::Disabled;
2920 });
2921
2922 // For outer loops, use simple type-based heuristic VF. No cost model or
2923 // memory dependence analysis is available.
2924 if (!TheLoop->isInnermost()) {
2925 return Config.computeVPlanOuterloopVF(UserVF);
2926 }
2927
2928 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
2929 // TODO: It may be useful to do since it's still likely to be dynamically
2930 // uniform if the target can skip.
2931 reportVectorizationFailure(
2932 DebugMsg: "Not inserting runtime ptr check for divergent target",
2933 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
2934 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
2935 return FixedScalableVFPair::getNone();
2936 }
2937
2938 ScalarEvolution *SE = PSE.getSE();
2939 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
2940 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
2941 if (!MaxTC && EpilogueLoweringStatus == CM_EpilogueAllowed)
2942 MaxTC = getMaxTCFromNonZeroRange(PSE, L: TheLoop);
2943 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
2944 if (TC != ElementCount::getFixed(MinVal: MaxTC))
2945 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
2946 if (TC.isScalar()) {
2947 reportVectorizationFailure(
2948 DebugMsg: "Single iteration (non) loop",
2949 OREMsg: "loop trip count is one, irrelevant for vectorization",
2950 ORETag: "SingleIterationLoop", ORE, TheLoop);
2951 return FixedScalableVFPair::getNone();
2952 }
2953
2954 // If BTC matches the widest induction type and is -1 then the trip count
2955 // computation will wrap to 0 and the vector trip count will be 0. Do not try
2956 // to vectorize.
2957 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
2958 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
2959 BTC->getType()->getScalarSizeInBits() >=
2960 Legal->getWidestInductionType()->getScalarSizeInBits() &&
2961 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
2962 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
2963 reportVectorizationFailure(
2964 DebugMsg: "Trip count computation wrapped",
2965 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
2966 ORETag: "TripCountWrapped", ORE, TheLoop);
2967 return FixedScalableVFPair::getNone();
2968 }
2969
2970 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
2971 "No cost-modeling decisions should have been taken at this point");
2972
2973 switch (EpilogueLoweringStatus) {
2974 case CM_EpilogueAllowed:
2975 return Config.computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: false,
2976 RequiresScalarEpilogue: requiresScalarEpilogue(IsVectorizing: true));
2977 case CM_EpilogueNotAllowedFoldTail:
2978 [[fallthrough]];
2979 case CM_EpilogueNotNeededFoldTail:
2980 LLVM_DEBUG(dbgs() << "LV: tail-folding hint/switch found.\n"
2981 << "LV: Not allowing epilogue, creating tail-folded "
2982 << "vector loop.\n");
2983 break;
2984 case CM_EpilogueNotAllowedLowTripLoop:
2985 // fallthrough as a special case of OptForSize
2986 case CM_EpilogueNotAllowedOptSize:
2987 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedOptSize)
2988 LLVM_DEBUG(dbgs() << "LV: Not allowing epilogue due to -Os/-Oz.\n");
2989 else
2990 LLVM_DEBUG(dbgs() << "LV: Not allowing epilogue due to low trip "
2991 << "count.\n");
2992
2993 // Bail if runtime checks are required, which are not good when optimising
2994 // for size.
2995 if (Config.runtimeChecksRequired())
2996 return FixedScalableVFPair::getNone();
2997
2998 break;
2999 }
3000
3001 // Now try the tail folding
3002
3003 // Invalidate interleave groups that require an epilogue if we can't mask
3004 // the interleave-group.
3005 if (!useMaskedInterleavedAccesses(TTI)) {
3006 // Note: There is no need to invalidate any cost modeling decisions here, as
3007 // none were taken so far (see assertion above).
3008 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3009 }
3010
3011 FixedScalableVFPair MaxFactors = Config.computeFeasibleMaxVF(
3012 MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: true, RequiresScalarEpilogue: requiresScalarEpilogue(IsVectorizing: true));
3013
3014 // Avoid tail folding if the trip count is known to be a multiple of any VF
3015 // we choose.
3016 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3017 MaxFactors.FixedVF.getFixedValue();
3018 if (MaxFactors.ScalableVF) {
3019 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3020 if (MaxVScale) {
3021 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3022 a: *MaxPowerOf2RuntimeVF,
3023 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3024 } else
3025 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3026 }
3027
3028 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3029 // Return false if the loop is neither a single-latch-exit loop nor an
3030 // early-exit loop as tail-folding is not supported in that case.
3031 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3032 !Legal->hasUncountableEarlyExit())
3033 return false;
3034 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3035 ScalarEvolution *SE = PSE.getSE();
3036 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3037 // with uncountable exits. For countable loops, the symbolic maximum must
3038 // remain identical to the known back-edge taken count.
3039 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3040 assert((Legal->hasUncountableEarlyExit() ||
3041 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3042 "Invalid loop count");
3043 const SCEV *ExitCount = SE->getAddExpr(
3044 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3045 const SCEV *Rem = SE->getURemExpr(
3046 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3047 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3048 return Rem->isZero();
3049 };
3050
3051 if (MaxPowerOf2RuntimeVF > 0u) {
3052 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3053 "MaxFixedVF must be a power of 2");
3054 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3055 // Accept MaxFixedVF if we do not have a tail.
3056 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3057 return MaxFactors;
3058 }
3059 }
3060
3061 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3062 if (ExpectedTC && ExpectedTC->isFixed() &&
3063 ExpectedTC->getFixedValue() <=
3064 TTI.getMinTripCountTailFoldingThreshold()) {
3065 if (MaxPowerOf2RuntimeVF > 0u) {
3066 // If we have a low-trip-count, and the fixed-width VF is known to divide
3067 // the trip count but the scalable factor does not, use the fixed-width
3068 // factor in preference to allow the generation of a non-predicated loop.
3069 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedLowTripLoop &&
3070 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3071 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3072 "remain for any chosen VF.\n");
3073 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3074 return MaxFactors;
3075 }
3076 }
3077
3078 reportVectorizationFailure(
3079 DebugMsg: "The trip count is below the minial threshold value.",
3080 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3081 ORE, TheLoop);
3082 return FixedScalableVFPair::getNone();
3083 }
3084
3085 // If we don't know the precise trip count, or if the trip count that we
3086 // found modulo the vectorization factor is not zero, try to fold the tail
3087 // by masking.
3088 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3089 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3090 setTailFoldingStyle(IsScalableVF: ContainsScalableVF, UserIC);
3091 if (foldTailByMasking()) {
3092 if (foldTailWithEVL()) {
3093 LLVM_DEBUG(
3094 dbgs()
3095 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3096 "try to generate VP Intrinsics with scalable vector "
3097 "factors only.\n");
3098 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3099 // for now.
3100 // TODO: extend it for fixed vectors, if required.
3101 assert(ContainsScalableVF && "Expected scalable vector factor.");
3102
3103 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3104 } else {
3105 tryToEnablePartialAliasMasking();
3106 }
3107 return MaxFactors;
3108 }
3109
3110 // If there was a tail-folding hint/switch, but we can't fold the tail by
3111 // masking, fallback to a vectorization with an epilogue.
3112 if (EpilogueLoweringStatus == CM_EpilogueNotNeededFoldTail) {
3113 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with an "
3114 "epilogue instead.\n");
3115 EpilogueLoweringStatus = CM_EpilogueAllowed;
3116 return MaxFactors;
3117 }
3118
3119 if (EpilogueLoweringStatus == CM_EpilogueNotAllowedFoldTail) {
3120 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3121 return FixedScalableVFPair::getNone();
3122 }
3123
3124 if (TC.isZero()) {
3125 reportVectorizationFailure(
3126 DebugMsg: "unable to calculate the loop count due to complex control flow",
3127 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3128 return FixedScalableVFPair::getNone();
3129 }
3130
3131 reportVectorizationFailure(
3132 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3133 OREMsg: "cannot optimize for size and vectorize at the same time. "
3134 "Enable vectorization of this loop with '#pragma clang loop "
3135 "vectorize(enable)' when compiling with -Os/-Oz",
3136 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3137 return FixedScalableVFPair::getNone();
3138}
3139
3140void LoopVectorizationPlanner::emitInvalidCostRemarks(
3141 OptimizationRemarkEmitter *ORE) {
3142 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
3143 SmallVector<RecipeVFPair> InvalidCosts;
3144 for (const auto &Plan : VPlans) {
3145 for (ElementCount VF : Plan->vectorFactors()) {
3146 // The VPlan-based cost model is designed for computing vector cost.
3147 // Querying VPlan-based cost model with a scarlar VF will cause some
3148 // errors because we expect the VF is vector for most of the widen
3149 // recipes.
3150 if (VF.isScalar())
3151 continue;
3152
3153 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
3154 OrigLoop);
3155 precomputeCosts(Plan&: *Plan, VF, CostCtx);
3156 auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
3157 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: Iter)) {
3158 for (auto &R : *VPBB) {
3159 if (!R.cost(VF, Ctx&: CostCtx).isValid())
3160 InvalidCosts.emplace_back(Args: &R, Args&: VF);
3161 }
3162 }
3163 }
3164 }
3165 if (InvalidCosts.empty())
3166 return;
3167
3168 // Emit a report of VFs with invalid costs in the loop.
3169
3170 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
3171 DenseMap<VPRecipeBase *, unsigned> Numbering;
3172 unsigned I = 0;
3173 for (auto &Pair : InvalidCosts)
3174 if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
3175 ++I;
3176
3177 // Sort the list, first on recipe(number) then on VF.
3178 sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
3179 unsigned NA = Numbering[A.first];
3180 unsigned NB = Numbering[B.first];
3181 if (NA != NB)
3182 return NA < NB;
3183 return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
3184 });
3185
3186 // For a list of ordered recipe-VF pairs:
3187 // [(load, VF1), (load, VF2), (store, VF1)]
3188 // group the recipes together to emit separate remarks for:
3189 // load (VF1, VF2)
3190 // store (VF1)
3191 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
3192 auto Subset = ArrayRef<RecipeVFPair>();
3193 do {
3194 if (Subset.empty())
3195 Subset = Tail.take_front(N: 1);
3196
3197 VPRecipeBase *R = Subset.front().first;
3198
3199 unsigned Opcode =
3200 TypeSwitch<const VPRecipeBase *, unsigned>(R)
3201 .Case(caseFn: [](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
3202 .Case(
3203 caseFn: [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
3204 .Case(caseFn: [](const VPWidenLoadRecipe *R) { return Instruction::Load; })
3205 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
3206 caseFn: [](const auto *R) { return Instruction::Call; })
3207 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
3208 VPWidenCastRecipe>(
3209 caseFn: [](const auto *R) { return R->getOpcode(); })
3210 .Case(caseFn: [](const VPInterleaveRecipe *R) {
3211 return R->getStoredValues().empty() ? Instruction::Load
3212 : Instruction::Store;
3213 })
3214 .Case(caseFn: [](const VPReductionRecipe *R) {
3215 return RecurrenceDescriptor::getOpcode(Kind: R->getRecurrenceKind());
3216 });
3217
3218 // If the next recipe is different, or if there are no other pairs,
3219 // emit a remark for the collated subset. e.g.
3220 // [(load, VF1), (load, VF2))]
3221 // to emit:
3222 // remark: invalid costs for 'load' at VF=(VF1, VF2)
3223 if (Subset == Tail || Tail[Subset.size()].first != R) {
3224 std::string OutString;
3225 raw_string_ostream OS(OutString);
3226 assert(!Subset.empty() && "Unexpected empty range");
3227 OS << "Recipe with invalid costs prevented vectorization at VF=(";
3228 for (const auto &Pair : Subset)
3229 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
3230 OS << "):";
3231 if (Opcode == Instruction::Call) {
3232 StringRef Name = "";
3233 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
3234 Name = Int->getIntrinsicName();
3235 } else {
3236 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
3237 Function *CalledFn =
3238 WidenCall ? WidenCall->getCalledScalarFunction()
3239 : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
3240 ->getLiveInIRValue());
3241 Name = CalledFn->getName();
3242 }
3243 OS << " call to " << Name;
3244 } else
3245 OS << " " << Instruction::getOpcodeName(Opcode);
3246 reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
3247 DL: R->getDebugLoc());
3248 Tail = Tail.drop_front(N: Subset.size());
3249 Subset = {};
3250 } else
3251 // Grow the subset by one element
3252 Subset = Tail.take_front(N: Subset.size() + 1);
3253 } while (!Tail.empty());
3254}
3255
3256/// Check if any recipe of \p Plan will generate a vector value, which will be
3257/// assigned a vector register.
3258static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
3259 const TargetTransformInfo &TTI) {
3260 assert(VF.isVector() && "Checking a scalar VF?");
3261 DenseSet<VPRecipeBase *> EphemeralRecipes;
3262 collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
3263 // Set of already visited types.
3264 DenseSet<Type *> Visited;
3265 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3266 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
3267 for (VPRecipeBase &R : *VPBB) {
3268 if (EphemeralRecipes.contains(V: &R))
3269 continue;
3270 // Continue early if the recipe is considered to not produce a vector
3271 // result. Note that this includes VPInstruction where some opcodes may
3272 // produce a vector, to preserve existing behavior as VPInstructions model
3273 // aspects not directly mapped to existing IR instructions.
3274 switch (R.getVPRecipeID()) {
3275 case VPRecipeBase::VPDerivedIVSC:
3276 case VPRecipeBase::VPScalarIVStepsSC:
3277 case VPRecipeBase::VPReplicateSC:
3278 case VPRecipeBase::VPInstructionSC:
3279 case VPRecipeBase::VPCurrentIterationPHISC:
3280 case VPRecipeBase::VPVectorPointerSC:
3281 case VPRecipeBase::VPVectorEndPointerSC:
3282 case VPRecipeBase::VPExpandSCEVSC:
3283 case VPRecipeBase::VPPredInstPHISC:
3284 case VPRecipeBase::VPBranchOnMaskSC:
3285 continue;
3286 case VPRecipeBase::VPReductionSC:
3287 case VPRecipeBase::VPActiveLaneMaskPHISC:
3288 case VPRecipeBase::VPWidenCallSC:
3289 case VPRecipeBase::VPWidenCanonicalIVSC:
3290 case VPRecipeBase::VPWidenCastSC:
3291 case VPRecipeBase::VPWidenGEPSC:
3292 case VPRecipeBase::VPWidenIntrinsicSC:
3293 case VPRecipeBase::VPWidenMemIntrinsicSC:
3294 case VPRecipeBase::VPWidenSC:
3295 case VPRecipeBase::VPBlendSC:
3296 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
3297 case VPRecipeBase::VPHistogramSC:
3298 case VPRecipeBase::VPWidenPHISC:
3299 case VPRecipeBase::VPWidenIntOrFpInductionSC:
3300 case VPRecipeBase::VPWidenPointerInductionSC:
3301 case VPRecipeBase::VPReductionPHISC:
3302 case VPRecipeBase::VPInterleaveEVLSC:
3303 case VPRecipeBase::VPInterleaveSC:
3304 case VPRecipeBase::VPWidenLoadEVLSC:
3305 case VPRecipeBase::VPWidenLoadSC:
3306 case VPRecipeBase::VPWidenStoreEVLSC:
3307 case VPRecipeBase::VPWidenStoreSC:
3308 break;
3309 default:
3310 llvm_unreachable("unhandled recipe");
3311 }
3312
3313 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
3314 unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
3315 if (!NumLegalParts)
3316 return false;
3317 if (VF.isScalable()) {
3318 // <vscale x 1 x iN> is assumed to be profitable over iN because
3319 // scalable registers are a distinct register class from scalar
3320 // ones. If we ever find a target which wants to lower scalable
3321 // vectors back to scalars, we'll need to update this code to
3322 // explicitly ask TTI about the register class uses for each part.
3323 return NumLegalParts <= VF.getKnownMinValue();
3324 }
3325 // Two or more elements that share a register - are vectorized.
3326 return NumLegalParts < VF.getFixedValue();
3327 };
3328
3329 // If no def nor is a store, e.g., branches, continue - no value to check.
3330 if (R.getNumDefinedValues() == 0 &&
3331 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(Val: &R))
3332 continue;
3333 // For multi-def recipes, currently only interleaved loads, suffice to
3334 // check first def only.
3335 // For stores check their stored value; for interleaved stores suffice
3336 // the check first stored value only. In all cases this is the second
3337 // operand.
3338 VPValue *ToCheck =
3339 R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
3340 Type *ScalarTy = ToCheck->getScalarType();
3341 if (!Visited.insert(V: {ScalarTy}).second)
3342 continue;
3343 Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
3344 if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
3345 return true;
3346 }
3347 }
3348
3349 return false;
3350}
3351
3352static bool hasReplicatorRegion(VPlan &Plan) {
3353 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
3354 G: Plan.getVectorLoopRegion()->getEntry())),
3355 P: [](auto *VPRB) { return VPRB->isReplicator(); });
3356}
3357
3358/// Returns true if the VPlan contains a VPReductionPHIRecipe with
3359/// FindLast recurrence kind.
3360static bool hasFindLastReductionPhi(VPlan &Plan) {
3361 return any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
3362 P: [](VPRecipeBase &R) {
3363 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
3364 return RedPhi &&
3365 RecurrenceDescriptor::isFindLastRecurrenceKind(
3366 Kind: RedPhi->getRecurrenceKind());
3367 });
3368}
3369
3370/// Returns true if the VPlan contains header phi recipes that are not currently
3371/// supported for epilogue vectorization.
3372static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan) {
3373 return any_of(
3374 Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
3375 P: [](VPRecipeBase &R) {
3376 switch (R.getVPRecipeID()) {
3377 case VPRecipeBase::VPFirstOrderRecurrencePHISC:
3378 // TODO: Add support for fixed-order recurrences.
3379 return true;
3380 case VPRecipeBase::VPWidenIntOrFpInductionSC:
3381 return !cast<VPWidenIntOrFpInductionRecipe>(Val: &R)->getPHINode();
3382 case VPRecipeBase::VPReductionPHISC: {
3383 auto *RedPhi = cast<VPReductionPHIRecipe>(Val: &R);
3384 // TODO: Support FMinNum/FMaxNum, FindLast reductions, and reductions
3385 // without underlying values.
3386 RecurKind Kind = RedPhi->getRecurrenceKind();
3387 if (RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind) ||
3388 RecurrenceDescriptor::isFindLastRecurrenceKind(Kind) ||
3389 !RedPhi->getUnderlyingValue())
3390 return true;
3391 // TODO: Add support for FindIV reductions with sunk expressions: the
3392 // resume value from the main loop is in expression domain (e.g.,
3393 // mul(ReducedIV, 3)), but the epilogue tracks raw IV values. A sunk
3394 // expression is identified by a non-VPInstruction user of
3395 // ComputeReductionResult.
3396 if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
3397 auto *RdxResult = vputils::findComputeReductionResult(PhiR: RedPhi);
3398 assert(RdxResult &&
3399 "FindIV reduction must have ComputeReductionResult");
3400 return any_of(Range: RdxResult->users(),
3401 P: std::not_fn(fn: IsaPred<VPInstruction>));
3402 }
3403 return false;
3404 }
3405 default:
3406 return false;
3407 };
3408 });
3409}
3410
3411bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
3412 VPlan &MainPlan) const {
3413 // Bail out if the plan contains header phi recipes not yet supported
3414 // for epilogue vectorization.
3415 if (hasUnsupportedHeaderPhiRecipe(Plan&: MainPlan))
3416 return false;
3417
3418 // Epilogue vectorization code has not been auditted to ensure it handles
3419 // non-latch exits properly. It may be fine, but it needs auditted and
3420 // tested.
3421 // TODO: Add support for loops with an early exit.
3422 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
3423 return false;
3424
3425 return true;
3426}
3427
3428bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
3429 const ElementCount VF, const unsigned IC) const {
3430 // FIXME: We need a much better cost-model to take different parameters such
3431 // as register pressure, code size increase and cost of extra branches into
3432 // account. For now we apply a very crude heuristic and only consider loops
3433 // with vectorization factors larger than a certain value.
3434
3435 // Allow the target to opt out.
3436 if (!TTI.preferEpilogueVectorization(Iters: VF * IC))
3437 return false;
3438
3439 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
3440 ? EpilogueVectorizationMinVF
3441 : TTI.getEpilogueVectorizationMinVF();
3442 return estimateElementCount(VF: VF * IC, VScale: Config.getVScaleForTuning()) >=
3443 MinVFThreshold;
3444}
3445
3446std::unique_ptr<VPlan> LoopVectorizationPlanner::selectBestEpiloguePlan(
3447 VPlan &MainPlan, ElementCount MainLoopVF, unsigned IC) {
3448 if (!EnableEpilogueVectorization) {
3449 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
3450 return nullptr;
3451 }
3452
3453 if (!CM.isEpilogueAllowed()) {
3454 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
3455 "epilogue is allowed.\n");
3456 return nullptr;
3457 }
3458
3459 if (CM.maskPartialAliasing()) {
3460 LLVM_DEBUG(
3461 dbgs()
3462 << "LEV: Epilogue vectorization not supported with alias masking.\n");
3463 return nullptr;
3464 }
3465
3466 // Not really a cost consideration, but check for unsupported cases here to
3467 // simplify the logic.
3468 if (!isCandidateForEpilogueVectorization(MainPlan)) {
3469 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
3470 "is not a supported candidate.\n");
3471 return nullptr;
3472 }
3473
3474 if (EpilogueVectorizationForceVF > 1) {
3475 if (EpilogueVectorizationForceVF >=
3476 IC * estimateElementCount(VF: MainLoopVF, VScale: Config.getVScaleForTuning())) {
3477 // Note that the main loop leaves IC * MainLoopVF iterations iff a scalar
3478 // epilogue is required, but then the epilogue loop also requires a scalar
3479 // epilogue.
3480 LLVM_DEBUG(dbgs() << "LEV: Forced epilogue VF results in dead epilogue "
3481 "vector loop, skipping vectorizing epilogue.\n");
3482 return nullptr;
3483 }
3484
3485 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
3486 ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
3487 if (hasPlanWithVF(VF: ForcedEC)) {
3488 std::unique_ptr<VPlan> Clone(getPlanFor(VF: ForcedEC).duplicate());
3489 Clone->setVF(ForcedEC);
3490 return Clone;
3491 }
3492
3493 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
3494 "viable.\n");
3495 return nullptr;
3496 }
3497
3498 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
3499 LLVM_DEBUG(
3500 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
3501 return nullptr;
3502 }
3503
3504 if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
3505 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
3506 "this loop\n");
3507 return nullptr;
3508 }
3509
3510 // Check if a plan's vector loop processes fewer iterations than VF (e.g. when
3511 // interleave groups have been narrowed) narrowInterleaveGroups) and return
3512 // the adjusted, effective VF.
3513 using namespace VPlanPatternMatch;
3514 auto GetEffectiveVF = [](VPlan &Plan, ElementCount VF) -> ElementCount {
3515 auto *Exiting = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3516 if (match(V: &Exiting->back(),
3517 P: m_BranchOnCount(Op0: m_Add(Op0: m_CanonicalIV(), Op1: m_Specific(VPV: &Plan.getUF())),
3518 Op1: m_VPValue())))
3519 return ElementCount::get(MinVal: 1, Scalable: VF.isScalable());
3520 return VF;
3521 };
3522
3523 // Check if the main loop processes fewer than MainLoopVF elements per
3524 // iteration (e.g. due to narrowing interleave groups). Adjust MainLoopVF
3525 // as needed.
3526 MainLoopVF = GetEffectiveVF(MainPlan, MainLoopVF);
3527
3528 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
3529 // the main loop handles 8 lanes per iteration. We could still benefit from
3530 // vectorizing the epilogue loop with VF=4.
3531 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
3532 MinVal: estimateElementCount(VF: MainLoopVF, VScale: Config.getVScaleForTuning()));
3533
3534 Type *TCType = Legal->getWidestInductionType();
3535 const SCEV *RemainingIterations = nullptr;
3536 unsigned MaxTripCount = 0;
3537 const SCEV *TC = vputils::getSCEVExprForVPValue(V: MainPlan.getTripCount(), PSE);
3538 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
3539 const SCEV *KnownMinTC;
3540 bool ScalableTC = match(S: TC, P: m_scev_c_Mul(Op0: m_SCEV(V&: KnownMinTC), Op1: m_SCEVVScale()));
3541 bool ScalableRemIter = false;
3542 ScalarEvolution &SE = *PSE.getSE();
3543 // Use versions of TC and VF in which both are either scalable or fixed.
3544 if (ScalableTC == MainLoopVF.isScalable()) {
3545 ScalableRemIter = ScalableTC;
3546 RemainingIterations =
3547 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
3548 } else if (ScalableTC) {
3549 const SCEV *EstimatedTC = SE.getMulExpr(
3550 LHS: KnownMinTC,
3551 RHS: SE.getConstant(Ty: TCType, V: Config.getVScaleForTuning().value_or(u: 1)));
3552 RemainingIterations = SE.getURemExpr(
3553 LHS: EstimatedTC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
3554 } else
3555 RemainingIterations =
3556 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: EstimatedRuntimeVF * IC));
3557
3558 // No iterations left to process in the epilogue.
3559 if (RemainingIterations->isZero())
3560 return nullptr;
3561
3562 if (MainLoopVF.isFixed()) {
3563 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
3564 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
3565 RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
3566 MaxTripCount = SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
3567 }
3568 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
3569 << MaxTripCount << "\n");
3570 }
3571
3572 auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
3573 return SE.isKnownPredicate(Pred: CmpInst::ICMP_UGT, LHS: VF, RHS: RemIter);
3574 };
3575 VectorizationFactor Result = VectorizationFactor::Disabled();
3576 VPlan *BestPlan = nullptr;
3577 for (auto &NextVF : ProfitableVFs) {
3578 // Skip candidate VFs without a corresponding VPlan.
3579 if (!hasPlanWithVF(VF: NextVF.Width))
3580 continue;
3581
3582 VPlan &CurrentPlan = getPlanFor(VF: NextVF.Width);
3583 ElementCount EffectiveVF = GetEffectiveVF(CurrentPlan, NextVF.Width);
3584 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
3585 // vectors) or > the VF of the main loop (fixed vectors).
3586 if ((!EffectiveVF.isScalable() && MainLoopVF.isScalable() &&
3587 ElementCount::isKnownGE(LHS: EffectiveVF, RHS: EstimatedRuntimeVF)) ||
3588 (EffectiveVF.isScalable() &&
3589 ElementCount::isKnownGE(LHS: EffectiveVF, RHS: MainLoopVF)) ||
3590 (!EffectiveVF.isScalable() && !MainLoopVF.isScalable() &&
3591 ElementCount::isKnownGT(LHS: EffectiveVF, RHS: MainLoopVF)))
3592 continue;
3593
3594 // If EffectiveVF is greater than the number of remaining iterations, the
3595 // epilogue loop would be dead. Skip such factors. If the epilogue plan
3596 // also has narrowed interleave groups, use the effective VF since
3597 // the epilogue step will be reduced to its IC.
3598 // TODO: We should also consider comparing against a scalable
3599 // RemainingIterations when SCEV be able to evaluate non-canonical
3600 // vscale-based expressions.
3601 if (!ScalableRemIter) {
3602 // Handle the case where EffectiveVF and RemainingIterations are in
3603 // different numerical spaces.
3604 if (EffectiveVF.isScalable())
3605 EffectiveVF = ElementCount::getFixed(
3606 MinVal: estimateElementCount(VF: EffectiveVF, VScale: Config.getVScaleForTuning()));
3607 if (SkipVF(SE.getElementCount(Ty: TCType, EC: EffectiveVF), RemainingIterations))
3608 continue;
3609 }
3610
3611 if (Result.Width.isScalar() ||
3612 isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking(),
3613 /*IsEpilogue*/ true)) {
3614 Result = NextVF;
3615 BestPlan = &CurrentPlan;
3616 }
3617 }
3618
3619 if (!BestPlan)
3620 return nullptr;
3621
3622 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
3623 << Result.Width << "\n");
3624 std::unique_ptr<VPlan> Clone(BestPlan->duplicate());
3625 Clone->setVF(Result.Width);
3626 return Clone;
3627}
3628
3629unsigned
3630LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
3631 InstructionCost LoopCost) {
3632 // -- The interleave heuristics --
3633 // We interleave the loop in order to expose ILP and reduce the loop overhead.
3634 // There are many micro-architectural considerations that we can't predict
3635 // at this level. For example, frontend pressure (on decode or fetch) due to
3636 // code size, or the number and capabilities of the execution ports.
3637 //
3638 // We use the following heuristics to select the interleave count:
3639 // 1. If the code has reductions, then we interleave to break the cross
3640 // iteration dependency.
3641 // 2. If the loop is really small, then we interleave to reduce the loop
3642 // overhead.
3643 // 3. We don't interleave if we think that we will spill registers to memory
3644 // due to the increased register pressure.
3645
3646 // Only interleave tail-folded loops if wide lane masks are requested, as the
3647 // overhead of multiple instructions to calculate the predicate is likely
3648 // not beneficial. If an epilogue is not allowed for any other reason,
3649 // do not interleave.
3650 if (!CM.isEpilogueAllowed() &&
3651 !(CM.preferTailFoldedLoop() && CM.useWideActiveLaneMask()))
3652 return 1;
3653
3654 if (any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
3655 P: IsaPred<VPCurrentIterationPHIRecipe>)) {
3656 LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
3657 "Unroll factor forced to be 1.\n");
3658 return 1;
3659 }
3660
3661 // We used the distance for the interleave count.
3662 if (!Legal->isSafeForAnyVectorWidth())
3663 return 1;
3664
3665 // We don't attempt to perform interleaving for loops with uncountable early
3666 // exits because the VPInstruction::AnyOf code cannot currently handle
3667 // multiple parts.
3668 if (Plan.hasEarlyExit())
3669 return 1;
3670
3671 const bool HasReductions =
3672 any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
3673 P: IsaPred<VPReductionPHIRecipe>);
3674
3675 // FIXME: implement interleaving for FindLast transform correctly.
3676 if (hasFindLastReductionPhi(Plan))
3677 return 1;
3678
3679 VPRegisterUsage R =
3680 calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore: CM.ValuesToIgnore)[0];
3681
3682 // If we did not calculate the cost for VF (because the user selected the VF)
3683 // then we calculate the cost of VF here.
3684 if (LoopCost == 0) {
3685 if (VF.isScalar())
3686 LoopCost = CM.expectedCost(VF);
3687 else
3688 LoopCost = cost(Plan, VF, RU: &R);
3689 assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
3690
3691 // Loop body is free and there is no need for interleaving.
3692 if (LoopCost == 0)
3693 return 1;
3694 }
3695
3696 // We divide by these constants so assume that we have at least one
3697 // instruction that uses at least one register.
3698 for (auto &Pair : R.MaxLocalUsers) {
3699 Pair.second = std::max(a: Pair.second, b: 1U);
3700 }
3701
3702 // We calculate the interleave count using the following formula.
3703 // Subtract the number of loop invariants from the number of available
3704 // registers. These registers are used by all of the interleaved instances.
3705 // Next, divide the remaining registers by the number of registers that is
3706 // required by the loop, in order to estimate how many parallel instances
3707 // fit without causing spills. All of this is rounded down if necessary to be
3708 // a power of two. We want power of two interleave count to simplify any
3709 // addressing operations or alignment considerations.
3710 // We also want power of two interleave counts to ensure that the induction
3711 // variable of the vector loop wraps to zero, when tail is folded by masking;
3712 // this currently happens when OptForSize, in which case IC is set to 1 above.
3713 unsigned IC = UINT_MAX;
3714
3715 for (const auto &Pair : R.MaxLocalUsers) {
3716 unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
3717 LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
3718 << " registers of "
3719 << TTI.getRegisterClassName(Pair.first)
3720 << " register class\n");
3721 if (VF.isScalar()) {
3722 if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
3723 TargetNumRegisters = ForceTargetNumScalarRegs;
3724 } else {
3725 if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
3726 TargetNumRegisters = ForceTargetNumVectorRegs;
3727 }
3728 unsigned MaxLocalUsers = Pair.second;
3729 unsigned LoopInvariantRegs = 0;
3730 if (R.LoopInvariantRegs.contains(Key: Pair.first))
3731 LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];
3732
3733 unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
3734 MaxLocalUsers);
3735 // Don't count the induction variable as interleaved.
3736 if (EnableIndVarRegisterHeur) {
3737 TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
3738 std::max(a: 1U, b: (MaxLocalUsers - 1)));
3739 }
3740
3741 IC = std::min(a: IC, b: TmpIC);
3742 }
3743
3744 // Clamp the interleave ranges to reasonable counts.
3745 bool HasUnorderedReductions =
3746 HasReductions &&
3747 !any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
3748 P: [](VPRecipeBase &R) {
3749 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
3750 return RedR && RedR->isOrdered();
3751 });
3752 unsigned MaxInterleaveCount =
3753 TTI.getMaxInterleaveFactor(VF, HasUnorderedReductions);
3754 LLVM_DEBUG(dbgs() << "LV: MaxInterleaveFactor for the target is "
3755 << MaxInterleaveCount << "\n");
3756
3757 // Check if the user has overridden the max.
3758 if (VF.isScalar()) {
3759 if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
3760 MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
3761 } else {
3762 if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
3763 MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
3764 }
3765
3766 // Try to get the exact trip count, or an estimate based on profiling data or
3767 // ConstantMax from PSE, failing that.
3768 auto BestKnownTC =
3769 getSmallBestKnownTC(PSE, L: OrigLoop,
3770 /*CanUseConstantMax=*/true,
3771 /*CanExcludeZeroTrips=*/CM.isEpilogueAllowed());
3772
3773 // For fixed length VFs treat a scalable trip count as unknown.
3774 if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
3775 // Re-evaluate trip counts and VFs to be in the same numerical space.
3776 unsigned AvailableTC =
3777 estimateElementCount(VF: *BestKnownTC, VScale: Config.getVScaleForTuning());
3778 unsigned EstimatedVF =
3779 estimateElementCount(VF, VScale: Config.getVScaleForTuning());
3780
3781 // At least one iteration must be scalar when this constraint holds. So the
3782 // maximum available iterations for interleaving is one less.
3783 if (CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()))
3784 --AvailableTC;
3785
3786 unsigned InterleaveCountLB = bit_floor(Value: std::max(
3787 a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));
3788
3789 if (getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop).isNonZero()) {
3790 // If the best known trip count is exact, we select between two
3791 // prospective ICs, where
3792 //
3793 // 1) the aggressive IC is capped by the trip count divided by VF
3794 // 2) the conservative IC is capped by the trip count divided by (VF * 2)
3795 //
3796 // The final IC is selected in a way that the epilogue loop trip count is
3797 // minimized while maximizing the IC itself, so that we either run the
3798 // vector loop at least once if it generates a small epilogue loop, or
3799 // else we run the vector loop at least twice.
3800
3801 unsigned InterleaveCountUB = bit_floor(Value: std::max(
3802 a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
3803 MaxInterleaveCount = InterleaveCountLB;
3804
3805 if (InterleaveCountUB != InterleaveCountLB) {
3806 unsigned TailTripCountUB =
3807 (AvailableTC % (EstimatedVF * InterleaveCountUB));
3808 unsigned TailTripCountLB =
3809 (AvailableTC % (EstimatedVF * InterleaveCountLB));
3810 // If both produce same scalar tail, maximize the IC to do the same work
3811 // in fewer vector loop iterations
3812 if (TailTripCountUB == TailTripCountLB)
3813 MaxInterleaveCount = InterleaveCountUB;
3814 }
3815 } else {
3816 // If trip count is an estimated compile time constant, limit the
3817 // IC to be capped by the trip count divided by VF * 2, such that the
3818 // vector loop runs at least twice to make interleaving seem profitable
3819 // when there is an epilogue loop present. Since exact Trip count is not
3820 // known we choose to be conservative in our IC estimate.
3821 MaxInterleaveCount = InterleaveCountLB;
3822 }
3823 }
3824
3825 assert(MaxInterleaveCount > 0 &&
3826 "Maximum interleave count must be greater than 0");
3827
3828 // Clamp the calculated IC to be between the 1 and the max interleave count
3829 // that the target and trip count allows.
3830 if (IC > MaxInterleaveCount)
3831 IC = MaxInterleaveCount;
3832 else
3833 // Make sure IC is greater than 0.
3834 IC = std::max(a: 1u, b: IC);
3835
3836 assert(IC > 0 && "Interleave count must be greater than 0.");
3837
3838 // Interleave if we vectorized this loop and there is a reduction that could
3839 // benefit from interleaving.
3840 if (VF.isVector() && HasReductions) {
3841 LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
3842 return IC;
3843 }
3844
3845 // For any scalar loop that either requires runtime checks or tail-folding we
3846 // are better off leaving this to the unroller. Note that if we've already
3847 // vectorized the loop we will have done the runtime check and so interleaving
3848 // won't require further checks.
3849 bool ScalarInterleavingRequiresPredication =
3850 (VF.isScalar() && any_of(Range: OrigLoop->blocks(), P: [this](BasicBlock *BB) {
3851 return Legal->blockNeedsPredication(BB);
3852 }));
3853 bool ScalarInterleavingRequiresRuntimePointerCheck =
3854 (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
3855
3856 // We want to interleave small loops in order to reduce the loop overhead and
3857 // potentially expose ILP opportunities.
3858 LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
3859 << "LV: IC is " << IC << '\n'
3860 << "LV: VF is " << VF << '\n');
3861 const bool AggressivelyInterleave =
3862 TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
3863 if (!ScalarInterleavingRequiresRuntimePointerCheck &&
3864 !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
3865 // We assume that the cost overhead is 1 and we use the cost model
3866 // to estimate the cost of the loop and interleave until the cost of the
3867 // loop overhead is about 5% of the cost of the loop.
3868 unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
3869 Value: SmallLoopCost / LoopCost.getValue()));
3870
3871 // Interleave until store/load ports (estimated by max interleave count) are
3872 // saturated.
3873 unsigned NumStores = 0;
3874 unsigned NumLoads = 0;
3875 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
3876 Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
3877 for (VPRecipeBase &R : *VPBB) {
3878 if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: &R)) {
3879 NumLoads++;
3880 continue;
3881 }
3882 if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(Val: &R)) {
3883 NumStores++;
3884 continue;
3885 }
3886
3887 if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R)) {
3888 if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
3889 NumStores += StoreOps;
3890 else
3891 NumLoads += InterleaveR->getNumDefinedValues();
3892 continue;
3893 }
3894 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
3895 NumLoads += isa<LoadInst>(Val: RepR->getUnderlyingInstr());
3896 NumStores += isa<StoreInst>(Val: RepR->getUnderlyingInstr());
3897 continue;
3898 }
3899 if (isa<VPHistogramRecipe>(Val: &R)) {
3900 NumLoads++;
3901 NumStores++;
3902 continue;
3903 }
3904 }
3905 }
3906 unsigned StoresIC = IC / (NumStores ? NumStores : 1);
3907 unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
3908
3909 // There is little point in interleaving for reductions containing selects
3910 // and compares when VF=1 since it may just create more overhead than it's
3911 // worth for loops with small trip counts. This is because we still have to
3912 // do the final reduction after the loop.
3913 bool HasSelectCmpReductions =
3914 HasReductions &&
3915 any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
3916 P: [](VPRecipeBase &R) {
3917 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
3918 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
3919 Kind: RedR->getRecurrenceKind()) ||
3920 RecurrenceDescriptor::isFindIVRecurrenceKind(
3921 Kind: RedR->getRecurrenceKind()));
3922 });
3923 if (HasSelectCmpReductions) {
3924 LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
3925 return 1;
3926 }
3927
3928 // If we have a scalar reduction (vector reductions are already dealt with
3929 // by this point), we can increase the critical path length if the loop
3930 // we're interleaving is inside another loop. For tree-wise reductions
3931 // set the limit to 2, and for ordered reductions it's best to disable
3932 // interleaving entirely.
3933 if (HasReductions && OrigLoop->getLoopDepth() > 1) {
3934 bool HasOrderedReductions =
3935 any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
3936 P: [](VPRecipeBase &R) {
3937 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
3938
3939 return RedR && RedR->isOrdered();
3940 });
3941 if (HasOrderedReductions) {
3942 LLVM_DEBUG(
3943 dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
3944 return 1;
3945 }
3946
3947 unsigned F = MaxNestedScalarReductionIC;
3948 SmallIC = std::min(a: SmallIC, b: F);
3949 StoresIC = std::min(a: StoresIC, b: F);
3950 LoadsIC = std::min(a: LoadsIC, b: F);
3951 }
3952
3953 if (EnableLoadStoreRuntimeInterleave &&
3954 std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
3955 LLVM_DEBUG(
3956 dbgs() << "LV: Interleaving to saturate store or load ports.\n");
3957 return std::max(a: StoresIC, b: LoadsIC);
3958 }
3959
3960 // If there are scalar reductions and TTI has enabled aggressive
3961 // interleaving for reductions, we will interleave to expose ILP.
3962 if (VF.isScalar() && AggressivelyInterleave) {
3963 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
3964 // Interleave no less than SmallIC but not as aggressive as the normal IC
3965 // to satisfy the rare situation when resources are too limited.
3966 return std::max(a: IC / 2, b: SmallIC);
3967 }
3968
3969 LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
3970 return SmallIC;
3971 }
3972
3973 // Interleave if this is a large loop (small loops are already dealt with by
3974 // this point) that could benefit from interleaving.
3975 if (AggressivelyInterleave) {
3976 LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
3977 return IC;
3978 }
3979
3980 LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
3981 return 1;
3982}
3983
3984bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
3985 ElementCount VF) {
3986 // TODO: Cost model for emulated masked load/store is completely
3987 // broken. This hack guides the cost model to use an artificially
3988 // high enough value to practically disable vectorization with such
3989 // operations, except where previously deployed legality hack allowed
3990 // using very low cost values. This is to avoid regressions coming simply
3991 // from moving "masked load/store" check from legality to cost model.
3992 // Masked Load/Gather emulation was previously never allowed.
3993 // Limited number of Masked Store/Scatter emulation was allowed.
3994 assert((isPredicatedInst(I)) &&
3995 "Expecting a scalar emulated instruction");
3996 return isa<LoadInst>(Val: I) ||
3997 (isa<StoreInst>(Val: I) &&
3998 NumPredStores > NumberOfStoresToPredicate);
3999}
4000
4001void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4002 assert(VF.isVector() && "Expected VF >= 2");
4003
4004 // If we've already collected the instructions to scalarize or the predicated
4005 // BBs after vectorization, there's nothing to do. Collection may already have
4006 // occurred if we have a user-selected VF and are now computing the expected
4007 // cost for interleaving.
4008 if (InstsToScalarize.contains(Key: VF) ||
4009 PredicatedBBsAfterVectorization.contains(Val: VF))
4010 return;
4011
4012 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
4013 // not profitable to scalarize any instructions, the presence of VF in the
4014 // map will indicate that we've analyzed it already.
4015 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
4016
4017 // Find all the instructions that are scalar with predication in the loop and
4018 // determine if it would be better to not if-convert the blocks they are in.
4019 // If so, we also record the instructions to scalarize.
4020 for (BasicBlock *BB : TheLoop->blocks()) {
4021 if (!blockNeedsPredicationForAnyReason(BB))
4022 continue;
4023 for (Instruction &I : *BB)
4024 if (isScalarWithPredication(I: &I, VF)) {
4025 ScalarCostsTy ScalarCosts;
4026 // Do not apply discount logic for:
4027 // 1. Scalars after vectorization, as there will only be a single copy
4028 // of the instruction.
4029 // 2. Scalable VF, as that would lead to invalid scalarization costs.
4030 // 3. Emulated masked memrefs, if a hacked cost is needed.
4031 if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
4032 !useEmulatedMaskMemRefHack(I: &I, VF) &&
4033 computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
4034 for (const auto &[I, IC] : ScalarCosts)
4035 ScalarCostsVF.insert(KV: {I, IC});
4036 }
4037 // Remember that BB will remain after vectorization.
4038 PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
4039 for (auto *Pred : predecessors(BB)) {
4040 if (Pred->getSingleSuccessor() == BB)
4041 PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
4042 }
4043 }
4044 }
4045}
4046
4047InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
4048 Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
4049 assert(!isUniformAfterVectorization(PredInst, VF) &&
4050 "Instruction marked uniform-after-vectorization will be predicated");
4051
4052 // Initialize the discount to zero, meaning that the scalar version and the
4053 // vector version cost the same.
4054 InstructionCost Discount = 0;
4055
4056 // Holds instructions to analyze. The instructions we visit are mapped in
4057 // ScalarCosts. Those instructions are the ones that would be scalarized if
4058 // we find that the scalar version costs less.
4059 SmallVector<Instruction *, 8> Worklist;
4060
4061 // Returns true if the given instruction can be scalarized.
4062 auto CanBeScalarized = [&](Instruction *I) -> bool {
4063 // We only attempt to scalarize instructions forming a single-use chain
4064 // from the original predicated block that would otherwise be vectorized.
4065 // Although not strictly necessary, we give up on instructions we know will
4066 // already be scalar to avoid traversing chains that are unlikely to be
4067 // beneficial.
4068 if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
4069 isScalarAfterVectorization(I, VF))
4070 return false;
4071
4072 // If the instruction is scalar with predication, it will be analyzed
4073 // separately. We ignore it within the context of PredInst.
4074 if (isScalarWithPredication(I, VF))
4075 return false;
4076
4077 // If any of the instruction's operands are uniform after vectorization,
4078 // the instruction cannot be scalarized. This prevents, for example, a
4079 // masked load from being scalarized.
4080 //
4081 // We assume we will only emit a value for lane zero of an instruction
4082 // marked uniform after vectorization, rather than VF identical values.
4083 // Thus, if we scalarize an instruction that uses a uniform, we would
4084 // create uses of values corresponding to the lanes we aren't emitting code
4085 // for. This behavior can be changed by allowing getScalarValue to clone
4086 // the lane zero values for uniforms rather than asserting.
4087 for (Use &U : I->operands())
4088 if (auto *J = dyn_cast<Instruction>(Val: U.get()))
4089 if (isUniformAfterVectorization(I: J, VF))
4090 return false;
4091
4092 // Otherwise, we can scalarize the instruction.
4093 return true;
4094 };
4095
4096 // Compute the expected cost discount from scalarizing the entire expression
4097 // feeding the predicated instruction. We currently only consider expressions
4098 // that are single-use instruction chains.
4099 Worklist.push_back(Elt: PredInst);
4100 while (!Worklist.empty()) {
4101 Instruction *I = Worklist.pop_back_val();
4102
4103 // If we've already analyzed the instruction, there's nothing to do.
4104 if (ScalarCosts.contains(Key: I))
4105 continue;
4106
4107 // Cannot scalarize fixed-order recurrence phis at the moment.
4108 if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
4109 continue;
4110
4111 // Compute the cost of the vector instruction. Note that this cost already
4112 // includes the scalarization overhead of the predicated instruction.
4113 InstructionCost VectorCost = getInstructionCost(I, VF);
4114
4115 // Compute the cost of the scalarized instruction. This cost is the cost of
4116 // the instruction as if it wasn't if-converted and instead remained in the
4117 // predicated block. We will scale this cost by block probability after
4118 // computing the scalarization overhead.
4119 InstructionCost ScalarCost =
4120 VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));
4121
4122 // Compute the scalarization overhead of needed insertelement instructions
4123 // and phi nodes.
4124 if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
4125 Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
4126 for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
4127 ScalarCost += TTI.getScalarizationOverhead(
4128 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
4129 /*Insert=*/true,
4130 /*Extract=*/false, CostKind: Config.CostKind);
4131 }
4132 ScalarCost += VF.getFixedValue() *
4133 TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Config.CostKind);
4134 }
4135
4136 // Compute the scalarization overhead of needed extractelement
4137 // instructions. For each of the instruction's operands, if the operand can
4138 // be scalarized, add it to the worklist; otherwise, account for the
4139 // overhead.
4140 for (Use &U : I->operands())
4141 if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
4142 assert(canVectorizeTy(J->getType()) &&
4143 "Instruction has non-scalar type");
4144 if (CanBeScalarized(J))
4145 Worklist.push_back(Elt: J);
4146 else if (needsExtract(V: J, VF)) {
4147 Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
4148 for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
4149 ScalarCost += TTI.getScalarizationOverhead(
4150 Ty: cast<VectorType>(Val: VectorTy),
4151 DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
4152 /*Extract*/ true, CostKind: Config.CostKind);
4153 }
4154 }
4155 }
4156
4157 // Scale the total scalar cost by block probability.
4158 ScalarCost /= getPredBlockCostDivisor(CostKind: Config.CostKind, BB: I->getParent());
4159
4160 // Compute the discount. A non-negative discount means the vector version
4161 // of the instruction costs more, and scalarizing would be beneficial.
4162 Discount += VectorCost - ScalarCost;
4163 ScalarCosts[I] = ScalarCost;
4164 }
4165
4166 return Discount;
4167}
4168
4169InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
4170 InstructionCost Cost;
4171 assert(VF.isScalar() && "must only be called for scalar VFs");
4172
4173 // For each block.
4174 for (BasicBlock *BB : TheLoop->blocks()) {
4175 InstructionCost BlockCost;
4176
4177 // For each instruction in the old loop.
4178 for (Instruction &I : *BB) {
4179 // Skip ignored values.
4180 if (ValuesToIgnore.count(Ptr: &I) ||
4181 (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
4182 continue;
4183
4184 InstructionCost C = getInstructionCost(I: &I, VF);
4185
4186 // Check if we should override the cost.
4187 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
4188 C = InstructionCost(ForceTargetInstructionCost);
4189
4190 BlockCost += C;
4191 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
4192 << VF << " For instruction: " << I << '\n');
4193 }
4194
4195 // In the scalar loop, we may not always execute the predicated block, if it
4196 // is an if-else block. Thus, scale the block's cost by the probability of
4197 // executing it. getPredBlockCostDivisor will return 1 for blocks that are
4198 // only predicated by the header mask when folding the tail.
4199 Cost += BlockCost / getPredBlockCostDivisor(CostKind: Config.CostKind, BB);
4200 }
4201
4202 return Cost;
4203}
4204
4205/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
4206/// according to isAddressSCEVForCost.
4207///
4208/// This SCEV can be sent to the Target in order to estimate the address
4209/// calculation cost.
4210static const SCEV *getAddressAccessSCEV(
4211 Value *Ptr,
4212 PredicatedScalarEvolution &PSE,
4213 const Loop *TheLoop) {
4214 const SCEV *Addr = PSE.getSCEV(V: Ptr);
4215 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L: TheLoop) ? Addr
4216 : nullptr;
4217}
4218
4219InstructionCost
4220LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
4221 ElementCount VF) {
4222 assert(VF.isVector() &&
4223 "Scalarization cost of instruction implies vectorization.");
4224 if (VF.isScalable())
4225 return InstructionCost::getInvalid();
4226
4227 Type *ValTy = getLoadStoreType(I);
4228 auto *SE = PSE.getSE();
4229
4230 unsigned AS = getLoadStoreAddressSpace(I);
4231 Value *Ptr = getLoadStorePointerOperand(V: I);
4232 Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
4233 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
4234 // that it is being called from this specific place.
4235
4236 // Figure out whether the access is strided and get the stride value
4237 // if it's known in compile time
4238 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);
4239
4240 // Get the cost of the scalar memory instruction and address computation.
4241 InstructionCost Cost =
4242 VF.getFixedValue() *
4243 TTI.getAddressComputationCost(PtrTy, SE, Ptr: PtrSCEV, CostKind: Config.CostKind);
4244
4245 // Don't pass *I here, since it is scalar but will actually be part of a
4246 // vectorized loop where the user of it is a vectorized instruction.
4247 const Align Alignment = getLoadStoreAlignment(I);
4248 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
4249 Cost += VF.getFixedValue() *
4250 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy->getScalarType(), Alignment,
4251 AddressSpace: AS, CostKind: Config.CostKind, OpdInfo: OpInfo);
4252
4253 // Get the overhead of the extractelement and insertelement instructions
4254 // we might create due to scalarization.
4255 Cost += getScalarizationOverhead(I, VF);
4256
4257 // If we have a predicated load/store, it will need extra i1 extracts and
4258 // conditional branches, but may not be executed for each vector lane. Scale
4259 // the cost by the probability of executing the predicated block.
4260 if (isPredicatedInst(I)) {
4261 Cost /= getPredBlockCostDivisor(CostKind: Config.CostKind, BB: I->getParent());
4262
4263 // Add the cost of an i1 extract and a branch
4264 auto *VecI1Ty =
4265 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
4266 Cost += TTI.getScalarizationOverhead(
4267 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
4268 /*Insert=*/false, /*Extract=*/true, CostKind: Config.CostKind);
4269 Cost += TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind: Config.CostKind);
4270
4271 if (useEmulatedMaskMemRefHack(I, VF))
4272 // Artificially setting to a high enough value to practically disable
4273 // vectorization with such operations.
4274 Cost = 3000000;
4275 }
4276
4277 return Cost;
4278}
4279
4280InstructionCost
4281LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
4282 ElementCount VF) {
4283 Type *ValTy = getLoadStoreType(I);
4284 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4285 Value *Ptr = getLoadStorePointerOperand(V: I);
4286 unsigned AS = getLoadStoreAddressSpace(I);
4287 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
4288
4289 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
4290 "Stride should be 1 or -1 for consecutive memory access");
4291 const Align Alignment = getLoadStoreAlignment(I);
4292 InstructionCost Cost = 0;
4293 if (isMaskRequired(I)) {
4294 unsigned IID = I->getOpcode() == Instruction::Load
4295 ? Intrinsic::masked_load
4296 : Intrinsic::masked_store;
4297 Cost += TTI.getMemIntrinsicInstrCost(
4298 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS),
4299 CostKind: Config.CostKind);
4300 } else {
4301 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
4302 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
4303 CostKind: Config.CostKind, OpdInfo: OpInfo, I);
4304 }
4305
4306 bool Reverse = ConsecutiveStride < 0;
4307 if (Reverse)
4308 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
4309 SrcTy: VectorTy, Mask: {}, CostKind: Config.CostKind, Index: 0);
4310 return Cost;
4311}
4312
4313InstructionCost
4314LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
4315 ElementCount VF) {
4316 assert(isUniformMemOp(*I, VF));
4317
4318 Type *ValTy = getLoadStoreType(I);
4319 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
4320 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4321 const Align Alignment = getLoadStoreAlignment(I);
4322 unsigned AS = getLoadStoreAddressSpace(I);
4323 if (isa<LoadInst>(Val: I)) {
4324 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
4325 CostKind: Config.CostKind) +
4326 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
4327 CostKind: Config.CostKind) +
4328 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
4329 SrcTy: VectorTy, Mask: {}, CostKind: Config.CostKind);
4330 }
4331 StoreInst *SI = cast<StoreInst>(Val: I);
4332
4333 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
4334 // TODO: We have existing tests that request the cost of extracting element
4335 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
4336 // the actual generated code, which involves extracting the last element of
4337 // a scalable vector where the lane to extract is unknown at compile time.
4338 InstructionCost Cost =
4339 TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind: Config.CostKind) +
4340 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS,
4341 CostKind: Config.CostKind);
4342 if (!IsLoopInvariantStoreValue)
4343 Cost += TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
4344 Val: VectorTy, CostKind: Config.CostKind, Index: 0);
4345 return Cost;
4346}
4347
4348InstructionCost
4349LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
4350 ElementCount VF) {
4351 Type *ValTy = getLoadStoreType(I);
4352 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4353 const Align Alignment = getLoadStoreAlignment(I);
4354 Value *Ptr = getLoadStorePointerOperand(V: I);
4355 Type *PtrTy = Ptr->getType();
4356
4357 if (!isUniform(V: Ptr, VF))
4358 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
4359
4360 unsigned IID = I->getOpcode() == Instruction::Load
4361 ? Intrinsic::masked_gather
4362 : Intrinsic::masked_scatter;
4363 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
4364 CostKind: Config.CostKind) +
4365 TTI.getMemIntrinsicInstrCost(
4366 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Ptr, isMaskRequired(I),
4367 Alignment, I),
4368 CostKind: Config.CostKind);
4369}
4370
4371InstructionCost
4372LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
4373 ElementCount VF) {
4374 const auto *Group = getInterleavedAccessGroup(Instr: I);
4375 assert(Group && "Fail to get an interleaved access group.");
4376
4377 Instruction *InsertPos = Group->getInsertPos();
4378 Type *ValTy = getLoadStoreType(I: InsertPos);
4379 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
4380 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
4381
4382 unsigned InterleaveFactor = Group->getFactor();
4383 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
4384
4385 // Holds the indices of existing members in the interleaved group.
4386 SmallVector<unsigned, 4> Indices;
4387 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
4388 if (Group->getMember(Index: IF))
4389 Indices.push_back(Elt: IF);
4390
4391 // Calculate the cost of the whole interleaved group.
4392 bool UseMaskForGaps =
4393 (Group->requiresScalarEpilogue() && !isEpilogueAllowed()) ||
4394 (isa<StoreInst>(Val: I) && !Group->isFull());
4395 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
4396 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
4397 Alignment: Group->getAlign(), AddressSpace: AS, CostKind: Config.CostKind, UseMaskForCond: isMaskRequired(I),
4398 UseMaskForGaps);
4399
4400 if (Group->isReverse()) {
4401 // TODO: Add support for reversed masked interleaved access.
4402 assert(!isMaskRequired(I) &&
4403 "Reverse masked interleaved access not supported.");
4404 Cost += Group->getNumMembers() *
4405 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
4406 SrcTy: VectorTy, Mask: {}, CostKind: Config.CostKind, Index: 0);
4407 }
4408 return Cost;
4409}
4410
4411std::optional<InstructionCost>
4412LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
4413 ElementCount VF,
4414 Type *Ty) const {
4415 using namespace llvm::PatternMatch;
4416 // Early exit for no inloop reductions
4417 if (Config.getInLoopReductions().empty() || VF.isScalar() ||
4418 !isa<VectorType>(Val: Ty))
4419 return std::nullopt;
4420 auto *VectorTy = cast<VectorType>(Val: Ty);
4421
4422 // We are looking for a pattern of, and finding the minimal acceptable cost:
4423 // reduce(mul(ext(A), ext(B))) or
4424 // reduce(mul(A, B)) or
4425 // reduce(ext(A)) or
4426 // reduce(A).
4427 // The basic idea is that we walk down the tree to do that, finding the root
4428 // reduction instruction in InLoopReductionImmediateChains. From there we find
4429 // the pattern of mul/ext and test the cost of the entire pattern vs the cost
4430 // of the components. If the reduction cost is lower then we return it for the
4431 // reduction instruction and 0 for the other instructions in the pattern. If
4432 // it is not we return an invalid cost specifying the orignal cost method
4433 // should be used.
4434 Instruction *RetI = I;
4435 if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
4436 if (!RetI->hasOneUser())
4437 return std::nullopt;
4438 RetI = RetI->user_back();
4439 }
4440
4441 if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
4442 RetI->user_back()->getOpcode() == Instruction::Add) {
4443 RetI = RetI->user_back();
4444 }
4445
4446 // Test if the found instruction is a reduction, and if not return an invalid
4447 // cost specifying the parent to use the original cost modelling.
4448 Instruction *LastChain = Config.getInLoopReductionImmediateChain(I: RetI);
4449 if (!LastChain)
4450 return std::nullopt;
4451
4452 // Find the reduction this chain is a part of and calculate the basic cost of
4453 // the reduction on its own.
4454 Instruction *ReductionPhi = LastChain;
4455 while (!isa<PHINode>(Val: ReductionPhi))
4456 ReductionPhi = Config.getInLoopReductionImmediateChain(I: ReductionPhi);
4457
4458 const RecurrenceDescriptor &RdxDesc =
4459 Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));
4460
4461 InstructionCost BaseCost;
4462 RecurKind RK = RdxDesc.getRecurrenceKind();
4463 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
4464 Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
4465 BaseCost = TTI.getMinMaxReductionCost(
4466 IID: MinMaxID, Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind: Config.CostKind);
4467 } else {
4468 BaseCost = TTI.getArithmeticReductionCost(Opcode: RdxDesc.getOpcode(), Ty: VectorTy,
4469 FMF: RdxDesc.getFastMathFlags(),
4470 CostKind: Config.CostKind);
4471 }
4472
4473 // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
4474 // normal fmul instruction to the cost of the fadd reduction.
4475 if (RK == RecurKind::FMulAdd)
4476 BaseCost += TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy,
4477 CostKind: Config.CostKind);
4478
4479 // If we're using ordered reductions then we can just return the base cost
4480 // here, since getArithmeticReductionCost calculates the full ordered
4481 // reduction cost when FP reassociation is not allowed.
4482 if (Config.useOrderedReductions(RdxDesc))
4483 return BaseCost;
4484
4485 // Get the operand that was not the reduction chain and match it to one of the
4486 // patterns, returning the better cost if it is found.
4487 Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
4488 ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
4489 : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));
4490
4491 VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);
4492
4493 Instruction *Op0, *Op1;
4494 if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
4495 match(V: RedOp,
4496 P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
4497 match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
4498 Op0->getOpcode() == Op1->getOpcode() &&
4499 Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
4500 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
4501 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
4502
4503 // Matched reduce.add(ext(mul(ext(A), ext(B)))
4504 // Note that the extend opcodes need to all match, or if A==B they will have
4505 // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
4506 // which is equally fine.
4507 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
4508 auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
4509 auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);
4510
4511 InstructionCost ExtCost =
4512 TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
4513 CCH: TTI::CastContextHint::None, CostKind: Config.CostKind, I: Op0);
4514 InstructionCost MulCost =
4515 TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind: Config.CostKind);
4516 InstructionCost Ext2Cost = TTI.getCastInstrCost(
4517 Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType, CCH: TTI::CastContextHint::None,
4518 CostKind: Config.CostKind, I: RedOp);
4519
4520 InstructionCost RedCost = TTI.getMulAccReductionCost(
4521 IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
4522 CostKind: Config.CostKind);
4523
4524 if (RedCost.isValid() &&
4525 RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
4526 return I == RetI ? RedCost : 0;
4527 } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
4528 !TheLoop->isLoopInvariant(V: RedOp)) {
4529 // Matched reduce(ext(A))
4530 bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
4531 auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
4532 InstructionCost RedCost = TTI.getExtendedReductionCost(
4533 Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
4534 FMF: RdxDesc.getFastMathFlags(), CostKind: Config.CostKind);
4535
4536 InstructionCost ExtCost = TTI.getCastInstrCost(
4537 Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType, CCH: TTI::CastContextHint::None,
4538 CostKind: Config.CostKind, I: RedOp);
4539 if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
4540 return I == RetI ? RedCost : 0;
4541 } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
4542 match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
4543 if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
4544 Op0->getOpcode() == Op1->getOpcode() &&
4545 !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
4546 bool IsUnsigned = isa<ZExtInst>(Val: Op0);
4547 Type *Op0Ty = Op0->getOperand(i: 0)->getType();
4548 Type *Op1Ty = Op1->getOperand(i: 0)->getType();
4549 Type *LargestOpTy =
4550 Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
4551 : Op0Ty;
4552 auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);
4553
4554 // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
4555 // different sizes. We take the largest type as the ext to reduce, and add
4556 // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
4557 InstructionCost ExtCost0 = TTI.getCastInstrCost(
4558 Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
4559 CCH: TTI::CastContextHint::None, CostKind: Config.CostKind, I: Op0);
4560 InstructionCost ExtCost1 = TTI.getCastInstrCost(
4561 Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
4562 CCH: TTI::CastContextHint::None, CostKind: Config.CostKind, I: Op1);
4563 InstructionCost MulCost = TTI.getArithmeticInstrCost(
4564 Opcode: Instruction::Mul, Ty: VectorTy, CostKind: Config.CostKind);
4565
4566 InstructionCost RedCost = TTI.getMulAccReductionCost(
4567 IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
4568 CostKind: Config.CostKind);
4569 InstructionCost ExtraExtCost = 0;
4570 if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
4571 Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
4572 ExtraExtCost = TTI.getCastInstrCost(
4573 Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
4574 Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
4575 CCH: TTI::CastContextHint::None, CostKind: Config.CostKind, I: ExtraExtOp);
4576 }
4577
4578 if (RedCost.isValid() &&
4579 (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
4580 return I == RetI ? RedCost : 0;
4581 } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
4582 // Matched reduce.add(mul())
4583 InstructionCost MulCost = TTI.getArithmeticInstrCost(
4584 Opcode: Instruction::Mul, Ty: VectorTy, CostKind: Config.CostKind);
4585
4586 InstructionCost RedCost = TTI.getMulAccReductionCost(
4587 IsUnsigned: true, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy,
4588 CostKind: Config.CostKind);
4589
4590 if (RedCost.isValid() && RedCost < MulCost + BaseCost)
4591 return I == RetI ? RedCost : 0;
4592 }
4593 }
4594
4595 return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
4596}
4597
4598InstructionCost
4599LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
4600 ElementCount VF) {
4601 // Calculate scalar cost only. Vectorization cost should be ready at this
4602 // moment.
4603 if (VF.isScalar()) {
4604 Type *ValTy = getLoadStoreType(I);
4605 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
4606 const Align Alignment = getLoadStoreAlignment(I);
4607 unsigned AS = getLoadStoreAddressSpace(I);
4608
4609 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
4610 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr,
4611 CostKind: Config.CostKind) +
4612 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS,
4613 CostKind: Config.CostKind, OpdInfo: OpInfo, I);
4614 }
4615 return getWideningCost(I, VF);
4616}
4617
4618InstructionCost
4619LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
4620 ElementCount VF) const {
4621
4622 // There is no mechanism yet to create a scalable scalarization loop,
4623 // so this is currently Invalid.
4624 if (VF.isScalable())
4625 return InstructionCost::getInvalid();
4626
4627 if (VF.isScalar())
4628 return 0;
4629
4630 InstructionCost Cost = 0;
4631 Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
4632 if (!RetTy->isVoidTy() &&
4633 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {
4634
4635 TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None;
4636 if (isa<LoadInst>(Val: I))
4637 VIC = TTI::VectorInstrContext::Load;
4638 else if (isa<StoreInst>(Val: I))
4639 VIC = TTI::VectorInstrContext::Store;
4640
4641 for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
4642 Cost += TTI.getScalarizationOverhead(
4643 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
4644 /*Insert=*/true, /*Extract=*/false, CostKind: Config.CostKind,
4645 /*ForPoisonSrc=*/true, VL: {}, VIC);
4646 }
4647 }
4648
4649 // Some targets keep addresses scalar.
4650 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
4651 return Cost;
4652
4653 // Some targets support efficient element stores.
4654 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
4655 return Cost;
4656
4657 // Collect operands to consider.
4658 CallInst *CI = dyn_cast<CallInst>(Val: I);
4659 Instruction::op_range Ops = CI ? CI->args() : I->operands();
4660
4661 // Skip operands that do not require extraction/scalarization and do not incur
4662 // any overhead.
4663 SmallVector<Type *> Tys;
4664 for (auto *V : filterExtractingOperands(Ops, VF))
4665 Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));
4666
4667 TTI::VectorInstrContext OperandVIC = isa<StoreInst>(Val: I)
4668 ? TTI::VectorInstrContext::Store
4669 : TTI::VectorInstrContext::None;
4670 return Cost +
4671 TTI.getOperandsScalarizationOverhead(Tys, CostKind: Config.CostKind, VIC: OperandVIC);
4672}
4673
4674void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
4675 if (VF.isScalar())
4676 return;
4677
4678 // TODO: We should generate better code and update the cost model for
4679 // predicated uniform stores. Today they are treated as any other
4680 // predicated store (see added test cases in
4681 // invariant-store-vectorization.ll).
4682 NumPredStores = 0;
4683 for (BasicBlock *BB : TheLoop->blocks())
4684 for (Instruction &I : *BB)
4685 if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
4686 ++NumPredStores;
4687
4688 for (BasicBlock *BB : TheLoop->blocks()) {
4689 // For each instruction in the old loop.
4690 for (Instruction &I : *BB) {
4691 Value *Ptr = getLoadStorePointerOperand(V: &I);
4692 if (!Ptr)
4693 continue;
4694
4695 if (isUniformMemOp(I, VF)) {
4696 auto IsLegalToScalarize = [&]() {
4697 if (!VF.isScalable())
4698 // Scalarization of fixed length vectors "just works".
4699 return true;
4700
4701 // We have dedicated lowering for unpredicated uniform loads and
4702 // stores. Note that even with tail folding we know that at least
4703 // one lane is active (i.e. generalized predication is not possible
4704 // here), and the logic below depends on this fact.
4705 if (!foldTailByMasking())
4706 return true;
4707
4708 // For scalable vectors, a uniform memop load is always
4709 // uniform-by-parts and we know how to scalarize that.
4710 if (isa<LoadInst>(Val: I))
4711 return true;
4712
4713 // A uniform store isn't neccessarily uniform-by-part
4714 // and we can't assume scalarization.
4715 auto &SI = cast<StoreInst>(Val&: I);
4716 return TheLoop->isLoopInvariant(V: SI.getValueOperand());
4717 };
4718
4719 const InstructionCost GatherScatterCost =
4720 Config.isLegalGatherOrScatter(V: &I, VF)
4721 ? getGatherScatterCost(I: &I, VF)
4722 : InstructionCost::getInvalid();
4723
4724 // Load: Scalar load + broadcast
4725 // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
4726 // FIXME: This cost is a significant under-estimate for tail folded
4727 // memory ops.
4728 const InstructionCost ScalarizationCost =
4729 IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
4730 : InstructionCost::getInvalid();
4731
4732 // Choose better solution for the current VF, Note that Invalid
4733 // costs compare as maximumal large. If both are invalid, we get
4734 // scalable invalid which signals a failure and a vectorization abort.
4735 if (GatherScatterCost < ScalarizationCost)
4736 setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
4737 else
4738 setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
4739 continue;
4740 }
4741
4742 // We assume that widening is the best solution when possible.
4743 if (memoryInstructionCanBeWidened(I: &I, VF)) {
4744 InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
4745 int ConsecutiveStride = Legal->isConsecutivePtr(
4746 AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
4747 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
4748 "Expected consecutive stride.");
4749 InstWidening Decision =
4750 ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
4751 setWideningDecision(I: &I, VF, W: Decision, Cost);
4752 continue;
4753 }
4754
4755 // Choose between Interleaving, Gather/Scatter or Scalarization.
4756 InstructionCost InterleaveCost = InstructionCost::getInvalid();
4757 unsigned NumAccesses = 1;
4758 if (isAccessInterleaved(Instr: &I)) {
4759 const auto *Group = getInterleavedAccessGroup(Instr: &I);
4760 assert(Group && "Fail to get an interleaved access group.");
4761
4762 // Make one decision for the whole group.
4763 if (getWideningDecision(I: &I, VF) != CM_Unknown)
4764 continue;
4765
4766 NumAccesses = Group->getNumMembers();
4767 if (interleavedAccessCanBeWidened(I: &I, VF))
4768 InterleaveCost = getInterleaveGroupCost(I: &I, VF);
4769 }
4770
4771 InstructionCost GatherScatterCost =
4772 Config.isLegalGatherOrScatter(V: &I, VF)
4773 ? getGatherScatterCost(I: &I, VF) * NumAccesses
4774 : InstructionCost::getInvalid();
4775
4776 InstructionCost ScalarizationCost =
4777 getMemInstScalarizationCost(I: &I, VF) * NumAccesses;
4778
4779 // Choose better solution for the current VF,
4780 // write down this decision and use it during vectorization.
4781 InstructionCost Cost;
4782 InstWidening Decision;
4783 if (InterleaveCost <= GatherScatterCost &&
4784 InterleaveCost < ScalarizationCost) {
4785 Decision = CM_Interleave;
4786 Cost = InterleaveCost;
4787 } else if (GatherScatterCost < ScalarizationCost) {
4788 Decision = CM_GatherScatter;
4789 Cost = GatherScatterCost;
4790 } else {
4791 Decision = CM_Scalarize;
4792 Cost = ScalarizationCost;
4793 }
4794 // If the instructions belongs to an interleave group, the whole group
4795 // receives the same decision. The whole group receives the cost, but
4796 // the cost will actually be assigned to one instruction.
4797 if (const auto *Group = getInterleavedAccessGroup(Instr: &I)) {
4798 if (Decision == CM_Scalarize) {
4799 for (Instruction *I : Group->members())
4800 setWideningDecision(I, VF, W: Decision,
4801 Cost: getMemInstScalarizationCost(I, VF));
4802 } else {
4803 setWideningDecision(Grp: Group, VF, W: Decision, Cost);
4804 }
4805 } else
4806 setWideningDecision(I: &I, VF, W: Decision, Cost);
4807 }
4808 }
4809
4810 // Make sure that any load of address and any other address computation
4811 // remains scalar unless there is gather/scatter support. This avoids
4812 // inevitable extracts into address registers, and also has the benefit of
4813 // activating LSR more, since that pass can't optimize vectorized
4814 // addresses.
4815 if (TTI.prefersVectorizedAddressing())
4816 return;
4817
4818 // Start with all scalar pointer uses.
4819 SmallSetVector<Instruction *, 8> AddrDefs;
4820 for (BasicBlock *BB : TheLoop->blocks())
4821 for (Instruction &I : *BB) {
4822 Instruction *PtrDef =
4823 dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
4824 if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
4825 getWideningDecision(I: &I, VF) != CM_GatherScatter)
4826 AddrDefs.insert(X: PtrDef);
4827 }
4828
4829 // Add all instructions used to generate the addresses.
4830 SmallVector<Instruction *, 4> Worklist;
4831 append_range(C&: Worklist, R&: AddrDefs);
4832 while (!Worklist.empty()) {
4833 Instruction *I = Worklist.pop_back_val();
4834 for (auto &Op : I->operands())
4835 if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
4836 if (TheLoop->contains(Inst: InstOp) && !isa<PHINode>(Val: InstOp) &&
4837 AddrDefs.insert(X: InstOp))
4838 Worklist.push_back(Elt: InstOp);
4839 }
4840
4841 auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
4842 // If there are direct memory op users of the newly scalarized load,
4843 // their cost may have changed because there's no scalarization
4844 // overhead for the operand. Update it.
4845 for (User *U : LI->users()) {
4846 if (!isa<LoadInst, StoreInst>(Val: U))
4847 continue;
4848 if (getWideningDecision(I: cast<Instruction>(Val: U), VF) != CM_Scalarize)
4849 continue;
4850 setWideningDecision(
4851 I: cast<Instruction>(Val: U), VF, W: CM_Scalarize,
4852 Cost: getMemInstScalarizationCost(I: cast<Instruction>(Val: U), VF));
4853 }
4854 };
4855 for (auto *I : AddrDefs) {
4856 if (isa<LoadInst>(Val: I)) {
4857 // Setting the desired widening decision should ideally be handled in
4858 // by cost functions, but since this involves the task of finding out
4859 // if the loaded register is involved in an address computation, it is
4860 // instead changed here when we know this is the case.
4861 InstWidening Decision = getWideningDecision(I, VF);
4862 if (!isPredicatedInst(I) &&
4863 (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
4864 (!isUniformMemOp(I&: *I, VF) && Decision == CM_Scalarize))) {
4865 // Scalarize a widened load of address or update the cost of a scalar
4866 // load of an address.
4867 setWideningDecision(
4868 I, VF, W: CM_Scalarize,
4869 Cost: (VF.getKnownMinValue() *
4870 getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
4871 UpdateMemOpUserCost(cast<LoadInst>(Val: I));
4872 } else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
4873 // Scalarize all members of this interleaved group when any member
4874 // is used as an address. The address-used load skips scalarization
4875 // overhead, other members include it.
4876 for (Instruction *Member : Group->members()) {
4877 InstructionCost Cost = AddrDefs.contains(key: Member)
4878 ? (VF.getKnownMinValue() *
4879 getMemoryInstructionCost(
4880 I: Member, VF: ElementCount::getFixed(MinVal: 1)))
4881 : getMemInstScalarizationCost(I: Member, VF);
4882 setWideningDecision(I: Member, VF, W: CM_Scalarize, Cost);
4883 UpdateMemOpUserCost(cast<LoadInst>(Val: Member));
4884 }
4885 }
4886 } else {
4887 // Cannot scalarize fixed-order recurrence phis at the moment.
4888 if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
4889 continue;
4890
4891 // Make sure I gets scalarized and a cost estimate without
4892 // scalarization overhead.
4893 ForcedScalars[VF].insert(Ptr: I);
4894 }
4895 }
4896}
4897
4898bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
4899 if (!Legal->isInvariant(V: Op))
4900 return false;
4901 // Consider Op invariant, if it or its operands aren't predicated
4902 // instruction in the loop. In that case, it is not trivially hoistable.
4903 auto *OpI = dyn_cast<Instruction>(Val: Op);
4904 return !OpI || !TheLoop->contains(Inst: OpI) ||
4905 (!isPredicatedInst(I: OpI) &&
4906 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
4907 all_of(Range: OpI->operands(),
4908 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
4909}
4910
4911InstructionCost
4912LoopVectorizationCostModel::getInstructionCost(Instruction *I,
4913 ElementCount VF) {
4914 // If we know that this instruction will remain uniform, check the cost of
4915 // the scalar version.
4916 if (isUniformAfterVectorization(I, VF))
4917 VF = ElementCount::getFixed(MinVal: 1);
4918
4919 if (VF.isVector() && isProfitableToScalarize(I, VF))
4920 return InstsToScalarize[VF][I];
4921
4922 // Forced scalars do not have any scalarization overhead.
4923 auto ForcedScalar = ForcedScalars.find(Val: VF);
4924 if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
4925 auto InstSet = ForcedScalar->second;
4926 if (InstSet.count(Ptr: I))
4927 return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
4928 VF.getKnownMinValue();
4929 }
4930
4931 const auto &MinBWs = Config.getMinimalBitwidths();
4932 uint64_t InstrMinBWs = MinBWs.lookup(Key: I);
4933 Type *RetTy = I->getType();
4934 if (canTruncateToMinimalBitwidth(I, VF))
4935 RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: InstrMinBWs);
4936 auto *SE = PSE.getSE();
4937
4938 Type *VectorTy;
4939 if (isScalarAfterVectorization(I, VF)) {
4940 [[maybe_unused]] auto HasSingleCopyAfterVectorization =
4941 [this](Instruction *I, ElementCount VF) -> bool {
4942 if (VF.isScalar())
4943 return true;
4944
4945 auto Scalarized = InstsToScalarize.find(Key: VF);
4946 assert(Scalarized != InstsToScalarize.end() &&
4947 "VF not yet analyzed for scalarization profitability");
4948 return !Scalarized->second.count(Key: I) &&
4949 llvm::all_of(Range: I->users(), P: [&](User *U) {
4950 auto *UI = cast<Instruction>(Val: U);
4951 return !Scalarized->second.count(Key: UI);
4952 });
4953 };
4954
4955 // With the exception of GEPs and PHIs, after scalarization there should
4956 // only be one copy of the instruction generated in the loop. This is
4957 // because the VF is either 1, or any instructions that need scalarizing
4958 // have already been dealt with by the time we get here. As a result,
4959 // it means we don't have to multiply the instruction cost by VF.
4960 assert(I->getOpcode() == Instruction::GetElementPtr ||
4961 I->getOpcode() == Instruction::PHI ||
4962 (I->getOpcode() == Instruction::BitCast &&
4963 I->getType()->isPointerTy()) ||
4964 HasSingleCopyAfterVectorization(I, VF));
4965 VectorTy = RetTy;
4966 } else
4967 VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);
4968
4969 if (VF.isVector() && VectorTy->isVectorTy() &&
4970 !TTI.getNumberOfParts(Tp: VectorTy))
4971 return InstructionCost::getInvalid();
4972
4973 // TODO: We need to estimate the cost of intrinsic calls.
4974 switch (I->getOpcode()) {
4975 case Instruction::GetElementPtr:
4976 // We mark this instruction as zero-cost because the cost of GEPs in
4977 // vectorized code depends on whether the corresponding memory instruction
4978 // is scalarized or not. Therefore, we handle GEPs with the memory
4979 // instruction cost.
4980 return 0;
4981 case Instruction::UncondBr:
4982 case Instruction::CondBr: {
4983 // In cases of scalarized and predicated instructions, there will be VF
4984 // predicated blocks in the vectorized loop. Each branch around these
4985 // blocks requires also an extract of its vector compare i1 element.
4986 // Note that the conditional branch from the loop latch will be replaced by
4987 // a single branch controlling the loop, so there is no extra overhead from
4988 // scalarization.
4989 bool ScalarPredicatedBB = false;
4990 CondBrInst *BI = dyn_cast<CondBrInst>(Val: I);
4991 if (VF.isVector() && BI &&
4992 (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
4993 PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
4994 BI->getParent() != TheLoop->getLoopLatch())
4995 ScalarPredicatedBB = true;
4996
4997 if (ScalarPredicatedBB) {
4998 // Not possible to scalarize scalable vector with predicated instructions.
4999 if (VF.isScalable())
5000 return InstructionCost::getInvalid();
5001 // Return cost for branches around scalarized and predicated blocks.
5002 auto *VecI1Ty =
5003 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
5004 return (TTI.getScalarizationOverhead(
5005 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5006 /*Insert*/ false, /*Extract*/ true, CostKind: Config.CostKind) +
5007 (TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind: Config.CostKind) *
5008 VF.getFixedValue()));
5009 }
5010
5011 if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
5012 // The back-edge branch will remain, as will all scalar branches.
5013 return TTI.getCFInstrCost(Opcode: Instruction::UncondBr, CostKind: Config.CostKind);
5014
5015 // This branch will be eliminated by if-conversion.
5016 return 0;
5017 // Note: We currently assume zero cost for an unconditional branch inside
5018 // a predicated block since it will become a fall-through, although we
5019 // may decide in the future to call TTI for all branches.
5020 }
5021 case Instruction::Switch: {
5022 if (VF.isScalar())
5023 return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind: Config.CostKind);
5024 auto *Switch = cast<SwitchInst>(Val: I);
5025 return Switch->getNumCases() *
5026 TTI.getCmpSelInstrCost(
5027 Opcode: Instruction::ICmp,
5028 ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
5029 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
5030 VecPred: CmpInst::ICMP_EQ, CostKind: Config.CostKind);
5031 }
5032 case Instruction::PHI: {
5033 auto *Phi = cast<PHINode>(Val: I);
5034
5035 // First-order recurrences are replaced by vector shuffles inside the loop.
5036 if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
5037 return TTI.getShuffleCost(
5038 Kind: TargetTransformInfo::SK_Splice, DstTy: cast<VectorType>(Val: VectorTy),
5039 SrcTy: cast<VectorType>(Val: VectorTy), Mask: {}, CostKind: Config.CostKind, Index: -1);
5040 }
5041
5042 // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
5043 // converted into select instructions. We require N - 1 selects per phi
5044 // node, where N is the number of incoming values.
5045 if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
5046 Type *ResultTy = Phi->getType();
5047
5048 // All instructions in an Any-of reduction chain are narrowed to bool.
5049 // Check if that is the case for this phi node.
5050 auto *HeaderUser = cast_if_present<PHINode>(
5051 Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
5052 auto *Phi = dyn_cast<PHINode>(Val: U);
5053 if (Phi && Phi->getParent() == TheLoop->getHeader())
5054 return Phi;
5055 return nullptr;
5056 }));
5057 if (HeaderUser) {
5058 auto &ReductionVars = Legal->getReductionVars();
5059 auto Iter = ReductionVars.find(Key: HeaderUser);
5060 if (Iter != ReductionVars.end() &&
5061 RecurrenceDescriptor::isAnyOfRecurrenceKind(
5062 Kind: Iter->second.getRecurrenceKind()))
5063 ResultTy = Type::getInt1Ty(C&: Phi->getContext());
5064 }
5065 return (Phi->getNumIncomingValues() - 1) *
5066 TTI.getCmpSelInstrCost(
5067 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
5068 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
5069 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind: Config.CostKind);
5070 }
5071
5072 // When tail folding with EVL, if the phi is part of an out of loop
5073 // reduction then it will be transformed into a wide vp_merge.
5074 if (VF.isVector() && foldTailWithEVL() &&
5075 Legal->getReductionVars().contains(Key: Phi) &&
5076 !Config.isInLoopReduction(Phi)) {
5077 IntrinsicCostAttributes ICA(
5078 Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
5079 {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
5080 return TTI.getIntrinsicInstrCost(ICA, CostKind: Config.CostKind);
5081 }
5082
5083 return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind: Config.CostKind);
5084 }
5085 case Instruction::UDiv:
5086 case Instruction::SDiv:
5087 case Instruction::URem:
5088 case Instruction::SRem:
5089 if (VF.isVector() && isPredicatedInst(I)) {
5090 const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF);
5091 return isDivRemScalarWithPredication(ScalarCost, MaskedCost) ? ScalarCost
5092 : MaskedCost;
5093 }
5094 // We've proven all lanes safe to speculate, fall through.
5095 [[fallthrough]];
5096 case Instruction::Add:
5097 case Instruction::Sub: {
5098 auto Info = Legal->getHistogramInfo(I);
5099 if (Info && VF.isVector()) {
5100 const HistogramInfo *HGram = Info.value();
5101 // Assume that a non-constant update value (or a constant != 1) requires
5102 // a multiply, and add that into the cost.
5103 InstructionCost MulCost = TTI::TCC_Free;
5104 ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
5105 if (!RHS || RHS->getZExtValue() != 1)
5106 MulCost = TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy,
5107 CostKind: Config.CostKind);
5108
5109 // Find the cost of the histogram operation itself.
5110 Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
5111 Type *ScalarTy = I->getType();
5112 Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
5113 IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
5114 Type::getVoidTy(C&: I->getContext()),
5115 {PtrTy, ScalarTy, MaskTy});
5116
5117 // Add the costs together with the add/sub operation.
5118 return TTI.getIntrinsicInstrCost(ICA, CostKind: Config.CostKind) + MulCost +
5119 TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy,
5120 CostKind: Config.CostKind);
5121 }
5122 [[fallthrough]];
5123 }
5124 case Instruction::FAdd:
5125 case Instruction::FSub:
5126 case Instruction::Mul:
5127 case Instruction::FMul:
5128 case Instruction::FDiv:
5129 case Instruction::FRem:
5130 case Instruction::Shl:
5131 case Instruction::LShr:
5132 case Instruction::AShr:
5133 case Instruction::And:
5134 case Instruction::Or:
5135 case Instruction::Xor: {
5136 // If we're speculating on the stride being 1, the multiplication may
5137 // fold away. We can generalize this for all operations using the notion
5138 // of neutral elements. (TODO)
5139 if (I->getOpcode() == Instruction::Mul &&
5140 ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
5141 PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
5142 (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
5143 PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
5144 return 0;
5145
5146 // Detect reduction patterns
5147 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
5148 return *RedCost;
5149
5150 // Certain instructions can be cheaper to vectorize if they have a constant
5151 // second vector operand. One example of this are shifts on x86.
5152 Value *Op2 = I->getOperand(i: 1);
5153 if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
5154 PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
5155 isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
5156 Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
5157 }
5158 auto Op2Info = TTI.getOperandInfo(V: Op2);
5159 if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
5160 shouldConsiderInvariant(Op: Op2))
5161 Op2Info.Kind = TargetTransformInfo::OK_UniformValue;
5162
5163 SmallVector<const Value *, 4> Operands(I->operand_values());
5164 return TTI.getArithmeticInstrCost(
5165 Opcode: I->getOpcode(), Ty: VectorTy, CostKind: Config.CostKind,
5166 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
5167 Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
5168 }
5169 case Instruction::FNeg: {
5170 return TTI.getArithmeticInstrCost(
5171 Opcode: I->getOpcode(), Ty: VectorTy, CostKind: Config.CostKind,
5172 Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
5173 Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
5174 Args: I->getOperand(i: 0), CxtI: I);
5175 }
5176 case Instruction::Select: {
5177 SelectInst *SI = cast<SelectInst>(Val: I);
5178 const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
5179 bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));
5180
5181 const Value *Op0, *Op1;
5182 using namespace llvm::PatternMatch;
5183 if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
5184 match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
5185 // select x, y, false --> x & y
5186 // select x, true, y --> x | y
5187 const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
5188 const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
5189 assert(Op0->getType()->getScalarSizeInBits() == 1 &&
5190 Op1->getType()->getScalarSizeInBits() == 1);
5191
5192 return TTI.getArithmeticInstrCost(
5193 Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And,
5194 Ty: VectorTy, CostKind: Config.CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: {Op0, Op1},
5195 CxtI: I);
5196 }
5197
5198 Type *CondTy = SI->getCondition()->getType();
5199 if (!ScalarCond)
5200 CondTy = VectorType::get(ElementType: CondTy, EC: VF);
5201
5202 CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
5203 if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
5204 Pred = Cmp->getPredicate();
5205 return TTI.getCmpSelInstrCost(
5206 Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred, CostKind: Config.CostKind,
5207 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
5208 }
5209 case Instruction::ICmp:
5210 case Instruction::FCmp: {
5211 Type *ValTy = I->getOperand(i: 0)->getType();
5212
5213 if (canTruncateToMinimalBitwidth(I, VF)) {
5214 [[maybe_unused]] Instruction *Op0AsInstruction =
5215 dyn_cast<Instruction>(Val: I->getOperand(i: 0));
5216 assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
5217 InstrMinBWs == MinBWs.lookup(Op0AsInstruction)) &&
5218 "if both the operand and the compare are marked for "
5219 "truncation, they must have the same bitwidth");
5220 ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: InstrMinBWs);
5221 }
5222
5223 VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
5224 return TTI.getCmpSelInstrCost(
5225 Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
5226 VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind: Config.CostKind,
5227 Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
5228 }
5229 case Instruction::Store:
5230 case Instruction::Load: {
5231 ElementCount Width = VF;
5232 if (Width.isVector()) {
5233 InstWidening Decision = getWideningDecision(I, VF: Width);
5234 assert(Decision != CM_Unknown &&
5235 "CM decision should be taken at this point");
5236 if (getWideningCost(I, VF) == InstructionCost::getInvalid())
5237 return InstructionCost::getInvalid();
5238 if (Decision == CM_Scalarize)
5239 Width = ElementCount::getFixed(MinVal: 1);
5240 }
5241 VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
5242 return getMemoryInstructionCost(I, VF);
5243 }
5244 case Instruction::BitCast:
5245 if (I->getType()->isPointerTy())
5246 return 0;
5247 [[fallthrough]];
5248 case Instruction::ZExt:
5249 case Instruction::SExt:
5250 case Instruction::FPToUI:
5251 case Instruction::FPToSI:
5252 case Instruction::FPExt:
5253 case Instruction::PtrToInt:
5254 case Instruction::IntToPtr:
5255 case Instruction::SIToFP:
5256 case Instruction::UIToFP:
5257 case Instruction::Trunc:
5258 case Instruction::FPTrunc: {
5259 // Computes the CastContextHint from a Load/Store instruction.
5260 auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
5261 assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
5262 "Expected a load or a store!");
5263
5264 if (VF.isScalar() || !TheLoop->contains(Inst: I))
5265 return TTI::CastContextHint::Normal;
5266
5267 switch (getWideningDecision(I, VF)) {
5268 case LoopVectorizationCostModel::CM_GatherScatter:
5269 return TTI::CastContextHint::GatherScatter;
5270 case LoopVectorizationCostModel::CM_Interleave:
5271 return TTI::CastContextHint::Interleave;
5272 case LoopVectorizationCostModel::CM_Scalarize:
5273 case LoopVectorizationCostModel::CM_Widen:
5274 return isPredicatedInst(I) ? TTI::CastContextHint::Masked
5275 : TTI::CastContextHint::Normal;
5276 case LoopVectorizationCostModel::CM_Widen_Reverse:
5277 return TTI::CastContextHint::Reversed;
5278 case LoopVectorizationCostModel::CM_Unknown:
5279 llvm_unreachable("Instr did not go through cost modelling?");
5280 case LoopVectorizationCostModel::CM_InvalidatedDecision:
5281 return TTI::CastContextHint::None;
5282 }
5283
5284 llvm_unreachable("Unhandled case!");
5285 };
5286
5287 unsigned Opcode = I->getOpcode();
5288 TTI::CastContextHint CCH = TTI::CastContextHint::None;
5289 // For Trunc, the context is the only user, which must be a StoreInst.
5290 if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
5291 if (I->hasOneUse())
5292 if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
5293 CCH = ComputeCCH(Store);
5294 }
5295 // For Z/Sext, the context is the operand, which must be a LoadInst.
5296 else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
5297 Opcode == Instruction::FPExt) {
5298 if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
5299 CCH = ComputeCCH(Load);
5300 }
5301
5302 // We optimize the truncation of induction variables having constant
5303 // integer steps. The cost of these truncations is the same as the scalar
5304 // operation.
5305 if (isOptimizableIVTruncate(I, VF)) {
5306 auto *Trunc = cast<TruncInst>(Val: I);
5307 return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
5308 Src: Trunc->getSrcTy(), CCH, CostKind: Config.CostKind,
5309 I: Trunc);
5310 }
5311
5312 // Detect reduction patterns
5313 if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
5314 return *RedCost;
5315
5316 Type *SrcScalarTy = I->getOperand(i: 0)->getType();
5317 Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
5318 if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
5319 SrcScalarTy = IntegerType::get(C&: SrcScalarTy->getContext(),
5320 NumBits: MinBWs.lookup(Key: Op0AsInstruction));
5321 Type *SrcVecTy =
5322 VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;
5323
5324 if (canTruncateToMinimalBitwidth(I, VF)) {
5325 // If the result type is <= the source type, there will be no extend
5326 // after truncating the users to the minimal required bitwidth.
5327 if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
5328 (I->getOpcode() == Instruction::ZExt ||
5329 I->getOpcode() == Instruction::SExt))
5330 return 0;
5331 }
5332
5333 return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH,
5334 CostKind: Config.CostKind, I);
5335 }
5336 case Instruction::Call:
5337 return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
5338 case Instruction::ExtractValue:
5339 return TTI.getInstructionCost(U: I, CostKind: Config.CostKind);
5340 case Instruction::Alloca:
5341 // We cannot easily widen alloca to a scalable alloca, as
5342 // the result would need to be a vector of pointers.
5343 if (VF.isScalable())
5344 return InstructionCost::getInvalid();
5345 return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: RetTy, CostKind: Config.CostKind);
5346 case Instruction::Freeze:
5347 return TTI::TCC_Free;
5348 default:
5349 // This opcode is unknown. Assume that it is the same as 'mul'.
5350 return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy,
5351 CostKind: Config.CostKind);
5352 } // end of switch.
5353}
5354
5355void LoopVectorizationCostModel::collectValuesToIgnore() {
5356 // Ignore ephemeral values.
5357 CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);
5358
5359 SmallVector<Value *, 4> DeadInterleavePointerOps;
5360 SmallVector<Value *, 4> DeadOps;
5361
5362 // If a scalar epilogue is required, users outside the loop won't use
5363 // live-outs from the vector loop but from the scalar epilogue. Ignore them if
5364 // that is the case.
5365 bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
5366 auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
5367 return RequiresScalarEpilogue &&
5368 !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
5369 };
5370
5371 LoopBlocksDFS DFS(TheLoop);
5372 DFS.perform(LI);
5373 for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
5374 for (Instruction &I : reverse(C&: *BB)) {
5375 if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
5376 continue;
5377
5378 // Add instructions that would be trivially dead and are only used by
5379 // values already ignored to DeadOps to seed worklist.
5380 if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
5381 all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
5382 return VecValuesToIgnore.contains(Ptr: U) ||
5383 ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
5384 }))
5385 DeadOps.push_back(Elt: &I);
5386
5387 // For interleave groups, we only create a pointer for the start of the
5388 // interleave group. Queue up addresses of group members except the insert
5389 // position for further processing.
5390 if (isAccessInterleaved(Instr: &I)) {
5391 auto *Group = getInterleavedAccessGroup(Instr: &I);
5392 if (Group->getInsertPos() == &I)
5393 continue;
5394 Value *PointerOp = getLoadStorePointerOperand(V: &I);
5395 DeadInterleavePointerOps.push_back(Elt: PointerOp);
5396 }
5397
5398 // Queue branches for analysis. They are dead, if their successors only
5399 // contain dead instructions.
5400 if (isa<CondBrInst>(Val: &I))
5401 DeadOps.push_back(Elt: &I);
5402 }
5403
5404 // Mark ops feeding interleave group members as free, if they are only used
5405 // by other dead computations.
5406 for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
5407 auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
5408 if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
5409 Instruction *UI = cast<Instruction>(Val: U);
5410 return !VecValuesToIgnore.contains(Ptr: U) &&
5411 (!isAccessInterleaved(Instr: UI) ||
5412 getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
5413 }))
5414 continue;
5415 VecValuesToIgnore.insert(Ptr: Op);
5416 append_range(C&: DeadInterleavePointerOps, R: Op->operands());
5417 }
5418
5419 // Mark ops that would be trivially dead and are only used by ignored
5420 // instructions as free.
5421 BasicBlock *Header = TheLoop->getHeader();
5422
5423 // Returns true if the block contains only dead instructions. Such blocks will
5424 // be removed by VPlan-to-VPlan transforms and won't be considered by the
5425 // VPlan-based cost model, so skip them in the legacy cost-model as well.
5426 auto IsEmptyBlock = [this](BasicBlock *BB) {
5427 return all_of(Range&: *BB, P: [this](Instruction &I) {
5428 return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
5429 isa<UncondBrInst>(Val: &I);
5430 });
5431 };
5432 for (unsigned I = 0; I != DeadOps.size(); ++I) {
5433 auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);
5434
5435 // Check if the branch should be considered dead.
5436 if (auto *Br = dyn_cast_or_null<CondBrInst>(Val: Op)) {
5437 BasicBlock *ThenBB = Br->getSuccessor(i: 0);
5438 BasicBlock *ElseBB = Br->getSuccessor(i: 1);
5439 // Don't considers branches leaving the loop for simplification.
5440 if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
5441 continue;
5442 bool ThenEmpty = IsEmptyBlock(ThenBB);
5443 bool ElseEmpty = IsEmptyBlock(ElseBB);
5444 if ((ThenEmpty && ElseEmpty) ||
5445 (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
5446 ElseBB->phis().empty()) ||
5447 (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
5448 ThenBB->phis().empty())) {
5449 VecValuesToIgnore.insert(Ptr: Br);
5450 DeadOps.push_back(Elt: Br->getCondition());
5451 }
5452 continue;
5453 }
5454
5455 // Skip any op that shouldn't be considered dead.
5456 if (!Op || !TheLoop->contains(Inst: Op) ||
5457 (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
5458 !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
5459 any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
5460 return !VecValuesToIgnore.contains(Ptr: U) &&
5461 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
5462 }))
5463 continue;
5464
5465 // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
5466 // which applies for both scalar and vector versions. Otherwise it is only
5467 // dead in vector versions, so only add it to VecValuesToIgnore.
5468 if (all_of(Range: Op->users(),
5469 P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
5470 ValuesToIgnore.insert(Ptr: Op);
5471
5472 VecValuesToIgnore.insert(Ptr: Op);
5473 append_range(C&: DeadOps, R: Op->operands());
5474 }
5475
5476 // Ignore type-promoting instructions we identified during reduction
5477 // detection.
5478 for (const auto &Reduction : Legal->getReductionVars()) {
5479 const RecurrenceDescriptor &RedDes = Reduction.second;
5480 const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
5481 VecValuesToIgnore.insert_range(R: Casts);
5482 }
5483 // Ignore type-casting instructions we identified during induction
5484 // detection.
5485 for (const auto &Induction : Legal->getInductionVars()) {
5486 const InductionDescriptor &IndDes = Induction.second;
5487 VecValuesToIgnore.insert_range(R: IndDes.getCastInsts());
5488 }
5489}
5490
5491void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
5492 CM.collectValuesToIgnore();
5493 Config.collectElementTypesForWidening(ValuesToIgnore: &CM.ValuesToIgnore);
5494
5495 FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
5496 if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
5497 return;
5498
5499 Config.collectInLoopReductions();
5500 // Cases that may be vectorized may be optimized by unit stride predicates.
5501 // TODO: Currently unit stride predicates are added unconditionally, even if
5502 // they are not used for the selected VF (e.g. when only interleaving).
5503 if (MaxFactors.FixedVF.isVector() || MaxFactors.ScalableVF.isVector())
5504 Legal->collectUnitStridePredicates();
5505
5506 auto VPlan1 = tryToBuildVPlan1();
5507 if (!VPlan1)
5508 return;
5509
5510 if (!OrigLoop->isInnermost()) {
5511 // For outer loops, computeMaxVF returns a single non-scalar VF; build a
5512 // plan for that VF only.
5513 ElementCount VF =
5514 MaxFactors.FixedVF ? MaxFactors.FixedVF : MaxFactors.ScalableVF;
5515 buildVPlans(VPlan1&: *VPlan1, MinVF: VF, MaxVF: VF);
5516 LLVM_DEBUG(printPlans(dbgs()));
5517 return;
5518 }
5519
5520 // Compute the minimal bitwidths required for integer operations in the loop
5521 // for later use by the cost model.
5522 Config.computeMinimalBitwidths();
5523
5524 // Invalidate interleave groups if all blocks of loop will be predicated.
5525 if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
5526 !useMaskedInterleavedAccesses(TTI)) {
5527 LLVM_DEBUG(
5528 dbgs()
5529 << "LV: Invalidate all interleaved groups due to fold-tail by masking "
5530 "which requires masked-interleaved support.\n");
5531 if (CM.InterleaveInfo.invalidateGroups())
5532 // Invalidating interleave groups also requires invalidating all decisions
5533 // based on them, which includes widening decisions and uniform and scalar
5534 // values.
5535 CM.invalidateCostModelingDecisions();
5536 }
5537
5538 if (CM.foldTailByMasking())
5539 Legal->prepareToFoldTailByMasking();
5540
5541 ElementCount MaxUserVF =
5542 UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
5543 if (UserVF) {
5544 if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
5545 reportVectorizationInfo(
5546 Msg: "UserVF ignored because it may be larger than the maximal safe VF",
5547 ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
5548 } else {
5549 assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
5550 "VF needs to be a power of two");
5551 // Collect the instructions (and their associated costs) that will be more
5552 // profitable to scalarize.
5553 CM.collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
5554 ElementCount EpilogueUserVF =
5555 ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
5556 if (EpilogueUserVF.isVector() &&
5557 ElementCount::isKnownLT(LHS: EpilogueUserVF, RHS: UserVF)) {
5558 CM.collectNonVectorizedAndSetWideningDecisions(VF: EpilogueUserVF);
5559 buildVPlans(VPlan1&: *VPlan1, MinVF: EpilogueUserVF, MaxVF: EpilogueUserVF);
5560 }
5561 buildVPlans(VPlan1&: *VPlan1, MinVF: UserVF, MaxVF: UserVF);
5562 if (!VPlans.empty() && VPlans.back()->getSingleVF() == UserVF) {
5563 // For scalar VF, skip VPlan cost check as VPlan cost is designed for
5564 // vector VFs only.
5565 if (UserVF.isScalar() ||
5566 cost(Plan&: *VPlans.back(), VF: UserVF, /*RU=*/nullptr).isValid()) {
5567 LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
5568 LLVM_DEBUG(printPlans(dbgs()));
5569 return;
5570 }
5571 }
5572 VPlans.clear();
5573 reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
5574 ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
5575 }
5576 }
5577
5578 // Collect the Vectorization Factor Candidates.
5579 SmallVector<ElementCount> VFCandidates;
5580 for (auto VF = ElementCount::getFixed(MinVal: 1);
5581 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
5582 VFCandidates.push_back(Elt: VF);
5583 for (auto VF = ElementCount::getScalable(MinVal: 1);
5584 ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
5585 VFCandidates.push_back(Elt: VF);
5586
5587 for (const auto &VF : VFCandidates) {
5588 // Collect Uniform and Scalar instructions after vectorization with VF.
5589 CM.collectNonVectorizedAndSetWideningDecisions(VF);
5590 }
5591
5592 buildVPlans(VPlan1&: *VPlan1, MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
5593 buildVPlans(VPlan1&: *VPlan1, MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);
5594
5595 LLVM_DEBUG(printPlans(dbgs()));
5596}
5597
5598InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
5599 ElementCount VF) const {
5600 InstructionCost Cost = CM.getInstructionCost(I: UI, VF);
5601 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
5602 return InstructionCost(ForceTargetInstructionCost);
5603 return Cost;
5604}
5605
5606bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
5607 return CM.ValuesToIgnore.contains(Ptr: UI) ||
5608 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
5609 SkipCostComputation.contains(Ptr: UI);
5610}
5611
5612void VPCostContext::invalidateWideningDecision(Instruction *I,
5613 ElementCount VF) {
5614 CM.setWideningDecision(I, VF,
5615 W: LoopVectorizationCostModel::CM_InvalidatedDecision, Cost: 0);
5616}
5617
5618uint64_t VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
5619 return CM.getPredBlockCostDivisor(CostKind, BB);
5620}
5621
5622bool VPCostContext::willBeScalarized(Instruction *I, ElementCount VF) const {
5623 return CM.isScalarWithPredication(I, VF) ||
5624 CM.isUniformAfterVectorization(I, VF) || CM.isForcedScalar(I, VF) ||
5625 (VF.isVector() && CM.isProfitableToScalarize(I, VF));
5626}
5627
5628bool VPCostContext::isMaskRequired(Instruction *I) const {
5629 return CM.isMaskRequired(I);
5630}
5631
5632InstructionCost
5633LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
5634 VPCostContext &CostCtx) const {
5635 InstructionCost Cost;
5636 // Cost modeling for inductions is inaccurate in the legacy cost model
5637 // compared to the recipes that are generated. To match here initially during
5638 // VPlan cost model bring up directly use the induction costs from the legacy
5639 // cost model. Note that we do this as pre-processing; the VPlan may not have
5640 // any recipes associated with the original induction increment instruction
5641 // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
5642 // the cost of induction phis and increments (both that are represented by
5643 // recipes and those that are not), to avoid distinguishing between them here,
5644 // and skip all recipes that represent induction phis and increments (the
5645 // former case) later on, if they exist, to avoid counting them twice.
5646 // Similarly we pre-compute the cost of any optimized truncates.
5647 // TODO: Switch to more accurate costing based on VPlan.
5648 for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
5649 Instruction *IVInc = cast<Instruction>(
5650 Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
5651 SmallVector<Instruction *> IVInsts = {IVInc};
5652 for (unsigned I = 0; I != IVInsts.size(); I++) {
5653 for (Value *Op : IVInsts[I]->operands()) {
5654 auto *OpI = dyn_cast<Instruction>(Val: Op);
5655 if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
5656 continue;
5657 IVInsts.push_back(Elt: OpI);
5658 }
5659 }
5660 IVInsts.push_back(Elt: IV);
5661 for (User *U : IV->users()) {
5662 auto *CI = cast<Instruction>(Val: U);
5663 if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
5664 continue;
5665 IVInsts.push_back(Elt: CI);
5666 }
5667
5668 // If the vector loop gets executed exactly once with the given VF, ignore
5669 // the costs of comparison and induction instructions, as they'll get
5670 // simplified away.
5671 // TODO: Remove this code after stepping away from the legacy cost model and
5672 // adding code to simplify VPlans before calculating their costs.
5673 auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
5674 if (TC == VF && !CM.foldTailByMasking())
5675 addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
5676 InstsToIgnore&: CostCtx.SkipCostComputation);
5677
5678 for (Instruction *IVInst : IVInsts) {
5679 if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
5680 continue;
5681 InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
5682 LLVM_DEBUG({
5683 dbgs() << "Cost of " << InductionCost << " for VF " << VF
5684 << ": induction instruction " << *IVInst << "\n";
5685 });
5686 Cost += InductionCost;
5687 CostCtx.SkipCostComputation.insert(Ptr: IVInst);
5688 }
5689 }
5690
5691 // Pre-compute the costs for branches except for the backedge, as the number
5692 // of replicate regions in a VPlan may not directly match the number of
5693 // branches, which would lead to different decisions.
5694 // TODO: Compute cost of branches for each replicate region in the VPlan,
5695 // which is more accurate than the legacy cost model.
5696 for (BasicBlock *BB : OrigLoop->blocks()) {
5697 if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
5698 continue;
5699 CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
5700 if (BB == OrigLoop->getLoopLatch())
5701 continue;
5702 auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
5703 Cost += BranchCost;
5704 }
5705
5706 // Don't apply special costs when instruction cost is forced to make sure the
5707 // forced cost is used for each recipe.
5708 if (ForceTargetInstructionCost.getNumOccurrences())
5709 return Cost;
5710
5711 // Pre-compute costs for instructions that are forced-scalar or profitable to
5712 // scalarize. For most such instructions, their scalarization costs are
5713 // accounted for here using the legacy cost model. However, some opcodes
5714 // are excluded from these precomputed scalarization costs and are instead
5715 // modeled later by the VPlan cost model (see UseVPlanCostModel below).
5716 for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
5717 if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
5718 continue;
5719 CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
5720 InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
5721 LLVM_DEBUG({
5722 dbgs() << "Cost of " << ForcedCost << " for VF " << VF
5723 << ": forced scalar " << *ForcedScalar << "\n";
5724 });
5725 Cost += ForcedCost;
5726 }
5727
5728 auto UseVPlanCostModel = [](Instruction *I) -> bool {
5729 switch (I->getOpcode()) {
5730 case Instruction::SDiv:
5731 case Instruction::UDiv:
5732 case Instruction::SRem:
5733 case Instruction::URem:
5734 return true;
5735 default:
5736 return false;
5737 }
5738 };
5739 for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
5740 if (UseVPlanCostModel(Scalarized) ||
5741 CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
5742 continue;
5743 CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
5744 LLVM_DEBUG({
5745 dbgs() << "Cost of " << ScalarCost << " for VF " << VF
5746 << ": profitable to scalarize " << *Scalarized << "\n";
5747 });
5748 Cost += ScalarCost;
5749 }
5750
5751 return Cost;
5752}
5753
5754InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan, ElementCount VF,
5755 VPRegisterUsage *RU) const {
5756 VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, Config.CostKind, PSE,
5757 OrigLoop);
5758 InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);
5759
5760 // Now compute and add the VPlan-based cost.
5761 Cost += Plan.cost(VF, Ctx&: CostCtx);
5762
5763 // Add the cost of spills due to excess register usage
5764 if (RU && Config.shouldConsiderRegPressureForVF(VF))
5765 Cost += RU->spillCost(TTI: CM.TTI, CostKind: Config.CostKind, OverrideMaxNumRegs: ForceTargetNumVectorRegs);
5766
5767#ifndef NDEBUG
5768 unsigned EstimatedWidth =
5769 estimateElementCount(VF, Config.getVScaleForTuning());
5770 LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
5771 << " (Estimated cost per lane: ");
5772 if (Cost.isValid()) {
5773 APFloat CostPerLane(APFloat::IEEEdouble());
5774 APFloat EstimatedWidthAsAPFloat(APFloat::IEEEdouble());
5775 (void)CostPerLane.convertFromAPInt(APInt(64, (uint64_t)Cost.getValue()),
5776 false, APFloat::rmTowardZero);
5777 (void)EstimatedWidthAsAPFloat.convertFromAPInt(
5778 APInt(64, (uint64_t)EstimatedWidth), false, APFloat::rmTowardZero);
5779 (void)CostPerLane.divide(EstimatedWidthAsAPFloat, APFloat::rmTowardZero);
5780
5781 SmallString<16> Str;
5782 CostPerLane.toString(Str, 3);
5783 LLVM_DEBUG(dbgs() << Str);
5784 } else /* No point dividing an invalid cost - it will still be invalid */
5785 LLVM_DEBUG(dbgs() << "Invalid");
5786 LLVM_DEBUG(dbgs() << ")\n");
5787#endif
5788 return Cost;
5789}
5790
5791std::pair<VectorizationFactor, VPlan *>
5792LoopVectorizationPlanner::computeBestVF() {
5793 if (VPlans.empty())
5794 return {VectorizationFactor::Disabled(), nullptr};
5795 // If there is a single VPlan with a single VF, return it directly.
5796 VPlan &FirstPlan = *VPlans[0];
5797
5798 ElementCount UserVF = Hints.getWidth();
5799 if (VPlans.size() == 1) {
5800 // For outer loops, the plan has a single vector VF determined by the
5801 // heuristic.
5802 assert((FirstPlan.hasScalarVFOnly() || hasPlanWithVF(UserVF) ||
5803 FirstPlan.isOuterLoop()) &&
5804 "must have a single scalar VF, UserVF or an outer loop");
5805 return {VectorizationFactor(FirstPlan.getSingleVF(), 0, 0), &FirstPlan};
5806 }
5807
5808 if (hasPlanWithVF(VF: UserVF) && EpilogueVectorizationForceVF > 1) {
5809 assert(VPlans.size() == 2 && "Must have exactly 2 VPlans built");
5810 assert(VPlans[0]->getSingleVF() ==
5811 ElementCount::getFixed(EpilogueVectorizationForceVF) &&
5812 "expected first plan to be for the forced epilogue VF");
5813 assert(VPlans[1]->getSingleVF() == UserVF &&
5814 "expected second plan to be for the forced UserVF");
5815 return {VectorizationFactor(UserVF, 0, 0), VPlans[1].get()};
5816 }
5817
5818 LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
5819 << (Config.CostKind == TTI::TCK_RecipThroughput
5820 ? "Reciprocal Throughput\n"
5821 : Config.CostKind == TTI::TCK_Latency
5822 ? "Instruction Latency\n"
5823 : Config.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
5824 : Config.CostKind == TTI::TCK_SizeAndLatency
5825 ? "Code Size and Latency\n"
5826 : "Unknown\n"));
5827
5828 ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
5829 assert(FirstPlan.hasVF(ScalarVF) &&
5830 "More than a single plan/VF w/o any plan having scalar VF");
5831
5832 // TODO: Compute scalar cost using VPlan-based cost model.
5833 InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
5834 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
5835 VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
5836 VectorizationFactor BestFactor = ScalarFactor;
5837
5838 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
5839 if (ForceVectorization) {
5840 // Ignore scalar width, because the user explicitly wants vectorization.
5841 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5842 // evaluation.
5843 BestFactor.Cost = InstructionCost::getMax();
5844 }
5845
5846 VPlan *PlanForBestVF = &FirstPlan;
5847
5848 for (auto &P : VPlans) {
5849 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
5850 P->vectorFactors().end());
5851
5852 SmallVector<VPRegisterUsage, 8> RUs;
5853 bool ConsiderRegPressure = any_of(Range&: VFs, P: [this](ElementCount VF) {
5854 return Config.shouldConsiderRegPressureForVF(VF);
5855 });
5856 if (ConsiderRegPressure)
5857 RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);
5858
5859 for (unsigned I = 0; I < VFs.size(); I++) {
5860 ElementCount VF = VFs[I];
5861 if (VF.isScalar())
5862 continue;
5863 if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
5864 LLVM_DEBUG(
5865 dbgs()
5866 << "LV: Not considering vector loop of width " << VF
5867 << " because it will not generate any vector instructions.\n");
5868 continue;
5869 }
5870 if (Config.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
5871 LLVM_DEBUG(
5872 dbgs()
5873 << "LV: Not considering vector loop of width " << VF
5874 << " because it would cause replicated blocks to be generated,"
5875 << " which isn't allowed when optimizing for size.\n");
5876 continue;
5877 }
5878
5879 InstructionCost Cost =
5880 cost(Plan&: *P, VF, RU: ConsiderRegPressure ? &RUs[I] : nullptr);
5881 VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
5882
5883 if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail())) {
5884 BestFactor = CurrentFactor;
5885 PlanForBestVF = P.get();
5886 }
5887
5888 // If profitable add it to ProfitableVF list.
5889 if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
5890 ProfitableVFs.push_back(Elt: CurrentFactor);
5891 }
5892 }
5893
5894 VPlan &BestPlan = *PlanForBestVF;
5895
5896 assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
5897 "when vectorizing, the scalar cost must be computed.");
5898
5899 LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
5900 return {BestFactor, &BestPlan};
5901}
5902
5903DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
5904 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
5905 InnerLoopVectorizer &ILV, DominatorTree *DT,
5906 EpilogueVectorizationKind EpilogueVecKind) {
5907 assert(BestVPlan.hasVF(BestVF) &&
5908 "Trying to execute plan with unsupported VF");
5909 assert(BestVPlan.hasUF(BestUF) &&
5910 "Trying to execute plan with unsupported UF");
5911 if (BestVPlan.hasEarlyExit())
5912 ++LoopsEarlyExitVectorized;
5913
5914 RUN_VPLAN_PASS(VPlanTransforms::replaceWideCanonicalIVWithWideIV, BestVPlan,
5915 *PSE.getSE(), CM.TTI, Config.CostKind, BestVF, BestUF,
5916 CM.ValuesToIgnore);
5917 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
5918 // cost model is complete for better cost estimates.
5919 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
5920 RUN_VPLAN_PASS(VPlanTransforms::materializePacksAndUnpacks, BestVPlan);
5921 RUN_VPLAN_PASS(VPlanTransforms::materializeBroadcasts, BestVPlan);
5922 RUN_VPLAN_PASS(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
5923 bool HasBranchWeights =
5924 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
5925 if (HasBranchWeights) {
5926 std::optional<unsigned> VScale = Config.getVScaleForTuning();
5927 RUN_VPLAN_PASS(VPlanTransforms::addBranchWeightToMiddleTerminator,
5928 BestVPlan, BestVF, VScale);
5929 }
5930
5931 if (CM.maskPartialAliasing()) {
5932 assert(CM.foldTailByMasking() && "Expected tail folding to be enabled");
5933 RUN_VPLAN_PASS(VPlanTransforms::materializeAliasMaskCheckBlock, BestVPlan,
5934 *CM.Legal->getRuntimePointerChecking()->getDiffChecks(),
5935 HasBranchWeights);
5936 ++LoopsPartialAliasVectorized;
5937 }
5938
5939 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
5940 VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());
5941
5942 RUN_VPLAN_PASS(VPlanTransforms::materializeConstantVectorTripCount, BestVPlan,
5943 BestVF, BestUF, PSE);
5944 RUN_VPLAN_PASS(VPlanTransforms::optimizeForVFAndUF, BestVPlan, BestVF, BestUF,
5945 PSE);
5946 RUN_VPLAN_PASS(VPlanTransforms::simplifyRecipes, BestVPlan);
5947 if (EpilogueVecKind == EpilogueVectorizationKind::None)
5948 RUN_VPLAN_PASS(VPlanTransforms::removeBranchOnConst, BestVPlan,
5949 /*OnlyLatches=*/false);
5950 if (BestVPlan.getEntry()->getSingleSuccessor() ==
5951 BestVPlan.getScalarPreheader()) {
5952 // TODO: The vector loop would be dead, should not even try to vectorize.
5953 ORE->emit(RemarkBuilder: [&]() {
5954 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
5955 OrigLoop->getStartLoc(),
5956 OrigLoop->getHeader())
5957 << "Created vector loop never executes due to insufficient trip "
5958 "count.";
5959 });
5960 return DenseMap<const SCEV *, Value *>();
5961 }
5962
5963 RUN_VPLAN_PASS(VPlanTransforms::removeDeadRecipes, BestVPlan);
5964
5965 RUN_VPLAN_PASS(VPlanTransforms::convertToConcreteRecipes, BestVPlan);
5966 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
5967 RUN_VPLAN_PASS(VPlanTransforms::convertEVLExitCond, BestVPlan);
5968 // Regions are dissolved after optimizing for VF and UF, which completely
5969 // removes unneeded loop regions first.
5970 RUN_VPLAN_PASS(VPlanTransforms::dissolveLoopRegions, BestVPlan);
5971 // Expand BranchOnTwoConds after dissolution, when latch has direct access to
5972 // its successors.
5973 RUN_VPLAN_PASS(VPlanTransforms::expandBranchOnTwoConds, BestVPlan);
5974 // Convert loops with variable-length stepping after regions are dissolved.
5975 RUN_VPLAN_PASS(VPlanTransforms::convertToVariableLengthStep, BestVPlan);
5976 // Remove dead back-edges for single-iteration loops with BranchOnCond(true).
5977 // Only process loop latches to avoid removing edges from the middle block,
5978 // which may be needed for epilogue vectorization.
5979 VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan, /*OnlyLatches=*/true);
5980 VPlanTransforms::materializeBackedgeTakenCount(Plan&: BestVPlan, VectorPH);
5981 std::optional<uint64_t> MaxRuntimeStep;
5982 if (auto MaxVScale = getMaxVScale(F: *CM.TheFunction, TTI: CM.TTI))
5983 MaxRuntimeStep = uint64_t(*MaxVScale) * BestVF.getKnownMinValue() * BestUF;
5984 VPlanTransforms::materializeVectorTripCount(
5985 Plan&: BestVPlan, VectorPHVPBB: VectorPH, TailByMasking: CM.foldTailByMasking(),
5986 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: BestVF.isVector()), Step: &BestVPlan.getVFxUF(),
5987 MaxRuntimeStep);
5988 VPlanTransforms::materializeFactors(Plan&: BestVPlan, VectorPH, VF: BestVF);
5989 // Limit expansions to VPInstruction to when not vectorizing the epilogue.
5990 // Currently this code path still relies on code re-using SCEVs expanded
5991 // directly to IR instructions.
5992 if (EpilogueVecKind == EpilogueVectorizationKind::None)
5993 VPlanTransforms::expandSCEVsToVPInstructions(Plan&: BestVPlan, SE&: *PSE.getSE());
5994 VPlanTransforms::cse(Plan&: BestVPlan);
5995 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
5996 // Removing branches and incoming values may expose additional simplification
5997 // opportunities.
5998 if (VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan,
5999 /*OnlyLatches=*/EpilogueVecKind !=
6000 EpilogueVectorizationKind::None))
6001 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
6002 VPlanTransforms::simplifyKnownEVL(Plan&: BestVPlan, VF: BestVF, PSE);
6003
6004 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
6005 // making any changes to the CFG.
6006 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
6007 VPlanTransforms::expandSCEVs(Plan&: BestVPlan, SE&: *PSE.getSE());
6008
6009 // Perform the actual loop transformation.
6010 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
6011 OrigLoop->getParentLoop());
6012
6013#ifdef EXPENSIVE_CHECKS
6014 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
6015#endif
6016
6017 // 1. Set up the skeleton for vectorization, including vector pre-header and
6018 // middle block. The vector loop is created during VPlan execution.
6019 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6020 if (VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader())
6021 replaceVPBBWithIRVPBB(VPBB: ScalarPH, IRBB: State.CFG.PrevBB->getSingleSuccessor(),
6022 Plan: &BestVPlan);
6023 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
6024
6025 assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");
6026
6027 // After vectorization, the exit blocks of the original loop will have
6028 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
6029 // looked through single-entry phis.
6030 ScalarEvolution &SE = *PSE.getSE();
6031 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
6032 if (!Exit->hasPredecessors())
6033 continue;
6034 for (VPRecipeBase &PhiR : Exit->phis())
6035 SE.forgetLcssaPhiWithNewPredecessor(L: OrigLoop,
6036 V: &cast<VPIRPhi>(Val&: PhiR).getIRPhi());
6037 }
6038 // Forget the original loop and block dispositions.
6039 SE.forgetLoop(L: OrigLoop);
6040 SE.forgetBlockAndLoopDispositions();
6041
6042 ILV.printDebugTracesAtStart();
6043
6044 //===------------------------------------------------===//
6045 //
6046 // Notice: any optimization or new instruction that go
6047 // into the code below should also be implemented in
6048 // the cost-model.
6049 //
6050 //===------------------------------------------------===//
6051
6052 // Retrieve loop information before executing the plan, which may remove the
6053 // original loop, if it becomes unreachable.
6054 MDNode *LID = OrigLoop->getLoopID();
6055 unsigned OrigLoopInvocationWeight = 0;
6056 std::optional<unsigned> OrigAverageTripCount =
6057 getLoopEstimatedTripCount(L: OrigLoop, EstimatedLoopInvocationWeight: &OrigLoopInvocationWeight);
6058
6059 BestVPlan.execute(State: &State);
6060
6061 // 2.6. Maintain Loop Hints
6062 // Keep all loop hints from the original loop on the vector loop (we'll
6063 // replace the vectorizer-specific hints below).
6064 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
6065 // Add metadata to disable runtime unrolling a scalar loop when there
6066 // are no runtime checks about strides and memory. A scalar loop that is
6067 // rarely used is not worth unrolling.
6068 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
6069 updateLoopMetadataAndProfileInfo(
6070 VectorLoop: HeaderVPBB ? LI->getLoopFor(BB: State.CFG.VPBB2IRBB.lookup(Val: HeaderVPBB))
6071 : nullptr,
6072 HeaderVPBB, Plan: BestVPlan,
6073 VectorizingEpilogue: EpilogueVecKind == EpilogueVectorizationKind::Epilogue, OrigLoopID: LID,
6074 OrigAverageTripCount, OrigLoopInvocationWeight,
6075 EstimatedVFxUF: estimateElementCount(VF: BestVF * BestUF, VScale: Config.getVScaleForTuning()),
6076 DisableRuntimeUnroll);
6077
6078 // 3. Fix the vectorized code: take care of header phi's, live-outs,
6079 // predication, updating analyses.
6080 ILV.fixVectorizedLoop(State);
6081
6082 ILV.printDebugTracesAtEnd();
6083
6084 return ExpandedSCEVs;
6085}
6086
6087//===--------------------------------------------------------------------===//
6088// EpilogueVectorizerMainLoop
6089//===--------------------------------------------------------------------===//
6090
6091void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
6092 LLVM_DEBUG({
6093 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
6094 << "Main Loop VF:" << EPI.MainLoopVF
6095 << ", Main Loop UF:" << EPI.MainLoopUF
6096 << ", Epilogue Loop VF:" << EPI.EpilogueVF
6097 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
6098 });
6099}
6100
6101void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
6102 DEBUG_WITH_TYPE(VerboseDebug, {
6103 dbgs() << "intermediate fn:\n"
6104 << *OrigLoop->getHeader()->getParent() << "\n";
6105 });
6106}
6107
6108//===--------------------------------------------------------------------===//
6109// EpilogueVectorizerEpilogueLoop
6110//===--------------------------------------------------------------------===//
6111
6112/// This function creates a new scalar preheader, using the previous one as
6113/// entry block to the epilogue VPlan. The minimum iteration check is being
6114/// represented in VPlan.
6115BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
6116 BasicBlock *NewScalarPH = createScalarPreheader(Prefix: "vec.epilog.");
6117 BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
6118 OriginalScalarPH->setName("vec.epilog.iter.check");
6119 VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: OriginalScalarPH);
6120 VPBasicBlock *OldEntry = Plan.getEntry();
6121 for (auto &R : make_early_inc_range(Range&: *OldEntry)) {
6122 // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
6123 // defining.
6124 if (isa<VPIRInstruction>(Val: &R))
6125 continue;
6126 R.moveBefore(BB&: *NewEntry, I: NewEntry->end());
6127 }
6128
6129 VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
6130 Plan.setEntry(NewEntry);
6131 // OldEntry is now dead and will be cleaned up when the plan gets destroyed.
6132
6133 return OriginalScalarPH;
6134}
6135
6136void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
6137 LLVM_DEBUG({
6138 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
6139 << "Epilogue Loop VF:" << EPI.EpilogueVF
6140 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
6141 });
6142}
6143
6144void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
6145 DEBUG_WITH_TYPE(VerboseDebug, {
6146 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
6147 });
6148}
6149
6150bool VPRecipeBuilder::isPredicatedInst(Instruction *I) const {
6151 return CM.isPredicatedInst(I);
6152}
6153
6154bool VPRecipeBuilder::prefersVectorizedAddressing() const {
6155 return CM.TTI.prefersVectorizedAddressing();
6156}
6157
6158VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
6159 VFRange &Range) {
6160 assert((VPI->getOpcode() == Instruction::Load ||
6161 VPI->getOpcode() == Instruction::Store) &&
6162 "Must be called with either a load or store");
6163 Instruction *I = VPI->getUnderlyingInstr();
6164
6165 auto WillWiden = [&](ElementCount VF) -> bool {
6166 LoopVectorizationCostModel::InstWidening Decision =
6167 CM.getWideningDecision(I, VF);
6168 assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6169 "CM decision should be taken at this point.");
6170 if (Decision == LoopVectorizationCostModel::CM_Interleave)
6171 return true;
6172 if (CM.isScalarAfterVectorization(I, VF) ||
6173 CM.isProfitableToScalarize(I, VF))
6174 return false;
6175 return Decision != LoopVectorizationCostModel::CM_Scalarize;
6176 };
6177
6178 if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
6179 return nullptr;
6180
6181 // If a mask is not required, drop it - use unmasked version for safe loads.
6182 // TODO: Determine if mask is needed in VPlan.
6183 VPValue *Mask = CM.isMaskRequired(I) ? VPI->getMask() : nullptr;
6184
6185 // Determine if the pointer operand of the access is either consecutive or
6186 // reverse consecutive.
6187 LoopVectorizationCostModel::InstWidening Decision =
6188 CM.getWideningDecision(I, VF: Range.Start);
6189 bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
6190 bool Consecutive =
6191 Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
6192
6193 VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(N: 0)
6194 : VPI->getOperand(N: 1);
6195 if (Consecutive) {
6196 GEPNoWrapFlags Flags = vputils::getGEPFlagsForPtr(Ptr);
6197 VPSingleDefRecipe *VectorPtr;
6198 if (Reverse) {
6199 // When folding the tail, we may compute an address that we don't in the
6200 // original scalar loop: drop the GEP no-wrap flags in this case.
6201 // Otherwise preserve existing flags without no-unsigned-wrap, as we will
6202 // emit negative indices.
6203 GEPNoWrapFlags ReverseFlags = CM.foldTailByMasking()
6204 ? GEPNoWrapFlags::none()
6205 : Flags.withoutNoUnsignedWrap();
6206 VectorPtr = new VPVectorEndPointerRecipe(
6207 Ptr, &Plan.getVF(), getLoadStoreType(I),
6208 /*Stride*/ -1, ReverseFlags, VPI->getDebugLoc());
6209 } else {
6210 const DataLayout &DL = I->getDataLayout();
6211 auto *StrideTy = DL.getIndexType(PtrTy: Ptr->getUnderlyingValue()->getType());
6212 VPValue *StrideOne = Plan.getConstantInt(Ty: StrideTy, Val: 1);
6213 VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
6214 Flags, VPI->getDebugLoc());
6215 }
6216 Builder.setInsertPoint(VPI);
6217 Builder.insert(R: VectorPtr);
6218 Ptr = VectorPtr;
6219 }
6220
6221 if (Reverse && Mask)
6222 Mask = Builder.createNaryOp(Opcode: VPInstruction::Reverse, Operands: Mask, DL: I->getDebugLoc());
6223
6224 if (VPI->getOpcode() == Instruction::Load) {
6225 auto *Load = cast<LoadInst>(Val: I);
6226 auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, *VPI,
6227 Load->getDebugLoc());
6228 if (Reverse) {
6229 Builder.insert(R: LoadR);
6230 return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
6231 LoadR->getDebugLoc());
6232 }
6233 return LoadR;
6234 }
6235
6236 StoreInst *Store = cast<StoreInst>(Val: I);
6237 VPValue *StoredVal = VPI->getOperand(N: 0);
6238 if (Reverse)
6239 StoredVal = Builder.createNaryOp(Opcode: VPInstruction::Reverse, Operands: StoredVal,
6240 DL: Store->getDebugLoc());
6241 return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive, *VPI,
6242 Store->getDebugLoc());
6243}
6244
6245VPWidenIntOrFpInductionRecipe *
6246VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
6247 VFRange &Range) {
6248 auto *I = cast<TruncInst>(Val: VPI->getUnderlyingInstr());
6249 // Optimize the special case where the source is a constant integer
6250 // induction variable. Notice that we can only optimize the 'trunc' case
6251 // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6252 // (c) other casts depend on pointer size.
6253
6254 // Determine whether \p K is a truncation based on an induction variable that
6255 // can be optimized.
6256 if (!LoopVectorizationPlanner::getDecisionAndClampRange(
6257 Predicate: bind_front(Fn: &LoopVectorizationCostModel::isOptimizableIVTruncate, BindArgs&: CM,
6258 BindArgs&: I),
6259 Range))
6260 return nullptr;
6261
6262 auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
6263 Val: VPI->getOperand(N: 0)->getDefiningRecipe());
6264 PHINode *Phi = WidenIV->getPHINode();
6265 VPIRValue *Start = WidenIV->getStartValue();
6266 const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();
6267
6268 // Wrap flags from the original induction do not apply to the truncated type,
6269 // so do not propagate them.
6270 VPIRFlags Flags = VPIRFlags::WrapFlagsTy(false, false);
6271 VPValue *Step =
6272 vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());
6273 return new VPWidenIntOrFpInductionRecipe(
6274 Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
6275}
6276
6277bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
6278 assert((!isa<UncondBrInst, CondBrInst, PHINode, LoadInst, StoreInst>(I)) &&
6279 "Instruction should have been handled earlier");
6280 // Instruction should be widened, unless it is scalar after vectorization,
6281 // scalarization is profitable or it is predicated.
6282 auto WillScalarize = [this, I](ElementCount VF) -> bool {
6283 return CM.isScalarAfterVectorization(I, VF) ||
6284 CM.isProfitableToScalarize(I, VF) ||
6285 CM.isScalarWithPredication(I, VF);
6286 };
6287 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
6288 Range);
6289}
6290
6291VPRecipeWithIRFlags *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
6292 auto *I = VPI->getUnderlyingInstr();
6293 switch (VPI->getOpcode()) {
6294 default:
6295 return nullptr;
6296 case Instruction::SDiv:
6297 case Instruction::UDiv:
6298 case Instruction::SRem:
6299 case Instruction::URem:
6300 // If not provably safe, use a masked intrinsic.
6301 if (CM.isPredicatedInst(I))
6302 return new VPWidenIntrinsicRecipe(
6303 getMaskedDivRemIntrinsic(Opcode: VPI->getOpcode()), VPI->operands(),
6304 I->getType(), {}, {}, VPI->getDebugLoc());
6305 [[fallthrough]];
6306 case Instruction::Add:
6307 case Instruction::And:
6308 case Instruction::AShr:
6309 case Instruction::FAdd:
6310 case Instruction::FCmp:
6311 case Instruction::FDiv:
6312 case Instruction::FMul:
6313 case Instruction::FNeg:
6314 case Instruction::FRem:
6315 case Instruction::FSub:
6316 case Instruction::ICmp:
6317 case Instruction::LShr:
6318 case Instruction::Mul:
6319 case Instruction::Or:
6320 case Instruction::Select:
6321 case Instruction::Shl:
6322 case Instruction::Sub:
6323 case Instruction::Xor:
6324 case Instruction::Freeze:
6325 return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
6326 VPI->getDebugLoc());
6327 case Instruction::ExtractValue: {
6328 SmallVector<VPValue *> NewOps(VPI->operandsWithoutMask());
6329 auto *EVI = cast<ExtractValueInst>(Val: I);
6330 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
6331 unsigned Idx = EVI->getIndices()[0];
6332 NewOps.push_back(Elt: Plan.getConstantInt(BitWidth: 32, Val: Idx));
6333 return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
6334 }
6335 };
6336}
6337
6338VPHistogramRecipe *VPRecipeBuilder::widenIfHistogram(VPInstruction *VPI) {
6339 if (VPI->getOpcode() != Instruction::Store)
6340 return nullptr;
6341
6342 auto HistInfo =
6343 Legal->getHistogramInfo(I: cast<StoreInst>(Val: VPI->getUnderlyingInstr()));
6344 if (!HistInfo)
6345 return nullptr;
6346
6347 const HistogramInfo *HI = *HistInfo;
6348 // FIXME: Support other operations.
6349 unsigned Opcode = HI->Update->getOpcode();
6350 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
6351 "Histogram update operation must be an Add or Sub");
6352
6353 SmallVector<VPValue *, 3> HGramOps;
6354 // Bucket address.
6355 HGramOps.push_back(Elt: VPI->getOperand(N: 1));
6356 // Increment value.
6357 HGramOps.push_back(Elt: Plan.getOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
6358
6359 // In case of predicated execution (due to tail-folding, or conditional
6360 // execution, or both), pass the relevant mask.
6361 if (CM.isMaskRequired(I: HI->Store))
6362 HGramOps.push_back(Elt: VPI->getMask());
6363
6364 return new VPHistogramRecipe(Opcode, HGramOps, cast<VPIRMetadata>(Val&: *VPI),
6365 VPI->getDebugLoc());
6366}
6367
6368bool VPRecipeBuilder::replaceWithFinalIfReductionStore(
6369 VPInstruction *VPI, VPBuilder &FinalRedStoresBuilder) {
6370 StoreInst *SI;
6371 if ((SI = dyn_cast<StoreInst>(Val: VPI->getUnderlyingInstr())) &&
6372 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
6373 // Only create recipe for the final invariant store of the reduction.
6374 if (Legal->isInvariantStoreOfReduction(SI)) {
6375 VPValue *Val = VPI->getOperand(N: 0);
6376 VPValue *Addr = VPI->getOperand(N: 1);
6377 // We need to store the exiting value of the reduction, so use the blend
6378 // if tail folded.
6379 if (auto *Blend = VPlanPatternMatch::findUserOf<VPBlendRecipe>(V: Val))
6380 Val = Blend;
6381 [[maybe_unused]] auto *Rdx =
6382 VPlanPatternMatch::findUserOf<VPReductionPHIRecipe>(V: Val);
6383 assert((!Rdx || Rdx->getBackedgeValue() == Val) &&
6384 "Store of reduction thats not the backedge value?");
6385 auto *Recipe = new VPReplicateRecipe(
6386 SI, {Val, Addr}, true /* IsUniform */, nullptr /*Mask*/, *VPI, *VPI,
6387 VPI->getDebugLoc());
6388 FinalRedStoresBuilder.insert(R: Recipe);
6389 }
6390 VPI->eraseFromParent();
6391 return true;
6392 }
6393
6394 return false;
6395}
6396
6397VPSingleDefRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
6398 VFRange &Range) {
6399 auto *I = VPI->getUnderlyingInstr();
6400 bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6401 Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
6402 Range);
6403
6404 bool IsPredicated = CM.isPredicatedInst(I);
6405
6406 // Even if the instruction is not marked as uniform, there are certain
6407 // intrinsic calls that can be effectively treated as such, so we check for
6408 // them here. Conservatively, we only do this for scalable vectors, since
6409 // for fixed-width VFs we can always fall back on full scalarization.
6410 if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
6411 switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
6412 case Intrinsic::assume:
6413 case Intrinsic::lifetime_start:
6414 case Intrinsic::lifetime_end:
6415 // For scalable vectors if one of the operands is variant then we still
6416 // want to mark as uniform, which will generate one instruction for just
6417 // the first lane of the vector. We can't scalarize the call in the same
6418 // way as for fixed-width vectors because we don't know how many lanes
6419 // there are.
6420 //
6421 // The reasons for doing it this way for scalable vectors are:
6422 // 1. For the assume intrinsic generating the instruction for the first
6423 // lane is still be better than not generating any at all. For
6424 // example, the input may be a splat across all lanes.
6425 // 2. For the lifetime start/end intrinsics the pointer operand only
6426 // does anything useful when the input comes from a stack object,
6427 // which suggests it should always be uniform. For non-stack objects
6428 // the effect is to poison the object, which still allows us to
6429 // remove the call.
6430 IsUniform = true;
6431 break;
6432 default:
6433 break;
6434 }
6435 }
6436 VPValue *BlockInMask = nullptr;
6437 if (!IsPredicated) {
6438 // Finalize the recipe for Instr, first if it is not predicated.
6439 LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
6440 } else {
6441 LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
6442 // Instructions marked for predication are replicated and a mask operand is
6443 // added initially. Masked replicate recipes will later be placed under an
6444 // if-then construct to prevent side-effects. Generate recipes to compute
6445 // the block mask for this region.
6446 BlockInMask = VPI->getMask();
6447 }
6448
6449 // Note that there is some custom logic to mark some intrinsics as uniform
6450 // manually above for scalable vectors, which this assert needs to account for
6451 // as well.
6452 assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
6453 (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
6454 "Should not predicate a uniform recipe");
6455 if (IsUniform) {
6456 return VPBuilder::createSingleScalarOp(
6457 Opcode: VPI->getOpcode(), Operands: VPI->operandsWithoutMask(), Mask: BlockInMask, Flags: *VPI, Metadata: *VPI,
6458 DL: VPI->getDebugLoc(), UV: I);
6459 }
6460 auto *Recipe = new VPReplicateRecipe(I, VPI->operandsWithoutMask(),
6461 /*IsSingleScalar=*/false, BlockInMask,
6462 *VPI, *VPI, VPI->getDebugLoc());
6463 return Recipe;
6464}
6465
6466VPRecipeBase *
6467VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
6468 VFRange &Range) {
6469 assert(!R->isPhi() && "phis must be handled earlier");
6470 // First, check for specific widening recipes that deal with optimizing
6471 // truncates and memory operations.
6472 auto *VPI = cast<VPInstruction>(Val: R);
6473 assert(VPI->getOpcode() != Instruction::Call &&
6474 "Call should have been handled by makeCallWideningDecisions");
6475
6476 VPRecipeBase *Recipe;
6477 if (VPI->getOpcode() == Instruction::Trunc &&
6478 (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
6479 return Recipe;
6480
6481 // All widen recipes below deal only with VF > 1.
6482 if (LoopVectorizationPlanner::getDecisionAndClampRange(
6483 Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
6484 return nullptr;
6485
6486 Instruction *Instr = R->getUnderlyingInstr();
6487 assert(!is_contained({Instruction::Load, Instruction::Store},
6488 VPI->getOpcode()) &&
6489 "Should have been handled prior to this!");
6490
6491 if (!shouldWiden(I: Instr, Range))
6492 return nullptr;
6493
6494 if (VPI->getOpcode() == Instruction::GetElementPtr) {
6495 auto *GEP = cast<GetElementPtrInst>(Val: Instr);
6496 return new VPWidenGEPRecipe(GEP->getSourceElementType(),
6497 VPI->operandsWithoutMask(), *VPI,
6498 VPI->getDebugLoc(), GEP);
6499 }
6500
6501 if (Instruction::isCast(Opcode: VPI->getOpcode())) {
6502 auto *CI = cast<CastInst>(Val: Instr);
6503 auto *CastR = cast<VPInstructionWithType>(Val: VPI);
6504 return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(N: 0),
6505 CastR->getResultType(), CI, *VPI, *VPI,
6506 VPI->getDebugLoc());
6507 }
6508
6509 return tryToWiden(VPI);
6510}
6511
6512// To allow RUN_VPLAN_PASS to print the VPlan after VF/UF independent
6513// optimizations.
6514static void printOptimizedVPlan(VPlan &) {}
6515
6516VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan1() {
6517 bool IsInnerLoop = OrigLoop->isInnermost();
6518
6519 // Set up loop versioning for inner loops with memory runtime checks.
6520 // Outer loops don't have LoopAccessInfo since canVectorizeMemory() is not
6521 // called for them.
6522 std::optional<LoopVersioning> LVer;
6523 if (IsInnerLoop) {
6524 const LoopAccessInfo *LAI = Legal->getLAI();
6525 LVer.emplace(args: *LAI, args: LAI->getRuntimePointerChecking()->getChecks(), args&: OrigLoop,
6526 args&: LI, args&: DT, args: PSE.getSE());
6527 if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
6528 !LAI->getRuntimePointerChecking()->getDiffChecks()) {
6529 // Only use noalias metadata when using memory checks guaranteeing no
6530 // overlap across all iterations.
6531 LVer->prepareNoAliasMetadata();
6532 }
6533 }
6534
6535 // Create initial base VPlan0, to serve as common starting point for all
6536 // candidates built later for specific VF ranges.
6537 auto VPlan0 = VPlanTransforms::buildVPlan0(TheLoop: OrigLoop, LI&: *LI,
6538 InductionTy: Legal->getWidestInductionType(),
6539 PSE, LVer: LVer ? &*LVer : nullptr);
6540
6541 VPDominatorTree VPDT(*VPlan0);
6542 if (const LoopAccessInfo *LAI = Legal->getLAI())
6543 RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *VPlan0, PSE,
6544 LAI->getSymbolicStrides(), VPDT);
6545 RUN_VPLAN_PASS(VPlanTransforms::simplifyRecipes, *VPlan0);
6546 RUN_VPLAN_PASS(VPlanTransforms::removeDeadRecipes, *VPlan0);
6547
6548 // Create recipes for header phis. For outer loops, reductions, recurrences
6549 // and in-loop reductions are empty since legality doesn't detect them.
6550 if (!RUN_VPLAN_PASS(VPlanTransforms::createHeaderPhiRecipes, *VPlan0, PSE,
6551 *OrigLoop, VPDT, Legal->getInductionVars(),
6552 Legal->getReductionVars(),
6553 Legal->getFixedOrderRecurrences(),
6554 Config.getInLoopReductions(), Hints.allowReordering())) {
6555 return nullptr;
6556 }
6557
6558 if (const LoopAccessInfo *LAI = Legal->getLAI())
6559 RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *VPlan0, PSE,
6560 LAI->getSymbolicStrides(), VPDT);
6561
6562 // Add surviving induction predicates to PSE and check constraints.
6563 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
6564 bool OptForSize =
6565 !ForceVectorization &&
6566 (CM.EpilogueLoweringStatus == CM_EpilogueNotAllowedOptSize ||
6567 CM.EpilogueLoweringStatus == CM_EpilogueNotAllowedLowTripLoop);
6568 unsigned SCEVCheckThreshold = ForceVectorization
6569 ? PragmaVectorizeSCEVCheckThreshold
6570 : VectorizeSCEVCheckThreshold;
6571 if (!RUN_VPLAN_PASS(VPlanTransforms::finalizeSCEVPredicates, *VPlan0, PSE,
6572 OptForSize, SCEVCheckThreshold, ORE, OrigLoop))
6573 return nullptr;
6574
6575 RUN_VPLAN_PASS(VPlanTransforms::addMiddleCheck, *VPlan0);
6576
6577 // If we're vectorizing a loop with an uncountable exit, make sure that the
6578 // recipes are safe to handle.
6579 // TODO: Remove this once we can properly check the VPlan itself for both
6580 // the presence of an uncountable exit and the presence of stores in
6581 // the loop inside handleEarlyExits itself.
6582 UncountableExitStyle EEStyle = UncountableExitStyle::NoUncountableExit;
6583 if (Legal->hasUncountableEarlyExit())
6584 EEStyle = Legal->hasUncountableExitWithSideEffects()
6585 ? UncountableExitStyle::MaskedHandleExitInScalarLoop
6586 : UncountableExitStyle::ReadOnly;
6587
6588 if (!RUN_VPLAN_PASS(VPlanTransforms::handleEarlyExits, *VPlan0, EEStyle,
6589 OrigLoop, PSE, *DT, Legal->getAssumptionCache())) {
6590 return nullptr;
6591 }
6592
6593 RUN_VPLAN_PASS(VPlanTransforms::createLoopRegions, *VPlan0,
6594 getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()));
6595 if (CM.foldTailByMasking())
6596 RUN_VPLAN_PASS(VPlanTransforms::foldTailByMasking, *VPlan0);
6597 RUN_VPLAN_PASS(VPlanTransforms::introduceMasksAndLinearize, *VPlan0);
6598
6599 return VPlan0;
6600}
6601
6602void LoopVectorizationPlanner::buildVPlans(VPlan &VPlan1, ElementCount MinVF,
6603 ElementCount MaxVF) {
6604 if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
6605 return;
6606
6607 auto MaxVFTimes2 = MaxVF * 2;
6608 for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
6609 VFRange SubRange = {VF, MaxVFTimes2};
6610 auto Plan =
6611 tryToBuildVPlan(InitialPlan: std::unique_ptr<VPlan>(VPlan1.duplicate()), Range&: SubRange);
6612 VF = SubRange.End;
6613
6614 if (!Plan)
6615 continue;
6616
6617 // Now optimize the initial VPlan.
6618 RUN_VPLAN_PASS(VPlanTransforms::hoistPredicatedLoads, *Plan, PSE, OrigLoop);
6619 RUN_VPLAN_PASS(VPlanTransforms::sinkPredicatedStores, *Plan, PSE, OrigLoop);
6620 RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
6621 Config.getMinimalBitwidths());
6622 RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
6623 // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
6624 if (CM.foldTailWithEVL()) {
6625 RUN_VPLAN_PASS(VPlanTransforms::addExplicitVectorLength, *Plan,
6626 Config.getMaxSafeElements());
6627 RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
6628 }
6629
6630 if (auto P =
6631 RUN_VPLAN_PASS(VPlanTransforms::narrowInterleaveGroups, *Plan, TTI))
6632 VPlans.push_back(Elt: std::move(P));
6633
6634 RUN_VPLAN_PASS_NO_VERIFY(printOptimizedVPlan, *Plan);
6635 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
6636 VPlans.push_back(Elt: std::move(Plan));
6637 }
6638}
6639
6640VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VPlanPtr Plan,
6641 VFRange &Range) {
6642
6643 // For outer loops, the plan only needs basic recipe conversion and induction
6644 // live-out optimization; the full inner-loop recipe building below does not
6645 // apply (no widening decisions, interleave groups, reductions, etc.).
6646 if (Plan->isOuterLoop()) {
6647 for (ElementCount VF : Range)
6648 Plan->addVF(VF);
6649 if (!RUN_VPLAN_PASS(VPlanTransforms::tryToConvertVPInstructionsToVPRecipes,
6650 *Plan, *TLI))
6651 return nullptr;
6652 RUN_VPLAN_PASS(VPlanTransforms::optimizeInductionLiveOutUsers, *Plan, PSE,
6653 /*FoldTail=*/false);
6654 return Plan;
6655 }
6656
6657 using namespace llvm::VPlanPatternMatch;
6658 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
6659
6660 // ---------------------------------------------------------------------------
6661 // Build initial VPlan: Scan the body of the loop in a topological order to
6662 // visit each basic block after having visited its predecessor basic blocks.
6663 // ---------------------------------------------------------------------------
6664
6665 bool RequiresScalarEpilogueCheck =
6666 LoopVectorizationPlanner::getDecisionAndClampRange(
6667 Predicate: [this](ElementCount VF) {
6668 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
6669 },
6670 Range);
6671 // Update the branch in the middle block if a scalar epilogue is required.
6672 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
6673 if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
6674 auto *BranchOnCond = cast<VPInstruction>(Val: MiddleVPBB->getTerminator());
6675 assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
6676 "second successor must be scalar preheader");
6677 BranchOnCond->setOperand(I: 0, New: Plan->getFalse());
6678 }
6679
6680 // Don't use getDecisionAndClampRange here, because we don't know the UF
6681 // so this function is better to be conservative, rather than to split
6682 // it up into different VPlans.
6683 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
6684 bool IVUpdateMayOverflow = false;
6685 for (ElementCount VF : Range)
6686 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
6687
6688 TailFoldingStyle Style = CM.getTailFoldingStyle();
6689 // Use NUW for the induction increment if we proved that it won't overflow in
6690 // the vector loop or when not folding the tail. In the later case, we know
6691 // that the canonical induction increment will not overflow as the vector trip
6692 // count is >= increment and a multiple of the increment.
6693 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
6694 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
6695 if (!HasNUW) {
6696 auto *IVInc =
6697 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(N: 0);
6698 assert(match(IVInc,
6699 m_VPInstruction<Instruction::Add>(
6700 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
6701 "Did not find the canonical IV increment");
6702 LoopRegion->clearCanonicalIVNUW(Increment: cast<VPInstruction>(Val: IVInc));
6703 }
6704
6705 // ---------------------------------------------------------------------------
6706 // Pre-construction: record ingredients whose recipes we'll need to further
6707 // process after constructing the initial VPlan.
6708 // ---------------------------------------------------------------------------
6709
6710 // For each interleave group which is relevant for this (possibly trimmed)
6711 // Range, add it to the set of groups to be later applied to the VPlan and add
6712 // placeholders for its members' Recipes which we'll be replacing with a
6713 // single VPInterleaveRecipe.
6714 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
6715 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
6716 bool Result = (VF.isVector() && // Query is illegal for VF == 1
6717 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
6718 LoopVectorizationCostModel::CM_Interleave);
6719 // For scalable vectors, the interleave factors must be <= 8 since we
6720 // require the (de)interleaveN intrinsics instead of shufflevectors.
6721 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
6722 "Unsupported interleave factor for scalable vectors");
6723 return Result;
6724 };
6725 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
6726 continue;
6727 InterleaveGroups.insert(Ptr: IG);
6728 }
6729
6730 // ---------------------------------------------------------------------------
6731 // Construct wide recipes and apply predication for original scalar
6732 // VPInstructions in the loop.
6733 // ---------------------------------------------------------------------------
6734 VPRecipeBuilder RecipeBuilder(*Plan, Legal, CM, Builder);
6735
6736 // Scan the body of the loop in a topological order to visit each basic block
6737 // after having visited its predecessor basic blocks.
6738 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
6739 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
6740 HeaderVPBB);
6741
6742 RUN_VPLAN_PASS(VPlanTransforms::createInLoopReductionRecipes, *Plan,
6743 Range.Start);
6744
6745 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, Config.CostKind, CM.PSE,
6746 OrigLoop);
6747
6748 RUN_VPLAN_PASS(VPlanTransforms::makeMemOpWideningDecisions, *Plan, Range,
6749 RecipeBuilder, CM.PSE, OrigLoop);
6750
6751 RUN_VPLAN_PASS(VPlanTransforms::makeScalarizationDecisions, *Plan, Range);
6752
6753 RUN_VPLAN_PASS(VPlanTransforms::makeCallWideningDecisions, *Plan, Range,
6754 RecipeBuilder, CostCtx);
6755
6756 // Now process all other blocks and instructions.
6757 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range&: RPOT)) {
6758 // Convert input VPInstructions to widened recipes.
6759 for (VPRecipeBase &R : make_early_inc_range(
6760 Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end()))) {
6761 // Skip recipes that do not need transforming or have already been
6762 // transformed.
6763 if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe,
6764 VPReplicateRecipe, VPWidenLoadRecipe, VPWidenStoreRecipe,
6765 VPWidenCallRecipe, VPWidenIntrinsicRecipe, VPVectorPointerRecipe,
6766 VPVectorEndPointerRecipe, VPHistogramRecipe>(Val: &R) ||
6767 (isa<VPInstructionWithType>(Val: R) &&
6768 Instruction::isCast(Opcode: cast<VPInstructionWithType>(Val&: R).getOpcode()) &&
6769 vputils::onlyFirstLaneUsed(Def: R.getVPSingleValue())))
6770 continue;
6771 auto *VPI = cast<VPInstruction>(Val: &R);
6772 if (!VPI->getUnderlyingValue())
6773 continue;
6774
6775 // TODO: Gradually replace uses of underlying instruction by analyses on
6776 // VPlan. Migrate code relying on the underlying instruction from VPlan0
6777 // to construct recipes below to not use the underlying instruction.
6778 Instruction *Instr = cast<Instruction>(Val: VPI->getUnderlyingValue());
6779 Builder.setInsertPoint(VPI);
6780
6781 VPRecipeBase *Recipe =
6782 RecipeBuilder.tryToCreateWidenNonPhiRecipe(R: VPI, Range);
6783 if (!Recipe)
6784 Recipe =
6785 RecipeBuilder.handleReplication(VPI: cast<VPInstruction>(Val: VPI), Range);
6786
6787 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
6788 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
6789 // moved to the phi section in the header.
6790 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
6791 } else {
6792 Builder.insert(R: Recipe);
6793 }
6794 if (Recipe->getNumDefinedValues() == 1) {
6795 VPI->replaceAllUsesWith(New: Recipe->getVPSingleValue());
6796 } else {
6797 assert(Recipe->getNumDefinedValues() == 0 &&
6798 "Unexpected multidef recipe");
6799 }
6800 R.eraseFromParent();
6801 }
6802 }
6803
6804 assert(isa<VPRegionBlock>(LoopRegion) &&
6805 !LoopRegion->getEntryBasicBlock()->empty() &&
6806 "entry block must be set to a VPRegionBlock having a non-empty entry "
6807 "VPBasicBlock");
6808
6809 RUN_VPLAN_PASS(VPlanTransforms::adjustFirstOrderRecurrenceMiddleUsers, *Plan,
6810 Range);
6811
6812 // ---------------------------------------------------------------------------
6813 // Transform initial VPlan: Apply previously taken decisions, in order, to
6814 // bring the VPlan to its final state.
6815 // ---------------------------------------------------------------------------
6816
6817 addReductionResultComputation(Plan, RecipeBuilder, MinVF: Range.Start);
6818
6819 // Optimize FindIV reductions to use sentinel-based approach when possible.
6820 RUN_VPLAN_PASS(VPlanTransforms::optimizeFindIVReductions, *Plan, PSE,
6821 *OrigLoop);
6822 RUN_VPLAN_PASS(VPlanTransforms::optimizeInductionLiveOutUsers, *Plan, PSE,
6823 CM.foldTailByMasking());
6824
6825 // Apply mandatory transformation to handle reductions with multiple in-loop
6826 // uses if possible, bail out otherwise.
6827 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMultiUseReductions, *Plan, ORE,
6828 OrigLoop))
6829 return nullptr;
6830 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
6831 // NaNs if possible, bail out otherwise.
6832 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMaxMinNumReductions, *Plan))
6833 return nullptr;
6834
6835 // Create whole-vector selects for find-last recurrences.
6836 if (!RUN_VPLAN_PASS(VPlanTransforms::handleFindLastReductions, *Plan))
6837 return nullptr;
6838
6839 RUN_VPLAN_PASS(VPlanTransforms::removeBranchOnConst, *Plan, false);
6840
6841 // Create partial reduction recipes for scaled reductions and transform
6842 // recipes to abstract recipes if it is legal and beneficial and clamp the
6843 // range for better cost estimation.
6844 // TODO: Enable following transform when the EVL-version of extended-reduction
6845 // and mulacc-reduction are implemented.
6846 if (!CM.foldTailWithEVL()) {
6847 RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
6848 Range);
6849 RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
6850 Range);
6851 }
6852
6853 // Interleave memory: for each Interleave Group we marked earlier as relevant
6854 // for this VPlan, replace the Recipes widening its memory instructions with a
6855 // single VPInterleaveRecipe at its insertion point.
6856 RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
6857 InterleaveGroups, CM.isEpilogueAllowed());
6858
6859 // Convert memory recipes to strided access recipes if the strided access is
6860 // legal and profitable.
6861 RUN_VPLAN_PASS(VPlanTransforms::convertToStridedAccesses, *Plan, PSE,
6862 *OrigLoop, CostCtx, Range);
6863
6864 // Ensure scalar VF plans only contain VF=1, as required by hasScalarVFOnly.
6865 if (Range.Start.isScalar())
6866 Range.End = Range.Start * 2;
6867
6868 for (ElementCount VF : Range)
6869 Plan->addVF(VF);
6870 Plan->setName("Initial VPlan");
6871
6872 RUN_VPLAN_PASS(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan);
6873
6874 if (useActiveLaneMask(Style)) {
6875 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
6876 // TailFoldingStyle is visible there.
6877 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
6878 RUN_VPLAN_PASS(VPlanTransforms::addActiveLaneMask, *Plan, ForControlFlow);
6879 }
6880
6881 if (CM.maskPartialAliasing())
6882 RUN_VPLAN_PASS(VPlanTransforms::attachAliasMaskToHeaderMask, *Plan);
6883
6884 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
6885 return Plan;
6886}
6887
6888void LoopVectorizationPlanner::addReductionResultComputation(
6889 VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
6890 using namespace VPlanPatternMatch;
6891 VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
6892 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
6893 VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
6894 Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
6895 VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
6896 VPValue *HeaderMask = vputils::findHeaderMask(Plan&: *Plan);
6897 for (VPRecipeBase &R :
6898 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
6899 VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
6900 if (!PhiR)
6901 continue;
6902
6903 RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
6904 const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
6905 PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
6906 Type *PhiTy = PhiR->getScalarType();
6907
6908 // Convert a VPBlendRecipe backedge to a select.
6909 if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: PhiR->getBackedgeValue())) {
6910 if (Blend->getNumIncomingValues() == 2 &&
6911 Blend->getMask(Idx: 0) == HeaderMask) {
6912 auto *Sel = VPBuilder(Blend).createSelect(
6913 Cond: Blend->getMask(Idx: 0), TrueVal: Blend->getIncomingValue(Idx: 0),
6914 FalseVal: Blend->getIncomingValue(Idx: 1), DL: {}, Name: "", Flags: *Blend);
6915 Blend->replaceAllUsesWith(New: Sel);
6916 Blend->eraseFromParent();
6917 }
6918 }
6919
6920 auto *OrigExitingVPV = PhiR->getBackedgeValue();
6921 auto *NewExitingVPV = OrigExitingVPV;
6922
6923 // Remove the predicated select if the target doesn't want it.
6924 VPValue *V;
6925 if (!CM.usePredicatedReductionSelect(RecurrenceKind) &&
6926 match(V: PhiR->getBackedgeValue(),
6927 P: m_Select(Op0: m_Specific(VPV: HeaderMask), Op1: m_VPValue(V), Op2: m_Specific(VPV: PhiR))))
6928 PhiR->setBackedgeValue(V);
6929
6930 // We want code in the middle block to appear to execute on the location of
6931 // the scalar loop's latch terminator because: (a) it is all compiler
6932 // generated, (b) these instructions are always executed after evaluating
6933 // the latch conditional branch, and (c) other passes may add new
6934 // predecessors which terminate on this line. This is the easiest way to
6935 // ensure we don't accidentally cause an extra step back into the loop while
6936 // debugging.
6937 DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();
6938
6939 // TODO: At the moment ComputeReductionResult also drives creation of the
6940 // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
6941 // even for in-loop reductions, until the reduction resume value handling is
6942 // also modeled in VPlan.
6943 VPInstruction *FinalReductionResult;
6944 VPBuilder::InsertPointGuard Guard(Builder);
6945 Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
6946 // For AnyOf reductions, find the select among PhiR's users and convert
6947 // the reduction phi to operate on bools before creating the final
6948 // reduction result.
6949 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
6950 auto *AnyOfSelect =
6951 cast<VPSingleDefRecipe>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
6952 return match(U, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()));
6953 }));
6954 VPValue *Start = PhiR->getStartValue();
6955 bool TrueValIsPhi = AnyOfSelect->getOperand(N: 1) == PhiR;
6956 // NewVal is the non-phi operand of the select.
6957 VPValue *NewVal = TrueValIsPhi ? AnyOfSelect->getOperand(N: 2)
6958 : AnyOfSelect->getOperand(N: 1);
6959
6960 // Adjust AnyOf reductions; replace the reduction phi for the selected
6961 // value with a boolean reduction phi node to check if the condition is
6962 // true in any iteration. The final value is selected by the final
6963 // ComputeReductionResult.
6964 VPValue *Cmp = AnyOfSelect->getOperand(N: 0);
6965 // If the compare is checking the reduction PHI node, adjust it to check
6966 // the start value.
6967 if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
6968 CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
6969 Builder.setInsertPoint(AnyOfSelect);
6970
6971 // If the true value of the select is the reduction phi, the new value
6972 // is selected if the negated condition is true in any iteration.
6973 if (TrueValIsPhi)
6974 Cmp = Builder.createNot(Operand: Cmp);
6975
6976 // Build a fresh i1 chain (phi, or, and i1 versions of any blend/select
6977 // the exiting value flows through).
6978 auto *NewPhiR =
6979 PhiR->cloneWithOperands(Start: Plan->getFalse(), BackedgeValue: Plan->getFalse());
6980 NewPhiR->insertBefore(InsertPos: PhiR);
6981 VPValue *NewExiting = Builder.createOr(LHS: NewPhiR, RHS: Cmp);
6982
6983 // The exiting value may flow through a chain of VPBlendRecipes and
6984 // select recipes (VPInstruction, VPWidenRecipe or VPReplicateRecipe with
6985 // Select opcode) before reaching OrigExitingVPV. Clone each chain link
6986 // in topological order so each clone refers to the already-rewritten i1
6987 // operands via Substitutions.
6988 DenseMap<VPValue *, VPValue *> Substitutions = {{AnyOfSelect, NewExiting},
6989 {PhiR, NewPhiR}};
6990 std::function<void(VPSingleDefRecipe *)> CloneChain =
6991 [&](VPSingleDefRecipe *Old) {
6992 if (Substitutions.contains(Val: Old))
6993 return;
6994 SmallVector<VPValue *> NewOps;
6995 for (VPValue *Op : Old->operands()) {
6996 if (isa<VPBlendRecipe>(Val: Op) ||
6997 match(V: Op, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue())))
6998 CloneChain(cast<VPSingleDefRecipe>(Val: Op));
6999 NewOps.push_back(Elt: Substitutions.lookup_or(Val: Op, Default&: Op));
7000 }
7001 VPSingleDefRecipe *New;
7002 if (auto *B = dyn_cast<VPBlendRecipe>(Val: Old))
7003 New = B->cloneWithOperands(NewOperands: NewOps);
7004 else if (auto *W = dyn_cast<VPWidenRecipe>(Val: Old))
7005 New = W->cloneWithOperands(NewOperands: NewOps);
7006 else if (auto *Rep = dyn_cast<VPReplicateRecipe>(Val: Old))
7007 New = Rep->cloneWithOperands(NewOperands: NewOps);
7008 else
7009 New = cast<VPInstruction>(Val: Old)->cloneWithOperands(NewOperands: NewOps);
7010 New->insertBefore(InsertPos: Old);
7011 Substitutions[Old] = New;
7012 };
7013
7014 if (OrigExitingVPV != AnyOfSelect) {
7015 CloneChain(cast<VPSingleDefRecipe>(Val: OrigExitingVPV));
7016 NewExiting = Substitutions.lookup(Val: OrigExitingVPV);
7017 }
7018 NewPhiR->setOperand(I: 1, New: NewExiting);
7019 PhiR->replaceAllUsesWith(New: Plan->getPoison(Ty: PhiR->getScalarType()));
7020
7021 Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
7022 FinalReductionResult =
7023 Builder.createAnyOfReduction(ChainOp: NewExiting, TrueVal: NewVal, FalseVal: Start, DL: ExitDL);
7024 } else {
7025 // If the vector reduction can be performed in a smaller type, we
7026 // truncate then extend the loop exit value to enable InstCombine to
7027 // evaluate the entire expression in the smaller type.
7028 VPValue *ReductionOp = NewExitingVPV;
7029 Instruction::CastOps ExtendOpc = Instruction::CastOpsEnd;
7030 if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
7031 assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
7032 assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
7033 "Unexpected truncated min-max recurrence!");
7034 Type *RdxTy = RdxDesc.getRecurrenceType();
7035 ExtendOpc = RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
7036 {
7037 VPBuilder::InsertPointGuard Guard(Builder);
7038 Builder.setInsertPoint(
7039 TheBB: NewExitingVPV->getDefiningRecipe()->getParent(),
7040 IP: std::next(x: NewExitingVPV->getDefiningRecipe()->getIterator()));
7041 ReductionOp =
7042 Builder.createWidenCast(Opcode: Instruction::Trunc, Op: NewExitingVPV, ResultTy: RdxTy);
7043 VPWidenCastRecipe *Extnd =
7044 Builder.createWidenCast(Opcode: ExtendOpc, Op: ReductionOp, ResultTy: PhiTy);
7045 if (PhiR->getOperand(N: 1) == NewExitingVPV)
7046 PhiR->setOperand(I: 1, New: Extnd);
7047 }
7048 }
7049
7050 VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
7051 PhiR->getFastMathFlagsOrNone());
7052 FinalReductionResult = Builder.createNaryOp(
7053 Opcode: VPInstruction::ComputeReductionResult, Operands: {ReductionOp}, Flags, DL: ExitDL);
7054 if (ExtendOpc != Instruction::CastOpsEnd)
7055 FinalReductionResult = Builder.createScalarCast(
7056 Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
7057 }
7058
7059 // Update all users outside the vector region. Also replace redundant
7060 // extracts.
7061 for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
7062 auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
7063 if (FinalReductionResult == U || Parent->getParent())
7064 continue;
7065 // Skip ComputeReductionResult and FindIV reductions when they are not the
7066 // final result.
7067 if (match(U, P: m_VPInstruction<VPInstruction::ComputeReductionResult>()) ||
7068 (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind) &&
7069 match(U, P: m_VPInstruction<Instruction::ICmp>())))
7070 continue;
7071 U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);
7072
7073 // Look through ExtractLastPart.
7074 if (match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
7075 U = cast<VPInstruction>(Val: U)->getSingleUser();
7076
7077 if (match(U, P: m_CombineOr(Ps: m_ExtractLane(Op0: m_VPValue(), Op1: m_VPValue()),
7078 Ps: m_ExtractLastLane(Op0: m_VPValue()))))
7079 cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
7080 }
7081
7082 RecurKind RK = PhiR->getRecurrenceKind();
7083 if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
7084 !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
7085 !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) &&
7086 !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RK))) {
7087 VPBuilder PHBuilder(Plan->getVectorPreheader());
7088 VPValue *Iden = Plan->getOrAddLiveIn(
7089 V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: PhiR->getFastMathFlagsOrNone()));
7090 auto *ScaleFactorVPV = Plan->getConstantInt(BitWidth: 32, Val: 1);
7091 VPValue *StartV = PHBuilder.createNaryOp(
7092 Opcode: VPInstruction::ReductionStartVector,
7093 Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV}, Flags: *PhiR);
7094 PhiR->setOperand(I: 0, New: StartV);
7095 }
7096 }
7097
7098 RUN_VPLAN_PASS(VPlanTransforms::clearReductionWrapFlags, *Plan);
7099}
7100
7101void LoopVectorizationPlanner::attachRuntimeChecks(
7102 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
7103 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
7104 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(N: 0)) {
7105 assert((!Config.OptForSize ||
7106 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
7107 "Cannot SCEV check stride or overflow when optimizing for size");
7108 RUN_VPLAN_PASS(VPlanTransforms::attachCheckBlock, Plan, SCEVCheckCond,
7109 SCEVCheckBlock, HasBranchWeights);
7110 }
7111 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
7112 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(N: 0)) {
7113 // VPlan-native path does not do any analysis for runtime checks
7114 // currently.
7115 assert((!EnableVPlanNativePath || !Plan.isOuterLoop()) &&
7116 "Runtime checks are not supported for outer loops yet");
7117
7118 if (Config.OptForSize) {
7119 assert(
7120 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
7121 "Cannot emit memory checks when optimizing for size, unless forced "
7122 "to vectorize.");
7123 ORE->emit(RemarkBuilder: [&]() {
7124 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
7125 OrigLoop->getStartLoc(),
7126 OrigLoop->getHeader())
7127 << "Code-size may be reduced by not forcing "
7128 "vectorization, or by source-code modifications "
7129 "eliminating the need for runtime checks "
7130 "(e.g., adding 'restrict').";
7131 });
7132 }
7133 RUN_VPLAN_PASS(VPlanTransforms::attachCheckBlock, Plan, MemCheckCond,
7134 MemCheckBlock, HasBranchWeights);
7135 }
7136}
7137
7138void LoopVectorizationPlanner::addMinimumIterationCheck(
7139 VPlan &Plan, ElementCount VF, unsigned UF,
7140 ElementCount MinProfitableTripCount) const {
7141 const uint32_t *BranchWeights =
7142 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())
7143 ? &MinItersBypassWeights[0]
7144 : nullptr;
7145 RUN_VPLAN_PASS(VPlanTransforms::addMinimumIterationCheck, Plan, VF, UF,
7146 MinProfitableTripCount,
7147 CM.requiresScalarEpilogue(VF.isVector()),
7148 CM.foldTailByMasking(), OrigLoop, BranchWeights,
7149 OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(),
7150 PSE, Plan.getEntry());
7151}
7152
7153// Determine how to lower the epilogue, which depends on 1) optimising
7154// for minimum code-size, 2) tail-folding compiler options, 3) loop
7155// hints forcing tail-folding, and 4) a TTI hook that analyses whether the loop
7156// is suitable for tail-folding.
7157// This function determines epilogue lowering for the main vector loop while
7158// epilogue lowering for the tail-folded epilogue path will be handled
7159// separately in getEpilogueTailLowering.
7160static EpilogueLowering
7161getEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints,
7162 bool OptForSize, TargetTransformInfo *TTI,
7163 TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL,
7164 InterleavedAccessInfo *IAI) {
7165 // 1) OptSize takes precedence over all other options, i.e. if this is set,
7166 // don't look at hints or options, and don't request an epilogue.
7167 if (F->hasOptSize() ||
7168 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
7169 return CM_EpilogueNotAllowedOptSize;
7170
7171 // 2) If set, obey the directives
7172 if (TailFoldingPolicy.getNumOccurrences()) {
7173 switch (TailFoldingPolicy) {
7174 case TailFoldingPolicyTy::None:
7175 return CM_EpilogueAllowed;
7176 case TailFoldingPolicyTy::PreferFoldTail:
7177 return CM_EpilogueNotNeededFoldTail;
7178 case TailFoldingPolicyTy::MustFoldTail:
7179 return CM_EpilogueNotAllowedFoldTail;
7180 };
7181 }
7182
7183 // 3) If set, obey the hints
7184 switch (Hints.getPredicate()) {
7185 case LoopVectorizeHints::FK_Enabled:
7186 return CM_EpilogueNotNeededFoldTail;
7187 case LoopVectorizeHints::FK_Disabled:
7188 return CM_EpilogueAllowed;
7189 };
7190
7191 // 4) if the TTI hook indicates this is profitable, request tail-folding.
7192 TailFoldingInfo TFI(TLI, &LVL, IAI);
7193 if (TTI->preferTailFoldingOverEpilogue(TFI: &TFI))
7194 return CM_EpilogueNotNeededFoldTail;
7195
7196 return CM_EpilogueAllowed;
7197}
7198
7199/// Determine how to lower the epilogue for the vector epilogue loop.
7200/// Check if there are any conflicts that prevent tail-folding the epilogue.
7201/// \return CM_EpilogueNotNeededFoldTail if epilogue tail-folding is possible,
7202/// otherwise CM_EpilogueAllowed.
7203static EpilogueLowering
7204getEpilogueTailLowering(const LoopVectorizationCostModel &MainCM, const Loop *L,
7205 OptimizationRemarkEmitter *ORE) {
7206 // Epilogue TF is only enabled when explicitly requested via command line.
7207 if (!EpilogueTailFoldingPolicy.getNumOccurrences() ||
7208 EpilogueTailFoldingPolicy != TailFoldingPolicyTy::PreferFoldTail)
7209 return CM_EpilogueAllowed;
7210
7211 if (!EnableEpilogueVectorization) {
7212 reportVectorizationInfo(
7213 Msg: "Options conflict, epilogue vectorization is disallowed while "
7214 "epilogue tail-folding allowed!\n",
7215 ORETag: "UnsupportedEpilogueTailFoldingPolicy", ORE, TheLoop: L);
7216 return CM_EpilogueAllowed;
7217 }
7218
7219 // If scalar epilogue is explicitly required, we can't apply TF.
7220 if (MainCM.requiresScalarEpilogue(/*IsVectorizing*/ true)) {
7221 LLVM_DEBUG(dbgs() << "LV: Epilogue tail-folding can't be applied because "
7222 "scalar epilogue is required\n"
7223 "LV: Fall back to a normal epilogue\n");
7224 return CM_EpilogueAllowed;
7225 }
7226
7227 // If having epilogue is NOT allowed, then no epilogue to apply TF for.
7228 if (!MainCM.isEpilogueAllowed()) {
7229 LLVM_DEBUG(dbgs() << "LV: No epilogue to apply tail-folding for.\n"
7230 "LV: Fall back to a normal epilogue\n");
7231 return CM_EpilogueAllowed;
7232 }
7233
7234 // We can apply tail-folding on the vectorized epilogue loop.
7235 return CM_EpilogueNotNeededFoldTail;
7236}
7237
7238// Emit a remark if there are stores to floats that required a floating point
7239// extension. If the vectorized loop was generated with floating point there
7240// will be a performance penalty from the conversion overhead and the change in
7241// the vector width.
7242static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
7243 SmallVector<Instruction *, 4> Worklist;
7244 for (BasicBlock *BB : L->getBlocks()) {
7245 for (Instruction &Inst : *BB) {
7246 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
7247 if (S->getValueOperand()->getType()->isFloatTy())
7248 Worklist.push_back(Elt: S);
7249 }
7250 }
7251 }
7252
7253 // Traverse the floating point stores upwards searching, for floating point
7254 // conversions.
7255 SmallPtrSet<const Instruction *, 4> Visited;
7256 SmallPtrSet<const Instruction *, 4> EmittedRemark;
7257 while (!Worklist.empty()) {
7258 auto *I = Worklist.pop_back_val();
7259 if (!L->contains(Inst: I))
7260 continue;
7261 if (!Visited.insert(Ptr: I).second)
7262 continue;
7263
7264 // Emit a remark if the floating point store required a floating
7265 // point conversion.
7266 // TODO: More work could be done to identify the root cause such as a
7267 // constant or a function return type and point the user to it.
7268 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
7269 ORE->emit(RemarkBuilder: [&]() {
7270 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
7271 I->getDebugLoc(), L->getHeader())
7272 << "floating point conversion changes vector width. "
7273 << "Mixed floating point precision requires an up/down "
7274 << "cast that will negatively impact performance.";
7275 });
7276
7277 for (Use &Op : I->operands())
7278 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
7279 Worklist.push_back(Elt: OpI);
7280 }
7281}
7282
7283/// For loops with uncountable early exits, find the cost of doing work when
7284/// exiting the loop early, such as calculating the final exit values of
7285/// variables used outside the loop.
7286/// TODO: This is currently overly pessimistic because the loop may not take
7287/// the early exit, but better to keep this conservative for now. In future,
7288/// it might be possible to relax this by using branch probabilities.
7289static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
7290 VPlan &Plan, ElementCount VF) {
7291 InstructionCost Cost = 0;
7292 for (auto *ExitVPBB : Plan.getExitBlocks()) {
7293 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
7294 // If the predecessor is not the middle.block, then it must be the
7295 // vector.early.exit block, which may contain work to calculate the exit
7296 // values of variables used outside the loop.
7297 if (PredVPBB != Plan.getMiddleBlock()) {
7298 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
7299 << PredVPBB->getName() << ":\n");
7300 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
7301 }
7302 }
7303 }
7304 return Cost;
7305}
7306
7307/// This function determines whether or not it's still profitable to vectorize
7308/// the loop given the extra work we have to do outside of the loop:
7309/// 1. Perform the runtime checks before entering the loop to ensure it's safe
7310/// to vectorize.
7311/// 2. In the case of loops with uncountable early exits, we may have to do
7312/// extra work when exiting the loop early, such as calculating the final
7313/// exit values of variables used outside the loop.
7314/// 3. The middle block.
7315static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
7316 VectorizationFactor &VF, Loop *L,
7317 PredicatedScalarEvolution &PSE,
7318 VPCostContext &CostCtx, VPlan &Plan,
7319 EpilogueLowering SEL,
7320 std::optional<unsigned> VScale) {
7321 InstructionCost RtC = Checks.getCost();
7322 if (!RtC.isValid())
7323 return false;
7324
7325 // When interleaving only scalar and vector cost will be equal, which in turn
7326 // would lead to a divide by 0. Fall back to hard threshold.
7327 if (VF.Width.isScalar()) {
7328 // TODO: Should we rename VectorizeMemoryCheckThreshold?
7329 if (RtC > VectorizeMemoryCheckThreshold) {
7330 LLVM_DEBUG(
7331 dbgs()
7332 << "LV: Interleaving only is not profitable due to runtime checks\n");
7333 return false;
7334 }
7335 return true;
7336 }
7337
7338 // The scalar cost should only be 0 when vectorizing with a user specified
7339 // VF/IC. In those cases, runtime checks should always be generated.
7340 uint64_t ScalarC = VF.ScalarCost.getValue();
7341 if (ScalarC == 0)
7342 return true;
7343
7344 InstructionCost TotalCost = RtC;
7345 // Add on the cost of any work required in the vector early exit block, if
7346 // one exists.
7347 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
7348 TotalCost += Plan.getMiddleBlock()->cost(VF: VF.Width, Ctx&: CostCtx);
7349
7350 // First, compute the minimum iteration count required so that the vector
7351 // loop outperforms the scalar loop.
7352 // The total cost of the scalar loop is
7353 // ScalarC * TC
7354 // where
7355 // * TC is the actual trip count of the loop.
7356 // * ScalarC is the cost of a single scalar iteration.
7357 //
7358 // The total cost of the vector loop is
7359 // TotalCost + VecC * (TC / VF) + EpiC
7360 // where
7361 // * TotalCost is the sum of the costs cost of
7362 // - the generated runtime checks, i.e. RtC
7363 // - performing any additional work in the vector.early.exit block for
7364 // loops with uncountable early exits.
7365 // - the middle block, if ExpectedTC <= VF.Width.
7366 // * VecC is the cost of a single vector iteration.
7367 // * TC is the actual trip count of the loop
7368 // * VF is the vectorization factor
7369 // * EpiCost is the cost of the generated epilogue, including the cost
7370 // of the remaining scalar operations.
7371 //
7372 // Vectorization is profitable once the total vector cost is less than the
7373 // total scalar cost:
7374 // TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
7375 //
7376 // Now we can compute the minimum required trip count TC as
7377 // VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
7378 //
7379 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
7380 // the computations are performed on doubles, not integers and the result
7381 // is rounded up, hence we get an upper estimate of the TC.
7382 unsigned IntVF = estimateElementCount(VF: VF.Width, VScale);
7383 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
7384 uint64_t MinTC1 =
7385 Div == 0 ? 0 : divideCeil(Numerator: TotalCost.getValue() * IntVF, Denominator: Div);
7386
7387 // Second, compute a minimum iteration count so that the cost of the
7388 // runtime checks is only a fraction of the total scalar loop cost. This
7389 // adds a loop-dependent bound on the overhead incurred if the runtime
7390 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
7391 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
7392 // cost, compute
7393 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
7394 uint64_t MinTC2 = divideCeil(Numerator: RtC.getValue() * 10, Denominator: ScalarC);
7395
7396 // Now pick the larger minimum. If it is not a multiple of VF and an epilogue
7397 // is allowed, choose the next closest multiple of VF. This should partly
7398 // compensate for ignoring the epilogue cost.
7399 uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
7400 if (SEL == CM_EpilogueAllowed)
7401 MinTC = alignTo(Value: MinTC, Align: IntVF);
7402 VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);
7403
7404 LLVM_DEBUG(
7405 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
7406 << VF.MinProfitableTripCount << "\n");
7407
7408 // Skip vectorization if the expected trip count is less than the minimum
7409 // required trip count.
7410 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
7411 if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
7412 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
7413 "trip count < minimum profitable VF ("
7414 << *ExpectedTC << " < " << VF.MinProfitableTripCount
7415 << ")\n");
7416
7417 return false;
7418 }
7419 }
7420 return true;
7421}
7422
7423LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
7424 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
7425 !EnableLoopInterleaving),
7426 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
7427 !EnableLoopVectorization) {}
7428
7429/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
7430/// vectorization.
7431static SmallVector<VPInstruction *>
7432preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
7433 using namespace VPlanPatternMatch;
7434 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
7435 // introduce multiple uses of undef/poison. If the reduction start value may
7436 // be undef or poison it needs to be frozen and the frozen start has to be
7437 // used when computing the reduction result. We also need to use the frozen
7438 // value in the resume phi generated by the main vector loop, as this is also
7439 // used to compute the reduction result after the epilogue vector loop.
7440 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
7441 bool UpdateResumePhis) {
7442 VPBuilder Builder(Plan.getEntry());
7443 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
7444 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
7445 if (!VPI)
7446 continue;
7447 VPValue *OrigStart;
7448 if (!matchFindIVResult(VPI, ReducedIV: m_VPValue(), Start: m_VPValue(V&: OrigStart)))
7449 continue;
7450 if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
7451 continue;
7452 VPInstruction *Freeze =
7453 Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, DL: {}, Name: "fr");
7454 VPI->setOperand(I: 2, New: Freeze);
7455 if (UpdateResumePhis)
7456 OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
7457 return Freeze != &U && isa<VPPhi>(Val: &U);
7458 });
7459 }
7460 };
7461 AddFreezeForFindLastIVReductions(MainPlan, true);
7462 AddFreezeForFindLastIVReductions(EpiPlan, false);
7463
7464 VPValue *VectorTC = nullptr;
7465 auto *Term =
7466 MainPlan.getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
7467 [[maybe_unused]] bool MatchedTC =
7468 match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue(V&: VectorTC)));
7469 assert(MatchedTC && "must match vector trip count");
7470
7471 // If there is a suitable resume value for the canonical induction in the
7472 // scalar (which will become vector) epilogue loop, use it and move it to the
7473 // beginning of the scalar preheader. Otherwise create it below.
7474 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
7475 auto ResumePhiIter =
7476 find_if(Range: MainScalarPH->phis(), P: [VectorTC](VPRecipeBase &R) {
7477 return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Ops: m_Specific(VPV: VectorTC),
7478 Ops: m_ZeroInt()));
7479 });
7480 VPPhi *ResumePhi = nullptr;
7481 if (ResumePhiIter == MainScalarPH->phis().end()) {
7482 assert(MainPlan.getVectorLoopRegion()->getCanonicalIV() &&
7483 "canonical IV must exist");
7484 Type *Ty = VectorTC->getScalarType();
7485 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
7486 ResumePhi = ScalarPHBuilder.createScalarPhi(
7487 IncomingValues: {VectorTC, MainPlan.getZero(Ty)}, DL: {}, Name: "vec.epilog.resume.val");
7488 } else {
7489 ResumePhi = cast<VPPhi>(Val: &*ResumePhiIter);
7490 ResumePhi->setName("vec.epilog.resume.val");
7491 if (&MainScalarPH->front() != ResumePhi)
7492 ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->begin());
7493 }
7494
7495 // Create a ResumeForEpilogue for the canonical IV resume and its bypass value
7496 // as the first non-phi, to keep them alive for the epilogue.
7497 VPBuilder ResumeBuilder(MainScalarPH);
7498 ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue,
7499 Operands: {ResumePhi, ResumePhi->getOperand(N: 1)});
7500
7501 // Create ResumeForEpilogue instructions for the resume phis of the
7502 // VPIRPhis and their bypass values in the scalar header of the main plan and
7503 // return them so they can be used as resume values when vectorizing the
7504 // epilogue.
7505 return to_vector(
7506 Range: map_range(C: MainPlan.getScalarHeader()->phis(), F: [&](VPRecipeBase &R) {
7507 assert(isa<VPIRPhi>(R) &&
7508 "only VPIRPhis expected in the scalar header");
7509 VPValue *MainResumePhi = R.getOperand(N: 0);
7510 VPValue *Bypass = MainResumePhi->getDefiningRecipe()->getOperand(N: 1);
7511 return ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue,
7512 Operands: {MainResumePhi, Bypass});
7513 }));
7514}
7515
7516/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
7517/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
7518/// reductions require creating new instructions to compute the resume values.
7519/// They are collected in a vector and returned. They must be moved to the
7520/// preheader of the vector epilogue loop, after created by the execution of \p
7521/// Plan.
7522static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
7523 VPlan &MainPlan, VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
7524 EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
7525 VFSelectionContext &Config, ScalarEvolution &SE,
7526 ArrayRef<VPInstruction *> ResumeValues) {
7527 // Build a map from the scalar-header PHI to the ResumeForEpilogue markers
7528 // from the main plan.
7529 // TODO: Replace the IR PHI key.
7530 DenseMap<PHINode *, VPInstruction *> IRPhiToResumeForEpi;
7531 for (auto [HeaderPhi, ResumeForEpi] :
7532 zip_equal(t: MainPlan.getScalarHeader()->phis(), u&: ResumeValues))
7533 IRPhiToResumeForEpi[&cast<VPIRPhi>(Val&: HeaderPhi).getIRPhi()] = ResumeForEpi;
7534 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
7535 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
7536 Header->setName("vec.epilog.vector.body");
7537
7538 VPValue *IV = VectorLoop->getCanonicalIV();
7539 // When vectorizing the epilogue loop, the canonical induction needs to start
7540 // at the resume value from the main vector loop. Find the resume value
7541 // created during execution of the main VPlan. Add this resume value as an
7542 // offset to the canonical IV of the epilogue loop.
7543 using namespace llvm::PatternMatch;
7544 VPInstruction *ResumeForEpilogue =
7545 cast<VPInstruction>(Val: &*MainPlan.getScalarPreheader()->getFirstNonPhi());
7546 Value *EPResumeVal = ResumeForEpilogue->getUnderlyingValue();
7547 if (auto *ResumePhi = dyn_cast<PHINode>(Val: EPResumeVal)) {
7548 for (Value *Inc : ResumePhi->incoming_values()) {
7549 if (match(V: Inc, P: m_SpecificInt(V: 0)))
7550 continue;
7551 assert(!EPI.VectorTripCount &&
7552 "Must only have a single non-zero incoming value");
7553 EPI.VectorTripCount = Inc;
7554 }
7555 // If we didn't find a non-zero vector trip count, all incoming values
7556 // must be zero, which also means the vector trip count is zero.
7557 if (!EPI.VectorTripCount) {
7558 assert(ResumePhi->getNumIncomingValues() > 0 &&
7559 all_of(ResumePhi->incoming_values(), match_fn(m_SpecificInt(0))) &&
7560 "all incoming values must be 0");
7561 EPI.VectorTripCount = ResumePhi->getIncomingValue(i: 0);
7562 }
7563 } else {
7564 EPI.VectorTripCount = EPResumeVal;
7565 }
7566 VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
7567 assert(all_of(IV->users(),
7568 [](const VPUser *U) {
7569 if (isa<VPScalarIVStepsRecipe, VPDerivedIVRecipe>(U))
7570 return true;
7571 unsigned Opc = cast<VPInstruction>(U)->getOpcode();
7572 return Instruction::isCast(Opc) || Opc == Instruction::Add;
7573 }) &&
7574 "the canonical IV should only be used by its increment or "
7575 "ScalarIVSteps when resetting the start value");
7576 VPBuilder Builder(Header, Header->getFirstNonPhi());
7577 VPInstruction *Add = Builder.createAdd(LHS: IV, RHS: VPV);
7578 // Replace all users of the canonical IV and its increment with the offset
7579 // version, except for the Add itself and the canonical IV increment.
7580 auto *Increment = vputils::findCanonicalIVIncrement(Plan);
7581 assert(Increment && "Must have a canonical IV increment at this point");
7582 IV->replaceUsesWithIf(New: Add, ShouldReplace: [Add, Increment](VPUser &U, unsigned) {
7583 return &U != Add && &U != Increment;
7584 });
7585 VPInstruction *OffsetIVInc =
7586 VPBuilder::getToInsertAfter(R: Increment).createAdd(LHS: Increment, RHS: VPV);
7587 Increment->replaceAllUsesWith(New: OffsetIVInc);
7588 OffsetIVInc->setOperand(I: 0, New: Increment);
7589
7590 DenseMap<Value *, Value *> ToFrozen;
7591 SmallVector<Instruction *> InstsToMove;
7592 // Ensure that the start values for all header phi recipes are updated before
7593 // vectorizing the epilogue loop.
7594 for (VPRecipeBase &R : Header->phis()) {
7595 Value *ResumeV = nullptr;
7596 // TODO: Move setting of resume values to prepareToExecute.
7597 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
7598 // Find the reduction result by searching users of the phi or its backedge
7599 // value.
7600 auto IsReductionResult = [](VPRecipeBase *R) {
7601 auto *VPI = dyn_cast<VPInstruction>(Val: R);
7602 return VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult;
7603 };
7604 auto *RdxResult = cast<VPInstruction>(
7605 Val: vputils::findRecipe(Start: ReductionPhi->getBackedgeValue(), Pred: IsReductionResult));
7606 assert(RdxResult && "expected to find reduction result");
7607
7608 VPInstruction *ResumeForEpi = IRPhiToResumeForEpi.at(
7609 Val: cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr()));
7610 ResumeV = ResumeForEpi->getUnderlyingValue();
7611
7612 // Check for FindIV pattern by looking for icmp user of RdxResult.
7613 // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
7614 using namespace VPlanPatternMatch;
7615 VPValue *SentinelVPV = nullptr;
7616 bool IsFindIV = any_of(Range: RdxResult->users(), P: [&](VPUser *U) {
7617 return match(U, P: VPlanPatternMatch::m_SpecificICmp(
7618 MatchPred: ICmpInst::ICMP_NE, Op0: m_Specific(VPV: RdxResult),
7619 Op1: m_VPValue(V&: SentinelVPV)));
7620 });
7621
7622 RecurKind RK = ReductionPhi->getRecurrenceKind();
7623 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) || IsFindIV) {
7624 auto *ResumePhi = cast<PHINode>(Val: ResumeV);
7625 VPValue *BypassOp = ResumeForEpi->getOperand(N: 1);
7626 assert((isa<VPIRValue>(BypassOp) ||
7627 VPlanPatternMatch::match(
7628 BypassOp,
7629 m_VPInstruction<Instruction::Freeze>(m_VPValue()))) &&
7630 "expected live-in or Freeze");
7631 Value *StartV = BypassOp->getUnderlyingValue();
7632 IRBuilder<> Builder(ResumePhi->getParent(),
7633 ResumePhi->getParent()->getFirstNonPHIIt());
7634
7635 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) {
7636 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
7637 // start value; compare the final value from the main vector loop
7638 // to the start value.
7639 ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
7640 if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
7641 InstsToMove.push_back(Elt: I);
7642 } else {
7643 assert(SentinelVPV && "expected to find icmp using RdxResult");
7644 if (auto *FreezeI = dyn_cast<FreezeInst>(Val: StartV))
7645 ToFrozen[FreezeI->getOperand(i_nocapture: 0)] = StartV;
7646
7647 // Adjust resume: select(icmp eq ResumeV, StartV), Sentinel, ResumeV
7648 Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: StartV);
7649 if (auto *I = dyn_cast<Instruction>(Val: Cmp))
7650 InstsToMove.push_back(Elt: I);
7651 ResumeV = Builder.CreateSelect(C: Cmp, True: SentinelVPV->getLiveInIRValue(),
7652 False: ResumeV);
7653 if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
7654 InstsToMove.push_back(Elt: I);
7655 }
7656 } else {
7657 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
7658 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
7659 if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
7660 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
7661 "unexpected start value");
7662 // Partial sub-reductions always start at 0 and account for the
7663 // reduction start value in a final subtraction. Update it to use the
7664 // resume value from the main vector loop.
7665 if (PhiR->getVFScaleFactor() > 1 &&
7666 RecurrenceDescriptor::isSubRecurrenceKind(
7667 Kind: PhiR->getRecurrenceKind())) {
7668 auto *Sub = cast<VPInstruction>(Val: RdxResult->getSingleUser());
7669 assert((Sub->getOpcode() == Instruction::Sub ||
7670 Sub->getOpcode() == Instruction::FSub) &&
7671 "Unexpected opcode");
7672 assert(isa<VPIRValue>(Sub->getOperand(0)) &&
7673 "Expected operand to match the original start value of the "
7674 "reduction");
7675 // For integer sub-reductions, verify start value is zero.
7676 // For FP sub-reductions, verify start value is negative zero.
7677 [[maybe_unused]] auto StartValueIsIdentity = [&] {
7678 Value *IdentityValue = getRecurrenceIdentity(
7679 K: PhiR->getRecurrenceKind(), Tp: ResumeV->getType(),
7680 FMF: PhiR->getFastMathFlagsOrNone());
7681 auto *StartValue = dyn_cast<VPIRValue>(Val: VPI->getOperand(N: 0));
7682 return StartValue && StartValue->getValue() == IdentityValue;
7683 };
7684 assert(StartValueIsIdentity() &&
7685 "Expected start value for partial sub-reduction to be zero "
7686 "(or negative zero)");
7687
7688 Sub->setOperand(I: 0, New: StartVal);
7689 } else
7690 VPI->setOperand(I: 0, New: StartVal);
7691 continue;
7692 }
7693 }
7694 } else {
7695 // Retrieve the induction resume value via ResumeForEpilogue.
7696 PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
7697 ResumeV = IRPhiToResumeForEpi.at(Val: IndPhi)->getUnderlyingValue();
7698 }
7699 assert(ResumeV && "Must have a resume value");
7700 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
7701 cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
7702 }
7703
7704 // For some VPValues in the epilogue plan we must re-use the generated IR
7705 // values from the main plan. Replace them with live-in VPValues.
7706 // TODO: This is a workaround needed for epilogue vectorization and it
7707 // should be removed once induction resume value creation is done
7708 // directly in VPlan.
7709 for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
7710 // Re-use frozen values from the main plan for Freeze VPInstructions in the
7711 // epilogue plan. This ensures all users use the same frozen value.
7712 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
7713 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
7714 VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
7715 V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
7716 continue;
7717 }
7718
7719 // Re-use the trip count and steps expanded for the main loop, as
7720 // skeleton creation needs it as a value that dominates both the scalar
7721 // and vector epilogue loops
7722 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
7723 if (!ExpandR)
7724 continue;
7725 VPValue *ExpandedVal =
7726 Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
7727 ExpandR->replaceAllUsesWith(New: ExpandedVal);
7728 if (Plan.getTripCount() == ExpandR)
7729 Plan.resetTripCount(NewTripCount: ExpandedVal);
7730 ExpandR->eraseFromParent();
7731 }
7732
7733 auto VScale = Config.getVScaleForTuning();
7734 unsigned MainLoopStep =
7735 estimateElementCount(VF: EPI.MainLoopVF * EPI.MainLoopUF, VScale);
7736 unsigned EpilogueLoopStep =
7737 estimateElementCount(VF: EPI.EpilogueVF * EPI.EpilogueUF, VScale);
7738 RUN_VPLAN_PASS(
7739 VPlanTransforms::addMinimumVectorEpilogueIterationCheck, Plan,
7740 EPI.VectorTripCount, CM.requiresScalarEpilogue(EPI.EpilogueVF.isVector()),
7741 EPI.EpilogueVF, EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);
7742
7743 return InstsToMove;
7744}
7745
7746static void
7747fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
7748 VPlan &BestEpiPlan,
7749 ArrayRef<VPInstruction *> ResumeValues) {
7750 // Fix resume values from the additional bypass block.
7751 BasicBlock *PH = L->getLoopPreheader();
7752 for (auto *Pred : predecessors(BB: PH)) {
7753 for (PHINode &Phi : PH->phis()) {
7754 if (Phi.getBasicBlockIndex(BB: Pred) != -1)
7755 continue;
7756 Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
7757 }
7758 }
7759 auto *ScalarPH = cast<VPIRBasicBlock>(Val: BestEpiPlan.getScalarPreheader());
7760 if (ScalarPH->hasPredecessors()) {
7761 // Fix resume values for inductions and reductions from the additional
7762 // bypass block using the incoming values from the main loop's resume phis.
7763 // ResumeValues correspond 1:1 with the scalar loop header phis.
7764 for (auto [ResumeV, HeaderPhi] :
7765 zip(t&: ResumeValues, u: BestEpiPlan.getScalarHeader()->phis())) {
7766 auto *HeaderPhiR = cast<VPIRPhi>(Val: &HeaderPhi);
7767 auto *EpiResumePhi =
7768 cast<PHINode>(Val: HeaderPhiR->getIRPhi().getIncomingValueForBlock(BB: PH));
7769 if (EpiResumePhi->getBasicBlockIndex(BB: BypassBlock) == -1)
7770 continue;
7771 auto *MainResumePhi = cast<PHINode>(Val: ResumeV->getUnderlyingValue());
7772 EpiResumePhi->setIncomingValueForBlock(
7773 BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
7774 }
7775 }
7776}
7777
7778/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
7779/// loop, after both plans have executed, updating branches from the iteration
7780/// and runtime checks of the main loop, as well as updating various phis. \p
7781/// InstsToMove contains instructions that need to be moved to the preheader of
7782/// the epilogue vector loop.
7783static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
7784 EpilogueLoopVectorizationInfo &EPI,
7785 DominatorTree *DT,
7786 GeneratedRTChecks &Checks,
7787 ArrayRef<Instruction *> InstsToMove,
7788 ArrayRef<VPInstruction *> ResumeValues) {
7789 BasicBlock *VecEpilogueIterationCountCheck =
7790 cast<VPIRBasicBlock>(Val: EpiPlan.getEntry())->getIRBasicBlock();
7791
7792 BasicBlock *VecEpiloguePreHeader =
7793 cast<CondBrInst>(Val: VecEpilogueIterationCountCheck->getTerminator())
7794 ->getSuccessor(i: 1);
7795 // Adjust the control flow taking the state info from the main loop
7796 // vectorization into account.
7797 assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7798 "expected this to be saved from the previous pass.");
7799 DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
7800
7801 // Helper to redirect an edge from \p BB to \p VecEpilogueIterationCountCheck
7802 // to \p NewSucc instead, updating the DomTree.
7803 auto RedirectEdge = [&](BasicBlock *BB, BasicBlock *NewSucc) {
7804 BB->getTerminator()->replaceUsesOfWith(From: VecEpilogueIterationCountCheck,
7805 To: NewSucc);
7806 DTU.applyUpdates(
7807 Updates: {{DominatorTree::Delete, BB, VecEpilogueIterationCountCheck},
7808 {DominatorTree::Insert, BB, NewSucc}});
7809 };
7810
7811 RedirectEdge(EPI.MainLoopIterationCountCheck, VecEpiloguePreHeader);
7812
7813 BasicBlock *ScalarPH =
7814 cast<VPIRBasicBlock>(Val: EpiPlan.getScalarPreheader())->getIRBasicBlock();
7815 RedirectEdge(EPI.EpilogueIterationCountCheck, ScalarPH);
7816
7817 // Adjust the terminators of runtime check blocks and phis using them.
7818 BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
7819 BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
7820 if (SCEVCheckBlock)
7821 RedirectEdge(SCEVCheckBlock, ScalarPH);
7822 if (MemCheckBlock)
7823 RedirectEdge(MemCheckBlock, ScalarPH);
7824
7825 // The vec.epilog.iter.check block may contain Phi nodes from inductions
7826 // or reductions which merge control-flow from the latch block and the
7827 // middle block. Update the incoming values here and move the Phi into the
7828 // preheader.
7829 SmallVector<PHINode *, 4> PhisInBlock(
7830 llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));
7831
7832 for (PHINode *Phi : PhisInBlock) {
7833 Phi->moveBefore(InsertPos: VecEpiloguePreHeader->getFirstNonPHIIt());
7834 Phi->replaceIncomingBlockWith(
7835 Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
7836 New: VecEpilogueIterationCountCheck);
7837
7838 // If the phi doesn't have an incoming value from the
7839 // EpilogueIterationCountCheck, we are done. Otherwise remove the
7840 // incoming value and also those from other check blocks. This is needed
7841 // for reduction phis only.
7842 if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
7843 return EPI.EpilogueIterationCountCheck == IncB;
7844 }))
7845 continue;
7846 for (BasicBlock *BB :
7847 {EPI.EpilogueIterationCountCheck, SCEVCheckBlock, MemCheckBlock}) {
7848 if (BB)
7849 Phi->removeIncomingValue(BB);
7850 }
7851 }
7852
7853 auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
7854 for (auto *I : InstsToMove)
7855 I->moveBefore(InsertPos: IP);
7856
7857 // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
7858 // after executing the main loop. We need to update the resume values of
7859 // inductions and reductions during epilogue vectorization.
7860 fixScalarResumeValuesFromBypass(BypassBlock: VecEpilogueIterationCountCheck, L, BestEpiPlan&: EpiPlan,
7861 ResumeValues);
7862
7863 // Remove dead phis that were moved to the epilogue preheader but are unused
7864 // (e.g., resume phis for inductions not widened in the epilogue vector loop).
7865 for (PHINode &Phi : make_early_inc_range(Range: VecEpiloguePreHeader->phis()))
7866 if (Phi.use_empty())
7867 Phi.eraseFromParent();
7868}
7869
7870bool LoopVectorizePass::processLoop(Loop *L) {
7871 assert((EnableVPlanNativePath || L->isInnermost()) &&
7872 "VPlan-native path is not enabled. Only process inner loops.");
7873
7874 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
7875 << L->getHeader()->getParent()->getName() << "' from "
7876 << L->getLocStr() << "\n");
7877
7878 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
7879
7880 LLVM_DEBUG(
7881 dbgs() << "LV: Loop hints:"
7882 << " force="
7883 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7884 ? "disabled"
7885 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7886 ? "enabled"
7887 : "?"))
7888 << " width=" << Hints.getWidth()
7889 << " interleave=" << Hints.getInterleave() << "\n");
7890
7891 // Function containing loop
7892 Function *F = L->getHeader()->getParent();
7893
7894 // Looking at the diagnostic output is the only way to determine if a loop
7895 // was vectorized (other than looking at the IR or machine code), so it
7896 // is important to generate an optimization remark for each loop. Most of
7897 // these messages are generated as OptimizationRemarkAnalysis. Remarks
7898 // generated as OptimizationRemark and OptimizationRemarkMissed are
7899 // less verbose reporting vectorized loops and unvectorized loops that may
7900 // benefit from vectorization, respectively.
7901
7902 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7903 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7904 return false;
7905 }
7906
7907 PredicatedScalarEvolution PSE(*SE, *L);
7908
7909 // Query this against the original loop and save it here because the profile
7910 // of the original loop header may change as the transformation happens.
7911 bool OptForSize = llvm::shouldOptimizeForSize(
7912 BB: L->getHeader(), PSI,
7913 BFI: PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
7914 QueryType: PGSOQueryType::IRPass);
7915
7916 // Check if it is legal to vectorize the loop.
7917 LoopVectorizationRequirements Requirements;
7918 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
7919 &Requirements, &Hints, DB, AC,
7920 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
7921 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
7922 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7923 Hints.emitRemarkWithHints();
7924 return false;
7925 }
7926
7927 bool IsInnerLoop = L->isInnermost();
7928
7929 // Outer loops require a computable trip count.
7930 if (!IsInnerLoop && isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
7931 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
7932 return false;
7933 }
7934
7935 if (LVL.hasUncountableEarlyExit()) {
7936 if (!EnableEarlyExitVectorization) {
7937 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
7938 "early exit is not enabled",
7939 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
7940 return false;
7941 }
7942 if (LVL.hasUncountableExitWithSideEffects() &&
7943 !EnableEarlyExitVectorizationWithSideEffects) {
7944 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
7945 "early exit and side effects is not enabled",
7946 ORETag: "UncountableEarlyExitSideEffectLoopsDisabled",
7947 ORE, TheLoop: L);
7948 return false;
7949 }
7950 }
7951
7952 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI(), OptForSize);
7953 bool UseInterleaved =
7954 IsInnerLoop && TTI->enableInterleavedAccessVectorization();
7955
7956 // If an override option has been passed in for interleaved accesses, use it.
7957 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7958 UseInterleaved = IsInnerLoop && EnableInterleavedMemAccesses;
7959
7960 // Analyze interleaved memory accesses.
7961 if (UseInterleaved)
7962 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
7963
7964 if (LVL.hasUncountableEarlyExit()) {
7965 BasicBlock *LoopLatch = L->getLoopLatch();
7966 if (IAI.requiresScalarEpilogue() ||
7967 any_of(Range: LVL.getCountableExitingBlocks(), P: not_equal_to(Arg&: LoopLatch))) {
7968 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
7969 "requiring a scalar epilogue is unsupported",
7970 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
7971 return false;
7972 }
7973 }
7974
7975 // Check the function attributes and profiles to find out if this function
7976 // should be optimized for size.
7977 EpilogueLowering SEL =
7978 getEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, IAI: &IAI);
7979
7980 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7981 // count by optimizing for size, to minimize overheads.
7982 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
7983 if (ExpectedTC && ExpectedTC->isFixed() &&
7984 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
7985 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7986 << "This loop is worth vectorizing only if no scalar "
7987 << "iteration overheads are incurred.");
7988 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7989 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7990 else {
7991 LLVM_DEBUG(dbgs() << "\n");
7992 // Tail-folded loops are efficient even when the loop
7993 // iteration count is low. However, setting the epilogue policy to
7994 // `CM_EpilogueNotAllowedLowTripLoop` prevents vectorizing loops
7995 // with runtime checks. It's more effective to let
7996 // `isOutsideLoopWorkProfitable` determine if vectorization is
7997 // beneficial for the loop.
7998 if (SEL != CM_EpilogueNotNeededFoldTail)
7999 SEL = CM_EpilogueNotAllowedLowTripLoop;
8000 }
8001 }
8002
8003 // Check the function attributes to see if implicit floats or vectors are
8004 // allowed.
8005 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
8006 reportVectorizationFailure(
8007 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
8008 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
8009 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
8010 Hints.emitRemarkWithHints();
8011 return false;
8012 }
8013
8014 // Check if the target supports potentially unsafe FP vectorization.
8015 // FIXME: Add a check for the type of safety issue (denormal, signaling)
8016 // for the target we're vectorizing for, to make sure none of the
8017 // additional fp-math flags can help.
8018 if (Hints.isPotentiallyUnsafe() &&
8019 TTI->isFPVectorizationPotentiallyUnsafe()) {
8020 reportVectorizationFailure(
8021 DebugMsg: "Potentially unsafe FP op prevents vectorization",
8022 OREMsg: "loop not vectorized due to unsafe FP support.", ORETag: "UnsafeFP", ORE, TheLoop: L);
8023 Hints.emitRemarkWithHints();
8024 return false;
8025 }
8026
8027 bool AllowOrderedReductions;
8028 // If the flag is set, use that instead and override the TTI behaviour.
8029 if (ForceOrderedReductions.getNumOccurrences() > 0)
8030 AllowOrderedReductions = ForceOrderedReductions;
8031 else
8032 AllowOrderedReductions = TTI->enableOrderedReductions();
8033 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
8034 ORE->emit(RemarkBuilder: [&]() {
8035 auto *ExactFPMathInst = Requirements.getExactFPInst();
8036 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
8037 ExactFPMathInst->getDebugLoc(),
8038 ExactFPMathInst->getParent())
8039 << "loop not vectorized: cannot prove it is safe to reorder "
8040 "floating-point operations";
8041 });
8042 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
8043 "reorder floating-point operations\n");
8044 Hints.emitRemarkWithHints();
8045 return false;
8046 }
8047
8048 // Use the cost model.
8049 VFSelectionContext Config(*TTI, &LVL, L, *F, PSE, DB, ORE, &Hints,
8050 OptForSize);
8051 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, AC, ORE,
8052 GetBFI, F, &Hints, IAI, Config);
8053 // Use the planner for vectorization.
8054 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, Config, IAI, PSE,
8055 Hints, ORE);
8056
8057 EpilogueLowering EpilogueTailLoweringStatus =
8058 getEpilogueTailLowering(MainCM: CM, L, ORE);
8059 if (EpilogueTailLoweringStatus ==
8060 EpilogueLowering::CM_EpilogueNotNeededFoldTail) {
8061 // TODO: Apply tail-folding on the vectorized epilogue loop.
8062 LLVM_DEBUG(dbgs() << "LV: epilogue tail-folding is not supported yet\n");
8063 reportVectorizationInfo(
8064 Msg: "The epilogue-tail-folding policy prefer-fold-tail is not supported "
8065 "yet, fall back to a normal epilogue",
8066 ORETag: "UnsupportedEpilogueTailFoldingPolicy", ORE, TheLoop: L);
8067 }
8068
8069 // Get user vectorization factor and interleave count.
8070 ElementCount UserVF = Hints.getWidth();
8071 unsigned UserIC = Hints.getInterleave();
8072 // Outer loops don't have LoopAccessInfo, so skip the safety check and reset
8073 // UserIC (interleaving is not supported for outer loops).
8074 if (!IsInnerLoop)
8075 UserIC = 0;
8076 else if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
8077 UserIC = 1;
8078
8079 // Plan how to best vectorize.
8080 LVP.plan(UserVF, UserIC);
8081 auto [VF, BestPlanPtr] = LVP.computeBestVF();
8082 unsigned IC = 1;
8083
8084 // For VPlan build stress testing of outer loops, bail after plan
8085 // construction.
8086 if (!IsInnerLoop && VPlanBuildOuterloopStressTest)
8087 return false;
8088
8089 if (IsInnerLoop && ORE->allowExtraAnalysis(LV_NAME))
8090 LVP.emitInvalidCostRemarks(ORE);
8091
8092 assert((IsInnerLoop || !CM.maskPartialAliasing()) &&
8093 "Did not expect to alias-mask outer loop");
8094
8095 GeneratedRTChecks Checks(PSE, DT, LI, TTI, Config.CostKind,
8096 CM.maskPartialAliasing());
8097 if (IsInnerLoop && LVP.hasPlanWithVF(VF: VF.Width)) {
8098 // Select the interleave count.
8099 IC = LVP.selectInterleaveCount(Plan&: *BestPlanPtr, VF: VF.Width, LoopCost: VF.Cost);
8100
8101 unsigned SelectedIC = std::max(a: IC, b: UserIC);
8102 // Optimistically generate runtime checks if they are needed. Drop them if
8103 // they turn out to not be profitable.
8104 if (VF.Width.isVector() || SelectedIC > 1) {
8105 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC,
8106 ORE&: *ORE);
8107
8108 // Bail out early if either the SCEV or memory runtime checks are known to
8109 // fail. In that case, the vector loop would never execute.
8110 using namespace llvm::PatternMatch;
8111 if (Checks.getSCEVChecks().first &&
8112 match(V: Checks.getSCEVChecks().first, P: m_One()))
8113 return false;
8114 if (Checks.getMemRuntimeChecks().first &&
8115 match(V: Checks.getMemRuntimeChecks().first, P: m_One()))
8116 return false;
8117 }
8118
8119 // Check if it is profitable to vectorize with runtime checks.
8120 bool ForceVectorization =
8121 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
8122 VPCostContext CostCtx(CM.TTI, *CM.TLI, *BestPlanPtr, CM, Config.CostKind,
8123 CM.PSE, L);
8124 if (!ForceVectorization &&
8125 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx, Plan&: *BestPlanPtr,
8126 SEL, VScale: Config.getVScaleForTuning())) {
8127 ORE->emit(RemarkBuilder: [&]() {
8128 return OptimizationRemarkAnalysisAliasing(
8129 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
8130 L->getHeader())
8131 << "loop not vectorized: cannot prove it is safe to reorder "
8132 "memory operations";
8133 });
8134 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8135 Hints.emitRemarkWithHints();
8136 return false;
8137 }
8138 }
8139
8140 // Identify the diagnostic messages that should be produced.
8141 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
8142 bool VectorizeLoop = true, InterleaveLoop = true;
8143 if (VF.Width.isScalar()) {
8144 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
8145 VecDiagMsg = {
8146 "VectorizationNotBeneficial",
8147 "the cost-model indicates that vectorization is not beneficial"};
8148 VectorizeLoop = false;
8149 }
8150
8151 if (UserIC == 1 && Hints.getInterleave() > 1) {
8152 assert(!LVL.isSafeForAnyVectorWidth() &&
8153 "UserIC should only be ignored due to unsafe dependencies");
8154 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
8155 IntDiagMsg = {"InterleavingUnsafe",
8156 "Ignoring user-specified interleave count due to possibly "
8157 "unsafe dependencies in the loop."};
8158 InterleaveLoop = false;
8159 } else if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
8160 // Tell the user interleaving was avoided up-front, despite being explicitly
8161 // requested.
8162 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
8163 "interleaving should be avoided up front\n");
8164 IntDiagMsg = {"InterleavingAvoided",
8165 "Ignoring UserIC, because interleaving was avoided up front"};
8166 InterleaveLoop = false;
8167 } else if (IC == 1 && UserIC <= 1) {
8168 // Tell the user interleaving is not beneficial.
8169 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
8170 IntDiagMsg = {
8171 "InterleavingNotBeneficial",
8172 "the cost-model indicates that interleaving is not beneficial"};
8173 InterleaveLoop = false;
8174 if (UserIC == 1) {
8175 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
8176 IntDiagMsg.second +=
8177 " and is explicitly disabled or interleave count is set to 1";
8178 }
8179 } else if (IC > 1 && UserIC == 1) {
8180 // Tell the user interleaving is beneficial, but it explicitly disabled.
8181 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
8182 "disabled.\n");
8183 IntDiagMsg = {"InterleavingBeneficialButDisabled",
8184 "the cost-model indicates that interleaving is beneficial "
8185 "but is explicitly disabled or interleave count is set to 1"};
8186 InterleaveLoop = false;
8187 }
8188
8189 // If there is a histogram in the loop, do not just interleave without
8190 // vectorizing. The order of operations will be incorrect without the
8191 // histogram intrinsics, which are only used for recipes with VF > 1.
8192 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
8193 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
8194 << "to histogram operations.\n");
8195 IntDiagMsg = {
8196 "HistogramPreventsScalarInterleaving",
8197 "Unable to interleave without vectorization due to constraints on "
8198 "the order of histogram operations"};
8199 InterleaveLoop = false;
8200 }
8201
8202 // Override IC if user provided an interleave count.
8203 IC = UserIC > 0 ? UserIC : IC;
8204
8205 if (CM.maskPartialAliasing()) {
8206 LLVM_DEBUG(
8207 dbgs()
8208 << "LV: Not interleaving due to partial aliasing vectorization.\n");
8209 IntDiagMsg = {
8210 "PartialAliasingVectorization",
8211 "Unable to interleave due to partial aliasing vectorization."};
8212 InterleaveLoop = false;
8213 IC = 1;
8214 }
8215
8216 // FIXME: Enable interleaving for EE-with-side-effects.
8217 if (InterleaveLoop && LVL.hasUncountableExitWithSideEffects()) {
8218 LLVM_DEBUG(dbgs() << "LV: Not interleaving due to EE with side effects.\n");
8219 IntDiagMsg = {"EEWithSideEffectsPreventsInterleaving",
8220 "Unable to interleave due to early exit with side effects."};
8221 InterleaveLoop = false;
8222 IC = 1;
8223 }
8224
8225 // Emit diagnostic messages, if any.
8226 if (!VectorizeLoop && !InterleaveLoop) {
8227 // Do not vectorize or interleaving the loop.
8228 ORE->emit(RemarkBuilder: [&]() {
8229 return OptimizationRemarkMissed(LV_NAME, VecDiagMsg.first,
8230 L->getStartLoc(), L->getHeader())
8231 << VecDiagMsg.second;
8232 });
8233 ORE->emit(RemarkBuilder: [&]() {
8234 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
8235 L->getStartLoc(), L->getHeader())
8236 << IntDiagMsg.second;
8237 });
8238 return false;
8239 }
8240
8241 if (!VectorizeLoop && InterleaveLoop) {
8242 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8243 ORE->emit(RemarkBuilder: [&]() {
8244 return OptimizationRemarkAnalysis(LV_NAME, VecDiagMsg.first,
8245 L->getStartLoc(), L->getHeader())
8246 << VecDiagMsg.second;
8247 });
8248 } else if (VectorizeLoop && !InterleaveLoop) {
8249 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8250 << ") in " << L->getLocStr() << '\n');
8251 ORE->emit(RemarkBuilder: [&]() {
8252 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
8253 L->getStartLoc(), L->getHeader())
8254 << IntDiagMsg.second;
8255 });
8256 } else if (VectorizeLoop && InterleaveLoop) {
8257 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
8258 << ") in " << L->getLocStr() << '\n');
8259 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
8260 }
8261
8262 // Report the vectorization decision.
8263 if (VF.Width.isScalar()) {
8264 using namespace ore;
8265 assert(IC > 1);
8266 ORE->emit(RemarkBuilder: [&]() {
8267 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
8268 L->getHeader())
8269 << "interleaved loop (interleaved count: "
8270 << NV("InterleaveCount", IC) << ")";
8271 });
8272 } else {
8273 // Report the vectorization decision.
8274 reportVectorization(ORE, TheLoop: L, VFWidth: VF.Width, IC);
8275 }
8276 if (ORE->allowExtraAnalysis(LV_NAME))
8277 checkMixedPrecision(L, ORE);
8278
8279 // If we decided that it is *legal* to interleave or vectorize the loop, then
8280 // do it.
8281
8282 VPlan &BestPlan = *BestPlanPtr;
8283 // Consider vectorizing the epilogue too if it's profitable.
8284 std::unique_ptr<VPlan> EpiPlan =
8285 LVP.selectBestEpiloguePlan(MainPlan&: BestPlan, MainLoopVF: VF.Width, IC);
8286 bool HasBranchWeights =
8287 hasBranchWeightMD(I: *L->getLoopLatch()->getTerminator());
8288 if (EpiPlan) {
8289 VPlan &BestEpiPlan = *EpiPlan;
8290 VPlan &BestMainPlan = BestPlan;
8291 ElementCount EpilogueVF = BestEpiPlan.getSingleVF();
8292
8293 // The first pass vectorizes the main loop and creates a scalar epilogue
8294 // to be vectorized by executing the plan (potentially with a different
8295 // factor) again shortly afterwards.
8296 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
8297 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
8298 SmallVector<VPInstruction *> ResumeValues =
8299 preparePlanForMainVectorLoop(MainPlan&: BestMainPlan, EpiPlan&: BestEpiPlan);
8300 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF, 1, BestEpiPlan);
8301
8302 // Add minimum iteration check for the epilogue plan, followed by runtime
8303 // checks for the main plan.
8304 LVP.addMinimumIterationCheck(Plan&: BestMainPlan, VF: EPI.EpilogueVF, UF: EPI.EpilogueUF,
8305 MinProfitableTripCount: ElementCount::getFixed(MinVal: 0));
8306 LVP.attachRuntimeChecks(Plan&: BestMainPlan, RTChecks&: Checks, HasBranchWeights);
8307 RUN_VPLAN_PASS(VPlanTransforms::addIterationCountCheckBlock, BestMainPlan,
8308 EPI.MainLoopVF, EPI.MainLoopUF,
8309 CM.requiresScalarEpilogue(EPI.MainLoopVF.isVector()), L,
8310 HasBranchWeights ? MinItersBypassWeights : nullptr,
8311 L->getLoopPredecessor()->getTerminator()->getDebugLoc(),
8312 PSE);
8313
8314 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
8315 Checks, BestMainPlan);
8316 auto ExpandedSCEVs = LVP.executePlan(
8317 BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF, BestVPlan&: BestMainPlan, ILV&: MainILV, DT,
8318 EpilogueVecKind: LoopVectorizationPlanner::EpilogueVectorizationKind::MainLoop);
8319 ++LoopsVectorized;
8320
8321 // Derive EPI fields from VPlan-generated IR.
8322 BasicBlock *EntryBB =
8323 cast<VPIRBasicBlock>(Val: BestMainPlan.getEntry())->getIRBasicBlock();
8324 EntryBB->setName("iter.check");
8325 EPI.EpilogueIterationCountCheck = EntryBB;
8326 // The check chain is: Entry -> [SCEV] -> [Mem] -> MainCheck -> VecPH.
8327 // MainCheck is the non-bypass successor of the last runtime check block
8328 // (or Entry if there are no runtime checks).
8329 BasicBlock *LastCheck = EntryBB;
8330 if (BasicBlock *MemBB = Checks.getMemRuntimeChecks().second)
8331 LastCheck = MemBB;
8332 else if (BasicBlock *SCEVBB = Checks.getSCEVChecks().second)
8333 LastCheck = SCEVBB;
8334 BasicBlock *ScalarPH = L->getLoopPreheader();
8335 auto *BI = cast<CondBrInst>(Val: LastCheck->getTerminator());
8336 EPI.MainLoopIterationCountCheck =
8337 BI->getSuccessor(i: BI->getSuccessor(i: 0) == ScalarPH);
8338
8339 // Second pass vectorizes the epilogue and adjusts the control flow
8340 // edges from the first pass.
8341 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
8342 Checks, BestEpiPlan);
8343 SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
8344 MainPlan&: BestMainPlan, Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI, CM, Config,
8345 SE&: *PSE.getSE(), ResumeValues);
8346 LVP.attachRuntimeChecks(Plan&: BestEpiPlan, RTChecks&: Checks, HasBranchWeights);
8347 LVP.executePlan(
8348 BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, DT,
8349 EpilogueVecKind: LoopVectorizationPlanner::EpilogueVectorizationKind::Epilogue);
8350 connectEpilogueVectorLoop(EpiPlan&: BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
8351 ResumeValues);
8352 ++LoopsEpilogueVectorized;
8353 } else {
8354 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
8355 BestPlan);
8356 LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, UF: IC,
8357 MinProfitableTripCount: VF.MinProfitableTripCount);
8358 LVP.attachRuntimeChecks(Plan&: BestPlan, RTChecks&: Checks, HasBranchWeights);
8359
8360 if (!IsInnerLoop)
8361 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << F->getName()
8362 << "\"\n");
8363 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT);
8364 ++LoopsVectorized;
8365 }
8366
8367 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
8368 "DT not preserved correctly");
8369 assert(!verifyFunction(*F, &dbgs()));
8370
8371 return true;
8372}
8373
8374LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
8375
8376 // Don't attempt if
8377 // 1. the target claims to have no vector registers, and
8378 // 2. interleaving won't help ILP.
8379 //
8380 // The second condition is necessary because, even if the target has no
8381 // vector registers, loop vectorization may still enable scalar
8382 // interleaving.
8383 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
8384 (TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1), HasUnorderedReductions: false) < 2 ||
8385 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1), HasUnorderedReductions: true) < 2))
8386 return LoopVectorizeResult(false, false);
8387
8388 bool Changed = false, CFGChanged = false;
8389
8390 // The vectorizer requires loops to be in simplified form.
8391 // Since simplification may add new inner loops, it has to run before the
8392 // legality and profitability checks. This means running the loop vectorizer
8393 // will simplify all loops, regardless of whether anything end up being
8394 // vectorized.
8395 for (const auto &L : *LI)
8396 Changed |= CFGChanged |=
8397 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
8398
8399 // Build up a worklist of inner-loops to vectorize. This is necessary as
8400 // the act of vectorizing or partially unrolling a loop creates new loops
8401 // and can invalidate iterators across the loops.
8402 SmallVector<Loop *, 8> Worklist;
8403
8404 for (Loop *L : *LI)
8405 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
8406
8407 LoopsAnalyzed += Worklist.size();
8408
8409 // Now walk the identified inner loops.
8410 while (!Worklist.empty()) {
8411 Loop *L = Worklist.pop_back_val();
8412
8413 // For the inner loops we actually process, form LCSSA to simplify the
8414 // transform.
8415 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
8416
8417 Changed |= CFGChanged |= processLoop(L);
8418
8419 if (Changed) {
8420 LAIs->clear();
8421
8422#ifndef NDEBUG
8423 if (VerifySCEV)
8424 SE->verify();
8425#endif
8426 }
8427 }
8428
8429 // Process each loop nest in the function.
8430 return LoopVectorizeResult(Changed, CFGChanged);
8431}
8432
8433PreservedAnalyses LoopVectorizePass::run(Function &F,
8434 FunctionAnalysisManager &AM) {
8435 LI = &AM.getResult<LoopAnalysis>(IR&: F);
8436 // There are no loops in the function. Return before computing other
8437 // expensive analyses.
8438 if (LI->empty())
8439 return PreservedAnalyses::all();
8440 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
8441 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
8442 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
8443 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
8444 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
8445 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
8446 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
8447 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
8448 AA = &AM.getResult<AAManager>(IR&: F);
8449
8450 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
8451 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
8452 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
8453 return AM.getResult<BlockFrequencyAnalysis>(IR&: F);
8454 };
8455 LoopVectorizeResult Result = runImpl(F);
8456 if (!Result.MadeAnyChange)
8457 return PreservedAnalyses::all();
8458 PreservedAnalyses PA;
8459
8460 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
8461 for (auto &BB : F)
8462 RemoveRedundantDbgInstrs(BB: &BB);
8463 }
8464
8465 PA.preserve<LoopAnalysis>();
8466 PA.preserve<DominatorTreeAnalysis>();
8467 PA.preserve<ScalarEvolutionAnalysis>();
8468 PA.preserve<LoopAccessAnalysis>();
8469
8470 if (Result.MadeCFGChange) {
8471 // Making CFG changes likely means a loop got vectorized. Indicate that
8472 // extra simplification passes should be run.
8473 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
8474 // be run if runtime checks have been added.
8475 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
8476 PA.preserve<ShouldRunExtraVectorPasses>();
8477 } else {
8478 PA.preserveSet<CFGAnalyses>();
8479 }
8480 return PA;
8481}
8482
8483void LoopVectorizePass::printPipeline(
8484 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
8485 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
8486 OS, MapClassName2PassName);
8487
8488 OS << '<';
8489 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
8490 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
8491 OS << '>';
8492}
8493