1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
18// This pass has three parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
97#include "llvm/Analysis/TargetLibraryInfo.h"
98#include "llvm/Analysis/TargetTransformInfo.h"
99#include "llvm/Analysis/ValueTracking.h"
100#include "llvm/Analysis/VectorUtils.h"
101#include "llvm/IR/Attributes.h"
102#include "llvm/IR/BasicBlock.h"
103#include "llvm/IR/CFG.h"
104#include "llvm/IR/Constant.h"
105#include "llvm/IR/Constants.h"
106#include "llvm/IR/DataLayout.h"
107#include "llvm/IR/DebugInfo.h"
108#include "llvm/IR/DebugLoc.h"
109#include "llvm/IR/DerivedTypes.h"
110#include "llvm/IR/DiagnosticInfo.h"
111#include "llvm/IR/Dominators.h"
112#include "llvm/IR/Function.h"
113#include "llvm/IR/IRBuilder.h"
114#include "llvm/IR/InstrTypes.h"
115#include "llvm/IR/Instruction.h"
116#include "llvm/IR/Instructions.h"
117#include "llvm/IR/IntrinsicInst.h"
118#include "llvm/IR/Intrinsics.h"
119#include "llvm/IR/MDBuilder.h"
120#include "llvm/IR/Metadata.h"
121#include "llvm/IR/Module.h"
122#include "llvm/IR/Operator.h"
123#include "llvm/IR/PatternMatch.h"
124#include "llvm/IR/ProfDataUtils.h"
125#include "llvm/IR/Type.h"
126#include "llvm/IR/Use.h"
127#include "llvm/IR/User.h"
128#include "llvm/IR/Value.h"
129#include "llvm/IR/Verifier.h"
130#include "llvm/Support/Casting.h"
131#include "llvm/Support/CommandLine.h"
132#include "llvm/Support/Debug.h"
133#include "llvm/Support/ErrorHandling.h"
134#include "llvm/Support/InstructionCost.h"
135#include "llvm/Support/MathExtras.h"
136#include "llvm/Support/NativeFormatting.h"
137#include "llvm/Support/raw_ostream.h"
138#include "llvm/Transforms/Utils/BasicBlockUtils.h"
139#include "llvm/Transforms/Utils/InjectTLIMappings.h"
140#include "llvm/Transforms/Utils/Local.h"
141#include "llvm/Transforms/Utils/LoopSimplify.h"
142#include "llvm/Transforms/Utils/LoopUtils.h"
143#include "llvm/Transforms/Utils/LoopVersioning.h"
144#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
145#include "llvm/Transforms/Utils/SizeOpts.h"
146#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
147#include <algorithm>
148#include <cassert>
149#include <cmath>
150#include <cstdint>
151#include <functional>
152#include <iterator>
153#include <limits>
154#include <memory>
155#include <string>
156#include <tuple>
157#include <utility>
158
using namespace llvm;
using namespace SCEVPatternMatch;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

// Statistics reported via -stats.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");

/// Controls generation of a vectorized epilogue loop to handle the iterations
/// left over after the main vector loop.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// Testing knob: any value greater than 1 forces that VF for all applicable
/// epilogue loops.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Minimum main-loop VF for which an epilogue is considered. Note: no explicit
/// cl::init, so the default is value-initialized (0).
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// Cap on how many runtime memory checks may be emitted for a single loop.
static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

/// Note: This currently only applies to `llvm.masked.load` and
/// `llvm.masked.store`. TODO: Extend this to cover other operations as needed.
static cl::opt<bool> ForceTargetSupportsMaskedMemoryOps(
    "force-target-supports-masked-memory-ops", cl::init(Val: false), cl::Hidden,
    cl::desc("Assume the target supports masked memory operations (used for "
             "testing)."));
207
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(Val: PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));

/// Testing/tuning knob that overrides the tail-folding style; the default,
/// None, leaves the choice to the cost model.
static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
    "force-tail-folding-style", cl::desc("Force the tail folding style"),
    cl::init(Val: TailFoldingStyle::None),
    cl::values(
        clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
        clEnumValN(
            TailFoldingStyle::Data, "data",
            "Create lane mask for data only, using active.lane.mask intrinsic"),
        clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
                   "data-without-lane-mask",
                   "Create lane mask with compare/stepvector"),
        clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
                   "Create lane mask using active.lane.mask intrinsic, and use "
                   "it for both data and control flow"),
        clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
                   "Use predicated EVL instructions for tail folding. If EVL "
                   "is unsupported, fallback to data-without-lane-mask.")));

/// Defined with external linkage in namespace llvm (declared in a header) so
/// other components can query it.
cl::opt<bool> llvm::EnableWideActiveLaneMask(
    "enable-wide-lane-mask", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable use of wide lane masks when used for control flow in "
             "tail-folded loops"));
261
/// When true, consider wider VFs bounded by the smallest element type in the
/// loop rather than stopping at the target's preferred width.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

// The following "force-target-*" options override values normally queried from
// TargetTransformInfo; they exist for testing.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

cl::opt<unsigned> llvm::ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Threshold below which a loop body's cost counts as "small" for the
/// interleaving heuristics.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(Val: 20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// The number of stores in a loop that are allowed to need predication.
cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
357
/// Master switch for the VPlan-native vectorization path, which supports outer
/// loop vectorization (see the file header comment).
cl::opt<bool> llvm::EnableVPlanNativePath(
    "enable-vplan-native-path", cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

/// Run the VPlan verifier after each VPlan transform; enabled by default only
/// in EXPENSIVE_CHECKS builds.
cl::opt<bool>
    llvm::VerifyEachVPlan("vplan-verify-each",
#ifdef EXPENSIVE_CHECKS
                          cl::init(true),
#else
                          cl::init(Val: false),
#endif
                          cl::Hidden,
                          cl::desc("Verify VPlans after VPlan transforms."));

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// VPlan debug-printing knobs; only available in builds with dump support.
cl::opt<bool> llvm::VPlanPrintAfterAll(
    "vplan-print-after-all", cl::init(false), cl::Hidden,
    cl::desc("Print VPlans after all VPlan transformations."));

cl::list<std::string> llvm::VPlanPrintAfterPasses(
    "vplan-print-after", cl::Hidden,
    cl::desc("Print VPlans after specified VPlan transformations (regexp)."));

cl::opt<bool> llvm::VPlanPrintVectorRegionScope(
    "vplan-print-vector-region-scope", cl::init(false), cl::Hidden,
    cl::desc("Limit VPlan printing to vector loop region in "
             "`-vplan-print-after*` if the plan has one."));
#endif

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(Val: true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(Val: true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// Tri-state (true/false/unset) override for widening div/rem via a safe
/// divisor; when unset, the cost-based decision is kept.
static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
    "force-widen-divrem-via-safe-divisor", cl::Hidden,
    cl::desc(
        "Override cost based safe divisor widening for div/rem instructions"));

static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
    "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
    cl::Hidden,
    cl::desc("Try wider VFs if they enable the use of vector variants"));

static cl::opt<bool> EnableEarlyExitVectorization(
    "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
    cl::desc(
        "Enable vectorization of early exit loops with uncountable exits."));

static cl::opt<bool> ConsiderRegPressure(
    "vectorizer-consider-reg-pressure", cl::init(Val: false), cl::Hidden,
    cl::desc("Discard VFs if their register pressure is too high."));

// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
428
429/// A helper function that returns true if the given type is irregular. The
430/// type is irregular if its allocated size doesn't equal the store size of an
431/// element of the corresponding vector type.
432static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
433 // Determine if an array of N elements of type Ty is "bitcast compatible"
434 // with a <N x Ty> vector.
435 // This is only true if there is no padding between the array elements.
436 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
437}
438
/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
/// ElementCount to include loops whose trip count is a function of vscale.
/// Returns ElementCount::getFixed(0) when no small constant trip count (fixed
/// or vscale-based) can be determined.
static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
                                              const Loop *L) {
  // Fixed constant trip count known to SCEV.
  if (unsigned ExpectedTC = SE->getSmallConstantTripCount(L))
    return ElementCount::getFixed(MinVal: ExpectedTC);

  // No fixed constant; check whether the trip count is a small multiple of
  // vscale instead.
  const SCEV *BTC = SE->getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(Val: BTC))
    return ElementCount::getFixed(MinVal: 0);

  const SCEV *ExitCount = SE->getTripCountFromExitCount(ExitCount: BTC, EvalTy: BTC->getType(), L);
  // Trip count is exactly vscale.
  if (isa<SCEVVScale>(Val: ExitCount))
    return ElementCount::getScalable(MinVal: 1);

  // Trip count is (Scale * vscale). Require no unsigned wrap and a scale that
  // fits in 32 bits so the scalable ElementCount's minimum value is exact.
  const APInt *Scale;
  if (match(S: ExitCount, P: m_scev_Mul(Op0: m_scev_APInt(C&: Scale), Op1: m_SCEVVScale())))
    if (cast<SCEVMulExpr>(Val: ExitCount)->hasNoUnsignedWrap())
      if (Scale->getActiveBits() <= 32)
        return ElementCount::getScalable(MinVal: Scale->getZExtValue());

  return ElementCount::getFixed(MinVal: 0);
}
462
463/// Returns "best known" trip count, which is either a valid positive trip count
464/// or std::nullopt when an estimate cannot be made (including when the trip
465/// count would overflow), for the specified loop \p L as defined by the
466/// following procedure:
467/// 1) Returns exact trip count if it is known.
468/// 2) Returns expected trip count according to profile data if any.
469/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
470/// 4) Returns std::nullopt if all of the above failed.
471static std::optional<ElementCount>
472getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
473 bool CanUseConstantMax = true) {
474 // Check if exact trip count is known.
475 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
476 return ExpectedTC;
477
478 // Check if there is an expected trip count available from profile data.
479 if (LoopVectorizeWithBlockFrequency)
480 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
481 return ElementCount::getFixed(MinVal: *EstimatedTC);
482
483 if (!CanUseConstantMax)
484 return std::nullopt;
485
486 // Check if upper bound estimate is known.
487 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
488 return ElementCount::getFixed(MinVal: ExpectedTC);
489
490 return std::nullopt;
491}
492
namespace {
// Forward declare GeneratedRTChecks (referenced by InnerLoopVectorizer below;
// the definition appears later in this file).
class GeneratedRTChecks;

// Convenience alias: map from SCEV expressions to IR values.
using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
} // namespace
499
namespace llvm {

/// Analysis key uniquely identifying the ShouldRunExtraVectorPasses marker.
AnalysisKey ShouldRunExtraVectorPasses::Key;
503
/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found to a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      ElementCount VecWidth, unsigned UnrollFactor,
                      LoopVectorizationCostModel *CM,
                      GeneratedRTChecks &RTChecks, VPlan &Plan)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TTI(TTI), AC(AC),
        VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()),
        Cost(CM), RTChecks(RTChecks), Plan(Plan),
        VectorPHVPBB(cast<VPBasicBlock>(
            Val: Plan.getVectorLoopRegion()->getSinglePredecessor())) {}

  virtual ~InnerLoopVectorizer() = default;

  /// Creates a basic block for the scalar preheader. Both
  /// EpilogueVectorizerMainLoop and EpilogueVectorizerEpilogueLoop overwrite
  /// the method to create additional blocks and checks needed for epilogue
  /// vectorization.
  virtual BasicBlock *createVectorizedLoopSkeleton();

  /// Fix the vectorized code, taking care of header phi's, and more.
  void fixVectorizedLoop(VPTransformState &State);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPTransformState &State);

  /// Returns the original loop trip count.
  Value *getTripCount() const { return TripCount; }

  /// Used to set the trip count after ILV's construction and after the
  /// preheader block has been executed. Note that this always holds the trip
  /// count of the original loop for both main loop and epilogue vectorization.
  void setTripCount(Value *TC) { TripCount = TC; }

protected:
  friend class LoopVectorizationPlanner;

  /// Create and return a new IR basic block for the scalar preheader whose name
  /// is prefixed with \p Prefix.
  BasicBlock *createScalarPreheader(StringRef Prefix);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// IR builder used during code generation, seeded with the context of the
  /// loop's ScalarEvolution.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// The VPlan for the loop being vectorized.
  VPlan &Plan;

  /// The vector preheader block of \p Plan, used as target for check blocks
  /// introduced during skeleton creation.
  VPBasicBlock *VectorPHVPBB;
};
615
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  // VF and UF selected for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
  unsigned MainLoopUF = 0;
  // VF and UF selected for the epilogue vector loop.
  ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
  unsigned EpilogueUF = 0;
  // Iteration-count check blocks shared between the two stages (see
  // EpilogueVectorizerMainLoop::emitIterationCountCheck).
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  // Original-loop and vector trip counts, carried from the first stage to the
  // second.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;
  // The VPlan used for the epilogue loop.
  VPlan &EpiloguePlan;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF,
                                VPlan &EpiloguePlan)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
        EpiloguePlan(EpiloguePlan) {
    // Interleaving the short residual loop is not expected to pay off.
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
639
/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, firstly to setup the
/// skeleton and vectorize the main loop, and secondly to complete the skeleton
/// from the first step and vectorize the epilogue. This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetTransformInfo *TTI, AssumptionCache *AC,
      EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel *CM,
      GeneratedRTChecks &Checks, VPlan &Plan, ElementCount VecWidth,
      ElementCount MinProfitableTripCount, unsigned UnrollFactor)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, VecWidth,
                            UnrollFactor, CM, Checks, Plan),
        EPI(EPI), MinProfitableTripCount(MinProfitableTripCount) {}

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;

protected:
  /// Minimum trip count for which executing the vector loop is considered
  /// profitable; presumably consumed when emitting the iteration-count checks
  /// in the derived classes — confirm there.
  ElementCount MinProfitableTripCount;
};
670
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                             LoopInfo *LI, DominatorTree *DT,
                             const TargetTransformInfo *TTI,
                             AssumptionCache *AC,
                             EpilogueLoopVectorizationInfo &EPI,
                             LoopVectorizationCostModel *CM,
                             GeneratedRTChecks &Check, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                       Check, Plan, EPI.MainLoopVF,
                                       EPI.MainLoopVF, EPI.MainLoopUF) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of VPlan execution).
  BasicBlock *createVectorizedLoopSkeleton() final;

protected:
  /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
  /// vector preheader and its predecessor, also connecting the new block to the
  /// scalar preheader.
  void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);

  /// Create a check to see if the main vector loop should be executed, given
  /// the vectorization factor \p VF and unroll factor \p UF.
  Value *createIterationCountCheck(BasicBlock *VectorPH, ElementCount VF,
                                   unsigned UF) const;

  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *VectorPH, BasicBlock *Bypass,
                                      bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
708
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                                 LoopInfo *LI, DominatorTree *DT,
                                 const TargetTransformInfo *TTI,
                                 AssumptionCache *AC,
                                 EpilogueLoopVectorizationInfo &EPI,
                                 LoopVectorizationCostModel *CM,
                                 GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TTI, AC, EPI, CM,
                                       Checks, Plan, EPI.EpilogueVF,
                                       EPI.EpilogueVF, EPI.EpilogueUF) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
  BasicBlock *createVectorizedLoopSkeleton() final;

protected:
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
732} // end namespace llvm
733
734/// Look for a meaningful debug location on the instruction or its operands.
735static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
736 if (!I)
737 return DebugLoc::getUnknown();
738
739 DebugLoc Empty;
740 if (I->getDebugLoc() != Empty)
741 return I->getDebugLoc();
742
743 for (Use &Op : I->operands()) {
744 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
745 if (OpInst->getDebugLoc() != Empty)
746 return OpInst->getDebugLoc();
747 }
748
749 return I->getDebugLoc();
750}
751
752/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
753/// is passed, the message relates to that particular instruction.
754#ifndef NDEBUG
755static void debugVectorizationMessage(const StringRef Prefix,
756 const StringRef DebugMsg,
757 Instruction *I) {
758 dbgs() << "LV: " << Prefix << DebugMsg;
759 if (I != nullptr)
760 dbgs() << " " << *I;
761 else
762 dbgs() << '.';
763 dbgs() << '\n';
764}
765#endif
766
767/// Create an analysis remark that explains why vectorization failed
768///
769/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
770/// RemarkName is the identifier for the remark. If \p I is passed it is an
771/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
772/// the location of the remark. If \p DL is passed, use it as debug location for
773/// the remark. \return the remark object that can be streamed to.
774static OptimizationRemarkAnalysis
775createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
776 Instruction *I, DebugLoc DL = {}) {
777 BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
778 // If debug location is attached to the instruction, use it. Otherwise if DL
779 // was not provided, use the loop's.
780 if (I && I->getDebugLoc())
781 DL = I->getDebugLoc();
782 else if (!DL)
783 DL = TheLoop->getStartLoc();
784
785 return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
786}
787
788namespace llvm {
789
/// Return a value for Step multiplied by VF. \p Ty must be an integer type
/// wide enough to hold the result.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  ElementCount VFxStep = VF.multiplyCoefficientBy(RHS: Step);
  assert(isPowerOf2_64(VF.getKnownMinValue()) && "must pass power-of-2 VF");
  if (VF.isScalable() && isPowerOf2_64(Value: Step)) {
    // VF and Step are both powers of two here, so VF * Step can be computed
    // as vscale << log2(MinVF * Step). The shift is marked no-unsigned-wrap.
    return B.CreateShl(
        LHS: B.CreateVScale(Ty),
        RHS: ConstantInt::get(Ty, V: Log2_64(Value: VFxStep.getKnownMinValue())), Name: "", HasNUW: true);
  }
  // Fixed VF (or a non-power-of-two step): emit the element count directly.
  return B.CreateElementCount(Ty, EC: VFxStep);
}
803
/// Return the runtime value for VF (a constant for fixed VFs, a
/// vscale-based value for scalable VFs).
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  return B.CreateElementCount(Ty, EC: VF);
}

/// Report a vectorization failure: print \p DebugMsg to the debug stream and
/// emit an analysis remark with tag \p ORETag carrying \p OREMsg, located at
/// \p I if non-null, otherwise at \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  // Hints are only consulted for the remark's pass name here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}
819
/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  // Hints are only consulted for the remark's pass name here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
                             I, DL)
            << Msg);
}
834
835/// Report successful vectorization of the loop. In case an outer loop is
836/// vectorized, prepend "outer" to the vectorization remark.
837static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
838 VectorizationFactor VF, unsigned IC) {
839 LLVM_DEBUG(debugVectorizationMessage(
840 "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
841 nullptr));
842 StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
843 ORE->emit(RemarkBuilder: [&]() {
844 return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
845 TheLoop->getHeader())
846 << "vectorized " << LoopType << "loop (vectorization width: "
847 << ore::NV("VectorizationFactor", VF.Width)
848 << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
849 });
850}
851
852} // end namespace llvm
853
854namespace llvm {
855
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired; tail folding is
  // preferred but not mandatory.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize at all.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
878
879/// LoopVectorizationCostModel - estimates the expected speedups due to
880/// vectorization.
881/// In many cases vectorization is not profitable. This can happen because of
882/// a number of reasons. In this class we mainly attempt to predict the
883/// expected speedup/slowdowns due to the supported instruction set. We use the
884/// TargetTransformInfo to query the different backends for the cost of
885/// different operations.
886class LoopVectorizationCostModel {
887 friend class LoopVectorizationPlanner;
888
889public:
  /// Construct the cost model; all analysis results are borrowed, not owned.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE,
                             std::function<BlockFrequencyInfo &()> GetBFI,
                             const Function *F, const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI, bool OptForSize)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), GetBFI(GetBFI),
        TheFunction(F), Hints(Hints), InterleaveInfo(IAI),
        OptForSize(OptForSize) {
    // Only bother computing vscale-for-tuning when scalable vectors are
    // possible (by target support or by the force flag).
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    // minsize functions are costed by code size; everything else by
    // reciprocal throughput.
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
  }
908
  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    // Populate the per-VF decision maps before costing.
    collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
    return expectedCost(VF: UserVF).isValid();
  }

  /// \return True if maximizing vector bandwidth is enabled by the target or
  /// user options, for the given register kind.
  bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);

  /// \return True if register pressure should be considered for the given VF.
  bool shouldConsiderRegPressureForVF(ElementCount VF);

  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A call may be vectorized in different ways depending on whether we have
  /// vectorized variants available and whether the target supports masking.
  /// This function analyzes all calls in the function at the supplied VF,
  /// makes a decision based on the costs of available options, and stores that
  /// decision in a map for use in planning and plan execution.
  void setVectorizedCallDecision(ElementCount VF);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductions.
  void collectInLoopReductions();

  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }

  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
977
  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // collectInstsToScalarize(VF) must have run for this VF already.
    auto Scalars = InstsToScalarize.find(Key: VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(Key: I);
  }

  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(Val: I))
      return false;

    // At VF == 1 every value is trivially uniform.
    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(Val: VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(Ptr: I);
  }

  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // At VF == 1 every value stays scalar by definition.
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(Val: VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(Ptr: I);
  }

  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    // Truncs must truncate at most to their destination type.
    if (isa_and_nonnull<TruncInst>(Val: I) && MinBWs.contains(Key: I) &&
        I->getType()->getScalarSizeInBits() < MinBWs.lookup(Key: I))
      return false;
    // Only worthwhile for vector VFs on instructions that will actually be
    // widened (not scalarized).
    return VF.isVector() && MinBWs.contains(Key: I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1038
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision recorded yet.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Member of an interleave group.
    CM_GatherScatter, // Lowered as a masked gather/scatter.
    CM_Scalarize,     // Replicated per vector lane.
    CM_VectorCall,    // Call a vector function variant (see
                      // CallWideningDecision::Variant).
    CM_IntrinsicCall  // Call a vector intrinsic (see
                      // CallWideningDecision::IID).
  };

  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    WideningDecisions[{I, VF}] = {W, Cost};
  }
1058
1059 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1060 /// interleaving group \p Grp and vector width \p VF.
1061 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1062 ElementCount VF, InstWidening W,
1063 InstructionCost Cost) {
1064 assert(VF.isVector() && "Expected VF >=2");
1065 /// Broadcast this decicion to all instructions inside the group.
1066 /// When interleaving, the cost will only be assigned one instruction, the
1067 /// insert position. For other cases, add the appropriate fraction of the
1068 /// total cost to each instruction. This ensures accurate costs are used,
1069 /// even if the insert position instruction is not used.
1070 InstructionCost InsertPosCost = Cost;
1071 InstructionCost OtherMemberCost = 0;
1072 if (W != CM_Interleave)
1073 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1074 ;
1075 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1076 if (auto *I = Grp->getMember(Index: Idx)) {
1077 if (Grp->getInsertPos() == I)
1078 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1079 else
1080 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1081 }
1082 }
1083 }
1084
1085 /// Return the cost model decision for the given instruction \p I and vector
1086 /// width \p VF. Return CM_Unknown if this instruction did not pass
1087 /// through the cost modeling.
1088 InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1089 assert(VF.isVector() && "Expected VF to be a vector VF");
1090 assert(
1091 TheLoop->isInnermost() &&
1092 "cost-model should not be used for outer loops (in VPlan-native path)");
1093
1094 std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
1095 auto Itr = WideningDecisions.find(Val: InstOnVF);
1096 if (Itr == WideningDecisions.end())
1097 return CM_Unknown;
1098 return Itr->second.first;
1099 }
1100
  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF. The decision must already have been recorded via
  /// setWideningDecision.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }

  /// How a call should be lowered at a given VF, together with its cost.
  struct CallWideningDecision {
    // Chosen lowering kind (e.g. CM_VectorCall, CM_IntrinsicCall).
    InstWidening Kind;
    // Vectorized function variant to call, if any.
    Function *Variant;
    // Vector intrinsic to use, if any.
    Intrinsic::ID IID;
    // Mask argument position, or std::nullopt when unmasked.
    std::optional<unsigned> MaskPos;
    InstructionCost Cost;
  };

  /// Record the call-widening decision for \p CI at vector width \p VF.
  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost};
  }
1126
1127 CallWideningDecision getCallWideningDecision(CallInst *CI,
1128 ElementCount VF) const {
1129 assert(!VF.isScalar() && "Expected vector VF");
1130 auto I = CallWideningDecisions.find(Val: {CI, VF});
1131 if (I == CallWideningDecisions.end())
1132 return {.Kind: CM_Unknown, .Variant: nullptr, .IID: Intrinsic::not_intrinsic, .MaskPos: std::nullopt, .Cost: 0};
1133 return I->second;
1134 }
1135
  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(Val: I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
    Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(i_nocapture: 0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(V: Op);
  }

  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop.
  void collectInstsToScalarize(ElementCount VF);

  /// Collect values that will not be widened, including Uniforms, Scalars, and
  /// Instructions to Scalarize for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
    // Do the analysis once. Uniforms is used as the "already analyzed"
    // marker for this VF.
    if (VF.isScalar() || Uniforms.contains(Val: VF))
      return;
    // NOTE: the order matters — later collection steps consume the decisions
    // recorded by the earlier ones.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
    collectInstsToScalarize(VF);
  }
1183
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr. Requires the
  /// access to be consecutive; the target query can be overridden by the
  /// ForceTargetSupportsMaskedMemoryOps flag.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
                          unsigned AddressSpace) const {
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           (ForceTargetSupportsMaskedMemoryOps ||
            TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace));
  }

  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr. Requires the
  /// access to be consecutive; the target query can be overridden by the
  /// ForceTargetSupportsMaskedMemoryOps flag.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
                         unsigned AddressSpace) const {
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           (ForceTargetSupportsMaskedMemoryOps ||
            TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace));
  }
1201
1202 /// Returns true if the target machine can represent \p V as a masked gather
1203 /// or scatter operation.
1204 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1205 bool LI = isa<LoadInst>(Val: V);
1206 bool SI = isa<StoreInst>(Val: V);
1207 if (!LI && !SI)
1208 return false;
1209 auto *Ty = getLoadStoreType(I: V);
1210 Align Align = getLoadStoreAlignment(I: V);
1211 if (VF.isVector())
1212 Ty = VectorType::get(ElementType: Ty, EC: VF);
1213 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1214 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1215 }
1216
1217 /// Returns true if the target machine supports all of the reduction
1218 /// variables found for the given VF.
1219 bool canVectorizeReductions(ElementCount VF) const {
1220 return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
1221 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1222 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1223 }));
1224 }
1225
1226 /// Given costs for both strategies, return true if the scalar predication
1227 /// lowering should be used for div/rem. This incorporates an override
1228 /// option so it is not simply a cost comparison.
1229 bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
1230 InstructionCost SafeDivisorCost) const {
1231 switch (ForceSafeDivisor) {
1232 case cl::BOU_UNSET:
1233 return ScalarCost < SafeDivisorCost;
1234 case cl::BOU_TRUE:
1235 return false;
1236 case cl::BOU_FALSE:
1237 return true;
1238 }
1239 llvm_unreachable("impossible case value");
1240 }
1241
  /// Returns true if \p I is an instruction which requires predication and
  /// for which our chosen predication strategy is scalarization (i.e. we
  /// don't have an alternate strategy such as masking available).
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  bool isScalarWithPredication(Instruction *I, ElementCount VF);

  /// Returns true if \p I is an instruction that needs to be predicated
  /// at runtime. The result is independent of the predication mechanism.
  /// Superset of instructions that return true for isScalarWithPredication.
  bool isPredicatedInst(Instruction *I) const;

  /// A helper function that returns how much we should divide the cost of a
  /// predicated block by. Typically this is the reciprocal of the block
  /// probability, i.e. if we return X we are assuming the predicated block will
  /// execute once for every X iterations of the loop header so the block should
  /// only contribute 1/X of its cost to the total cost calculation, but when
  /// optimizing for code size it will just be 1 as code size costs don't depend
  /// on execution probabilities.
  ///
  /// Note that if a block wasn't originally predicated but was predicated due
  /// to tail folding, the divisor will still be 1 because it will execute for
  /// every iteration of the loop header.
  inline uint64_t
  getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
                          const BasicBlock *BB);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Return the costs for our two available strategies for lowering a
  /// div/rem operation which requires speculating at least one lane.
  /// First result is for scalarization (will be invalid for scalable
  /// vectors); second is for the safe-divisor strategy.
  std::pair<InstructionCost, InstructionCost>
  getDivRemSpeculationCost(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) const {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to, or null if
  /// it is not part of one.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) const {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1298
  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    // If scalar epilogues are disallowed outright, there is nothing to
    // require.
    if (!isScalarEpilogueAllowed()) {
      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
      return false;
    }
    // If we might exit from anywhere but the latch and early exit vectorization
    // is disabled, we must run the exiting iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
                           "from latch block\n");
      return true;
    }
    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                           "interleaved group requires scalar epilogue\n");
      return true;
    }
    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
    return false;
  }

  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if tail-folding is preferred over a scalar epilogue.
  bool preferPredicatedLoop() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
           ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate;
  }

  /// Returns the TailFoldingStyle that is best for the current loop.
  /// Valid only after setTailFoldingStyle has run.
  TailFoldingStyle getTailFoldingStyle() const {
    return ChosenTailFoldingStyle;
  }
1339
  /// Selects and saves TailFoldingStyle. Must be called exactly once, before
  /// getTailFoldingStyle is consulted.
  /// \param IsScalableVF true if scalable vector factors enabled.
  /// \param UserIC User specific interleave count.
  void setTailFoldingStyle(bool IsScalableVF, unsigned UserIC) {
    assert(ChosenTailFoldingStyle == TailFoldingStyle::None &&
           "Tail folding must not be selected yet.");
    // None doubles as the "tail folding impossible" sentinel.
    if (!Legal->canFoldTailByMasking()) {
      ChosenTailFoldingStyle = TailFoldingStyle::None;
      return;
    }

    // Default to TTI preference, but allow command line override.
    ChosenTailFoldingStyle = TTI.getPreferredTailFoldingStyle();
    if (ForceTailFoldingStyle.getNumOccurrences())
      ChosenTailFoldingStyle = ForceTailFoldingStyle.getValue();

    if (ChosenTailFoldingStyle != TailFoldingStyle::DataWithEVL)
      return;
    // Override EVL styles if needed.
    // FIXME: Investigate opportunity for fixed vector factor.
    bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
    if (EVLIsLegal)
      return;
    // If for some reason EVL mode is unsupported, fallback to a scalar epilogue
    // if it's allowed, or DataWithoutLaneMask otherwise.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
        ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate)
      ChosenTailFoldingStyle = TailFoldingStyle::None;
    else
      ChosenTailFoldingStyle = TailFoldingStyle::DataWithoutLaneMask;

    LLVM_DEBUG(
        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
                  "not try to generate VP Intrinsics "
               << (UserIC > 1
                       ? "since interleave count specified is greater than 1.\n"
                       : "due to non-interleaving reasons.\n"));
  }
1379
  /// Returns true if all loop blocks should be masked to fold tail loop.
  /// Any style other than None implies tail folding.
  bool foldTailByMasking() const {
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }
1384
1385 /// Returns true if the use of wide lane masks is requested and the loop is
1386 /// using tail-folding with a lane mask for control flow.
1387 bool useWideActiveLaneMask() const {
1388 if (!EnableWideActiveLaneMask)
1389 return false;
1390
1391 return getTailFoldingStyle() == TailFoldingStyle::DataAndControlFlow;
1392 }
1393
  /// Return maximum safe number of elements to be processed per vector
  /// iteration, which do not prevent store-load forwarding and are safe with
  /// regard to the memory dependencies. Required for EVL-based VPlans to
  /// correctly calculate AVL (application vector length) as min(remaining AVL,
  /// MaxSafeElements).
  /// TODO: need to consider adjusting cost model to use this value as a
  /// vectorization factor for EVL-based vectorization.
  std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }

  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// Returns true if VP intrinsics with explicit vector length support should
  /// be generated in the tail folded loop.
  bool foldTailWithEVL() const {
    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
  }

  /// Returns true if the Phi is part of an inloop reduction (collected by
  /// collectInLoopReductions).
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Ptr: Phi);
  }

  /// Returns the set of in-loop reduction PHIs.
  const SmallPtrSetImpl<PHINode *> &getInLoopReductions() const {
    return InLoopReductions;
  }
1425
  /// Returns true if the predicated reduction select should be used to set the
  /// incoming value for the reduction phi.
  bool usePredicatedReductionSelect(RecurKind RecurrenceKind) const {
    // Force to use predicated reduction select since the EVL of the
    // second-to-last iteration might not be VF*UF.
    if (foldTailWithEVL())
      return true;

    // Note: For FindLast recurrences we prefer a predicated select to simplify
    // matching in handleFindLastReductions(), rather than handle multiple
    // cases.
    if (RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RecurrenceKind))
      return true;

    // Otherwise defer to the command-line flag or the target preference.
    return PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect();
  }

  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;

  /// Invalidates decisions already taken by the cost model. Clears all
  /// per-VF decision maps so the analysis can be redone from scratch.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1461
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  InstructionCost expectedCost(ElementCount VF);

  /// Returns true if the cost model has recorded any predicated stores.
  bool hasPredStores() const { return NumPredStores > 0; }

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is the interleave count chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF,
                                         const unsigned IC) const;

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
                                                         ElementCount VF,
                                                         Type *VectorTy) const;

  /// Returns true if \p Op should be considered invariant and if it is
  /// trivially hoistable.
  bool shouldConsiderInvariant(Value *Op);

  /// Return the value of vscale used for tuning the cost model, if one was
  /// computed (see initializeVScaleForTuning).
  std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1494
private:
  /// Number of predicated stores found in the loop; queried via
  /// hasPredStores().
  unsigned NumPredStores = 0;

  /// Used to store the value of vscale used for tuning the cost model. It is
  /// initialized during object construction.
  std::optional<unsigned> VScaleForTuning;
1502 /// Initializes the value of vscale used for tuning the cost model. If
1503 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1504 /// return the value returned by the corresponding TTI method.
1505 void initializeVScaleForTuning() {
1506 const Function *Fn = TheLoop->getHeader()->getParent();
1507 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1508 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1509 auto Min = Attr.getVScaleRangeMin();
1510 auto Max = Attr.getVScaleRangeMax();
1511 if (Max && Min == Max) {
1512 VScaleForTuning = Max;
1513 return;
1514 }
1515 }
1516
1517 VScaleForTuning = TTI.getVScaleForTuning();
1518 }
1519
  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF, unsigned UserIC,
                                           bool FoldTailByMasking);

  /// If \p VF * \p UserIC > MaxTripCount, clamps VF to the next lower VF that
  /// results in VF * UserIC <= MaxTripCount.
  ElementCount clampVFByMaxTripCount(ElementCount VF, unsigned MaxTripCount,
                                     unsigned UserIC,
                                     bool FoldTailByMasking) const;

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF, unsigned UserIC,
                                       bool FoldTailByMasking);

  /// Checks if scalable vectorization is supported and enabled. Caches the
  /// result to avoid repeated debug dumps for repeated queries.
  bool isScalableVectorizationAllowed();

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction \p I.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for the interleaving group of memory instructions
  /// containing \p I.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction \p I.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;
1583
  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = MapVector<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block, keyed by vectorization factor.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// Control finally chosen tail folding style.
  TailFoldingStyle ChosenTailFoldingStyle = TailFoldingStyle::None;

  /// true if scalable vectorization is supported and enabled. Unset until
  /// computed (and cached) by isScalableVectorizationAllowed().
  std::optional<bool> IsScalableVectorizationAllowed;

  /// Maximum safe number of elements to be processed per vector iteration,
  /// which do not prevent store-load forwarding and are safe with regard to the
  /// memory dependencies. Required for EVL-based vectorization, where this
  /// value is used as the upper bound of the safe AVL.
  std::optional<unsigned> MaxSafeElements;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  MapVector<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1640
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// Cached widening decisions for (memory instruction, VF) pairs.
  DecisionList WideningDecisions;

  /// Keeps the widening decision and associated costs for call instructions,
  /// per (call, VF) pair.
  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  CallDecisionList CallWideningDecisions;
1681
  /// Returns true if \p V is expected to be vectorized and it needs to be
  /// extracted.
  bool needsExtract(Value *V, ElementCount VF) const {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    // No extract is needed for scalar VFs, non-instruction values, values
    // defined outside the loop or loop-invariant, or anything that will
    // itself be scalarized (including calls decided to be scalarized).
    if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
        TheLoop->isLoopInvariant(V: I) ||
        getWideningDecision(I, VF) == CM_Scalarize ||
        (isa<CallInst>(Val: I) &&
         getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize))
      return false;

    // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen, because it is called
    // via getScalarizationOverhead from setCostBasedWideningDecision, before
    // the scalars are collected. That should be a safe assumption in most
    // cases, because we check if the operands have vectorizable types
    // beforehand in LoopVectorizationLegality.
    return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
  };
1701
1702 /// Returns a range containing only operands needing to be extracted.
1703 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1704 ElementCount VF) const {
1705
1706 SmallPtrSet<const Value *, 4> UniqueOperands;
1707 SmallVector<Value *, 4> Res;
1708 for (Value *Op : Ops) {
1709 if (isa<Constant>(Val: Op) || !UniqueOperands.insert(Ptr: Op).second ||
1710 !needsExtract(V: Op, VF))
1711 continue;
1712 Res.push_back(Elt: Op);
1713 }
1714 return Res;
1715 }
1716
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// A function to lazily fetch BlockFrequencyInfo. This avoids computing it
  /// unless necessary, e.g. when the loop isn't legal to vectorize or when
  /// there is no predication.
  std::function<BlockFrequencyInfo &()> GetBFI;
  /// The BlockFrequencyInfo returned from GetBFI.
  BlockFrequencyInfo *BFI = nullptr;
  /// Returns the BlockFrequencyInfo for the function if cached, otherwise
  /// fetches it via GetBFI. Avoids an indirect call to the std::function.
  BlockFrequencyInfo &getBFI() {
    if (!BFI)
      BFI = &GetBFI();
    return *BFI;
  }

  /// The function under consideration (presumably the parent of TheLoop; not
  /// set in this view).
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// The kind of cost that we are calculating
  TTI::TargetCostKind CostKind;

  /// Whether this loop should be optimized for size based on function attribute
  /// or profile information.
  bool OptForSize;

  /// The highest VF possible for this loop, without using MaxBandwidth.
  FixedScalableVFPair MaxPermissibleVFWithoutMaxBW;
1786};
1787} // end namespace llvm
1788
1789namespace {
1790/// Helper struct to manage generating runtime checks for vectorization.
1791///
1792/// The runtime checks are created up-front in temporary blocks to allow better
1793/// estimating the cost and un-linked from the existing IR. After deciding to
1794/// vectorize, the checks are moved back. If deciding not to vectorize, the
1795/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr no SCEV checks have been generated.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr no memory runtime checks have been generated.
  Value *MemRuntimeCheckCond = nullptr;

  /// Analyses kept up to date while the temporary check blocks are created
  /// and unlinked.
  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  /// Separate expanders for the SCEV predicate and the memory runtime checks,
  /// so each set of expanded instructions can be cleaned up independently.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  /// Set when the number of runtime pointer checks exceeds
  /// VectorizeMemoryCheckThreshold; no check blocks are created in that case
  /// and getCost() reports an invalid cost.
  bool CostTooHigh = false;

  /// Parent loop of the vectorized loop, if any. Used by getCost() to
  /// estimate whether hoisted memory checks amortize over outer iterations.
  Loop *OuterLoop = nullptr;

  PredicatedScalarEvolution &PSE;

  /// The kind of cost that we are calculating
  TTI::TargetCostKind CostKind;

public:
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    LoopInfo *LI, TargetTransformInfo *TTI,
                    TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI),
        SCEVExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        MemCheckExp(*PSE.getSE(), "scev.check", /*PreserveLCSSA=*/false),
        PSE(PSE), CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC,
              OptimizationRemarkEmitter &ORE) {

    // Hard cutoff to limit compile-time increase in case a very large number of
    // runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh) {
      // Mark runtime checks as never succeeding when they exceed the threshold.
      MemRuntimeCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      SCEVCheckCond = ConstantInt::getTrue(Context&: L->getHeader()->getContext());
      ORE.emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "TooManyMemoryRuntimeChecks", L->getStartLoc(),
                   L->getHeader())
               << "loop not vectorized: too many memory checks needed";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      return;
    }

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
                                  MSSAU: nullptr, BBName: "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
      if (isa<Constant>(Val: SCEVCheckCond)) {
        // Clean up directly after expanding the predicate to a constant, to
        // avoid further expansions re-using anything left over from SCEVExp.
        SCEVExpanderCleaner SCEVCleaner(SCEVExp);
        SCEVCleaner.cleanup();
      }
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Place the memory checks after the SCEV checks (if any).
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
                                 BBName: "vector.memcheck");

      // Prefer the cheaper diff-based checks when available; the runtime VF is
      // computed lazily and reused across checks.
      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
            GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
            Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    SCEVExp.eraseDeadInstructions(Root: SCEVCheckCond);

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(V: Preheader);

    // Move each check block's terminator back into the preheader and terminate
    // the now-unlinked check block with an unreachable placeholder.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }

    DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(BB: MemCheckBlock);
      LI->removeBlock(BB: MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(BB: SCEVCheckBlock);
      LI->removeBlock(BB: SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  /// Compute the cost of the runtime checks generated by create(). Returns an
  /// invalid cost when the number of checks exceeded the threshold.
  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        // The terminator is not part of the check's cost.
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
        if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
            if (EstimatedTC->isFixed())
              BestTripCount = EstimatedTC->getFixedValue();

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
                                     b: (InstructionCost::CostType)1);

          if (BestTripCount > 1)
            LLVM_DEBUG(dbgs()
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A check block with predecessors has been linked back into the IR and is
    // in use; otherwise it is dead and must be cleaned up.
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(I: &I))
          continue;
        SE.forgetValue(V: &I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }

  /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
  /// outside VPlan. Returns {nullptr, nullptr} when no checks were generated
  /// or the condition folded to a known-false constant.
  std::pair<Value *, BasicBlock *> getSCEVChecks() const {
    using namespace llvm::PatternMatch;
    if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};

    return {SCEVCheckCond, SCEVCheckBlock};
  }

  /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
  /// outside VPlan. Returns {nullptr, nullptr} when the condition folded to a
  /// known-false constant.
  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() const {
    using namespace llvm::PatternMatch;
    if (MemRuntimeCheckCond && match(V: MemRuntimeCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};
    return {MemRuntimeCheckCond, MemCheckBlock};
  }

  /// Return true if any runtime checks have been added
  bool hasChecks() const {
    return getSCEVChecks().first || getMemRuntimeChecks().first;
  }
};
2089} // namespace
2090
2091static bool useActiveLaneMask(TailFoldingStyle Style) {
2092 return Style == TailFoldingStyle::Data ||
2093 Style == TailFoldingStyle::DataAndControlFlow;
2094}
2095
// Returns true if \p Style uses the active lane mask for loop control flow
// in addition to masking the data operations.
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
  return Style == TailFoldingStyle::DataAndControlFlow;
}
2099
2100// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2101// vectorization. The loop needs to be annotated with #pragma omp simd
2102// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2103// vector length information is not provided, vectorization is not considered
2104// explicit. Interleave hints are not allowed either. These limitations will be
2105// relaxed in the future.
2106// Please, note that we are currently forced to abuse the pragma 'clang
2107// vectorize' semantics. This pragma provides *auto-vectorization hints*
2108// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2109// provides *explicit vectorization hints* (LV can bypass legal checks and
2110// assume that vectorization is legal). However, both hints are implemented
2111// using the same metadata (llvm.loop.vectorize, processed by
2112// LoopVectorizeHints). This will be fixed in the future when the native IR
2113// representation for pragma 'omp simd' is introduced.
2114static bool isExplicitVecOuterLoop(Loop *OuterLp,
2115 OptimizationRemarkEmitter *ORE) {
2116 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2117 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2118
2119 // Only outer loops with an explicit vectorization hint are supported.
2120 // Unannotated outer loops are ignored.
2121 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2122 return false;
2123
2124 Function *Fn = OuterLp->getHeader()->getParent();
2125 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2126 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2127 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2128 return false;
2129 }
2130
2131 if (Hints.getInterleave() > 1) {
2132 // TODO: Interleave support is future work.
2133 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2134 "outer loops.\n");
2135 Hints.emitRemarkWithHints();
2136 return false;
2137 }
2138
2139 return true;
2140}
2141
2142static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2143 OptimizationRemarkEmitter *ORE,
2144 SmallVectorImpl<Loop *> &V) {
2145 // Collect inner loops and outer loops without irreducible control flow. For
2146 // now, only collect outer loops that have explicit vectorization hints. If we
2147 // are stress testing the VPlan H-CFG construction, we collect the outermost
2148 // loop of every loop nest.
2149 if (L.isInnermost() || VPlanBuildStressTest ||
2150 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2151 LoopBlocksRPO RPOT(&L);
2152 RPOT.perform(LI);
2153 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2154 V.push_back(Elt: &L);
2155 // TODO: Collect inner loops inside marked outer loops in case
2156 // vectorization fails for the outer loop. Do not invoke
2157 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2158 // already known to be reducible. We can use an inherited attribute for
2159 // that.
2160 return;
2161 }
2162 }
2163 for (Loop *InnerL : L)
2164 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2165}
2166
2167//===----------------------------------------------------------------------===//
2168// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2169// LoopVectorizationCostModel and LoopVectorizationPlanner.
2170//===----------------------------------------------------------------------===//
2171
/// Emit IR that computes the value of the given induction at \p Index:
///   - integer inductions: StartValue + Index * Step,
///   - pointer inductions: StartValue ptradd (Index * Step),
///   - FP inductions:      StartValue FAdd/FSub (Step * Index), using the
///     opcode of \p InductionBinOp.
/// Returns nullptr for IK_NoInduction.
///
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
Value *
llvm::emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                           Value *Step,
                           InductionDescriptor::InductionKind InductionKind,
                           const BinaryOperator *InductionBinOp) {
  using namespace llvm::PatternMatch;
  // Bring the index to the step's type (sext/trunc for integers, SIToFP for
  // FP inductions).
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (match(V: X, P: m_ZeroInt()))
      return Y;
    if (match(V: Y, P: m_ZeroInt()))
      return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (match(V: X, P: m_One()))
      return Y;
    if (match(V: Y, P: m_One()))
      return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Step of -1 folds to a plain subtraction.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2250
2251static std::optional<unsigned> getMaxVScale(const Function &F,
2252 const TargetTransformInfo &TTI) {
2253 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2254 return MaxVScale;
2255
2256 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2257 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2258
2259 return std::nullopt;
2260}
2261
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  // The widest induction type determines the largest trip count representable
  // by the vector loop's induction variable.
  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = IdxTy->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // Scalable VFs must be bounded with the maximum value of vscale; if
      // that is unknown, conservatively report that overflow is possible.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // No overflow iff the headroom above TC strictly exceeds the largest
    // possible step VF * UF.
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2293
2294// Return whether we allow using masked interleave-groups (for dealing with
2295// strided loads/stores that reside in predicated blocks, or for dealing
2296// with gaps).
2297static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2298 // If an override option has been passed in for interleaved accesses, use it.
2299 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2300 return EnableMaskedInterleavedMemAccesses;
2301
2302 return TTI.enableMaskedInterleavedAccessVectorization();
2303}
2304
/// Wrap the IR block \p CheckIRBB (containing an already-generated runtime
/// check) in a VPIRBasicBlock and splice it into the VPlan between the vector
/// preheader's predecessor and the vector preheader, with the scalar
/// preheader as its failure successor.
void EpilogueVectorizerMainLoop::introduceCheckBlockInVPlan(
    BasicBlock *CheckIRBB) {
  // Note: The block with the minimum trip-count check is already connected
  // during earlier VPlan construction.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  VPBlockBase *PreVectorPH = VectorPHVPBB->getSinglePredecessor();
  assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
  assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
  // Insert the check block on the edge into the vector preheader, then give
  // it the scalar preheader as second successor and swap so the scalar
  // preheader comes first, matching the invariant asserted above.
  VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB);
  VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPBB, BlockPtr: CheckVPIRBB);
  PreVectorPH = CheckVPIRBB;
  VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH);
  PreVectorPH->swapSuccessors();

  // We just connected a new block to the scalar preheader. Update all
  // VPPhis by adding an incoming value for it, replicating the last value.
  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
  for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) {
    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
           "must have incoming values for all operands");
    R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2));
  }
}
2329
/// Emit the minimum-iteration-count check for the main vector loop into
/// \p VectorPH and return its boolean result: true when the scalar loop must
/// be taken because fewer than max(VF * UF, MinProfitableTripCount)
/// iterations remain (or when tail-folding makes the check unnecessary, a
/// constant false).
Value *EpilogueVectorizerMainLoop::createIterationCountCheck(
    BasicBlock *VectorPH, ElementCount VF, unsigned UF) const {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
                                                    : ICmpInst::ICMP_ULT;

  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = VectorPH;
  // Use an InstSimplifyFolder so trivially-foldable comparisons are
  // simplified as they are built.
  IRBuilder<InstSimplifyFolder> Builder(
      TCCheckBlock->getContext(),
      InstSimplifyFolder(TCCheckBlock->getDataLayout()));
  Builder.SetInsertPoint(TCCheckBlock->getTerminator());

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *Count = getTripCount();
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);

    Value *MinProfTC =
        Builder.CreateElementCount(Ty: CountTy, EC: MinProfitableTripCount);
    if (!VF.isScalable())
      return MinProfTC;
    // For scalable VFs the runtime step may exceed the static minimum
    // profitable trip count, so take the maximum of the two.
    return Builder.CreateBinaryIntrinsic(
        ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
    } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P),
                                    LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  }

  return CheckMinIters;
}
2387
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. All
/// predecessors and successors of VPBB, if any, are rewired to the new
/// VPIRBasicBlock. If \p VPBB may be unreachable, \p Plan must be passed.
static VPIRBasicBlock *replaceVPBBWithIRVPBB(VPBasicBlock *VPBB,
                                             BasicBlock *IRBB,
                                             VPlan *Plan = nullptr) {
  if (!Plan)
    Plan = VPBB->getPlan();
  VPIRBasicBlock *IRVPBB = Plan->createVPIRBasicBlock(IRBB);
  // Move the phis first so they stay at the beginning of the new block,
  // ahead of any recipes the VPIRBasicBlock already contains.
  auto IP = IRVPBB->begin();
  for (auto &R : make_early_inc_range(Range: VPBB->phis()))
    R.moveBefore(BB&: *IRVPBB, I: IP);

  // Then append all remaining (non-phi) recipes, preserving their order.
  for (auto &R :
       make_early_inc_range(Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end())))
    R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());

  VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
  return IRVPBB;
}
2410
/// Split the original loop preheader at its terminator to create a new,
/// empty scalar preheader (named "<Prefix>scalar.ph"); the upper half of the
/// original preheader remains the vector preheader. Returns the new scalar
/// preheader block.
BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
  BasicBlock *VectorPH = OrigLoop->getLoopPreheader();
  assert(VectorPH && "Invalid loop structure");
  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
         "loops not exiting via the latch without required epilogue?");

  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
  // wrapping the newly created scalar preheader here at the moment, because the
  // Plan's scalar preheader may be unreachable at this point. Instead it is
  // replaced in executePlan.
  return SplitBlock(Old: VectorPH, SplitPt: VectorPH->getTerminator(), DT, LI, MSSAU: nullptr,
                    BBName: Twine(Prefix) + "scalar.ph");
}
2425
2426/// Knowing that loop \p L executes a single vector iteration, add instructions
2427/// that will get simplified and thus should not have any cost to \p
2428/// InstsToIgnore.
2429static void addFullyUnrolledInstructionsToIgnore(
2430 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2431 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2432 auto *Cmp = L->getLatchCmpInst();
2433 if (Cmp)
2434 InstsToIgnore.insert(Ptr: Cmp);
2435 for (const auto &KV : IL) {
2436 // Extract the key by hand so that it can be used in the lambda below. Note
2437 // that captured structured bindings are a C++20 extension.
2438 const PHINode *IV = KV.first;
2439
2440 // Get next iteration value of the induction variable.
2441 Instruction *IVInst =
2442 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2443 if (all_of(Range: IVInst->users(),
2444 P: [&](const User *U) { return U == IV || U == Cmp; }))
2445 InstsToIgnore.insert(Ptr: IVInst);
2446 }
2447}
2448
2449BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2450 // Create a new IR basic block for the scalar preheader.
2451 BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
2452 return ScalarPH->getSinglePredecessor();
2453}
2454
namespace {

/// DenseMapInfo that keys instructions by structural identity rather than by
/// pointer: instructions hash on opcode and operands and compare with
/// isIdenticalTo(), enabling the simple CSE in legacyCSE() below.
struct CSEDenseMapInfo {
  /// Only these instruction kinds participate in the CSE.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
           isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(args: I->getOpcode(),
                        args: hash_combine_range(R: I->operand_values()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys may not be dereferenced; they only compare equal by
    // pointer identity.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(I: RHS);
  }
};

} // end anonymous namespace
2486
2487/// FIXME: This legacy common-subexpression-elimination routine is scheduled for
2488/// removal, in favor of the VPlan-based one.
2489static void legacyCSE(BasicBlock *BB) {
2490 // Perform simple cse.
2491 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2492 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2493 if (!CSEDenseMapInfo::canHandle(I: &In))
2494 continue;
2495
2496 // Check if we can replace this instruction with any of the
2497 // visited instructions.
2498 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2499 In.replaceAllUsesWith(V);
2500 In.eraseFromParent();
2501 continue;
2502 }
2503
2504 CSEMap[&In] = &In;
2505 }
2506}
2507
2508/// This function attempts to return a value that represents the ElementCount
2509/// at runtime. For fixed-width VFs we know this precisely at compile
2510/// time, but for scalable VFs we calculate it based on an estimate of the
2511/// vscale value.
2512static unsigned estimateElementCount(ElementCount VF,
2513 std::optional<unsigned> VScale) {
2514 unsigned EstimatedVF = VF.getKnownMinValue();
2515 if (VF.isScalable())
2516 if (VScale)
2517 EstimatedVF *= *VScale;
2518 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2519 return EstimatedVF;
2520}
2521
/// Return the cost of the call \p CI at vectorization factor \p VF. For
/// vector VFs this is the pre-computed widening-decision cost; for scalar VF
/// it is the cheaper of the scalar call and, when available, the equivalent
/// intrinsic (or a matched reduction-pattern cost for fmuladd).
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return getCallWideningDecision(CI, VF).Cost;

  Type *RetTy = CI->getType();
  // An fmuladd matching a reduction pattern may have a cheaper pre-computed
  // reduction cost; prefer that if present.
  if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
    if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
      return *RedCost;

  // Collect argument types to cost the plain scalar call.
  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(Elt: ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    return std::min(a: ScalarCallCost, b: IntrinsicCost);
  }
  return ScalarCallCost;
}
2549
2550static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2551 if (VF.isScalar() || !canVectorizeTy(Ty))
2552 return Ty;
2553 return toVectorizedTy(Ty, EC: VF);
2554}
2555
2556InstructionCost
2557LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2558 ElementCount VF) const {
2559 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
2560 assert(ID && "Expected intrinsic call!");
2561 Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
2562 FastMathFlags FMF;
2563 if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
2564 FMF = FPMO->getFastMathFlags();
2565
2566 SmallVector<const Value *> Arguments(CI->args());
2567 FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
2568 SmallVector<Type *> ParamTys;
2569 std::transform(first: FTy->param_begin(), last: FTy->param_end(),
2570 result: std::back_inserter(x&: ParamTys),
2571 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });
2572
2573 IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
2574 dyn_cast<IntrinsicInst>(Val: CI),
2575 InstructionCost::getInvalid());
2576 return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
2577}
2578
/// Post-process the vectorized loop: hook up widened non-induction PHI
/// operands, then run the legacy CSE over the vector loop header (when a
/// vector loop remains) to remove redundant induction instructions.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  fixNonInductionPHIs(State);

  // Don't apply optimizations below when no (vector) loop remains, as they all
  // require one at the moment.
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
  if (!HeaderVPBB)
    return;

  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];

  // Remove redundant induction instructions.
  legacyCSE(BB: HeaderBB);
}
2595
/// Populate the incoming values/blocks of every widened (VPWidenPHIRecipe)
/// PHI in the plan. This is deferred until after code generation because the
/// incoming values may be defined after the PHI itself.
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
    for (VPRecipeBase &P : VPBB->phis()) {
      // Only widened non-induction PHIs need their operands wired up here.
      VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
      if (!VPPhi)
        continue;
      PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
      // Make sure the builder has a valid insert point.
      Builder.SetInsertPoint(NewPhi);
      for (const auto &[Inc, VPBB] : VPPhi->incoming_values_and_blocks())
        NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
    }
  }
}
2611
/// Collect, for vectorization factor \p VF, the set of instructions that will
/// remain scalar after vectorization: uniform instructions, pointers feeding
/// scalar memory accesses, forced scalars, and induction variables whose
/// users all remain scalar. Results are stored in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert_range(R&: Uniforms[VF]);
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a getelementptr
  // instruction contained in the loop.
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert_range(R&: Uniforms[VF]);

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar unless the operation is a gather or scatter.
  // The value operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  // A pointer is scalar only if no use of it was possibly non-scalar.
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    // Src is scalar if every user is outside the loop, already scalar, or a
    // load/store using Src as a scalar pointer.
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  IsScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert_range(R&: Worklist);
}
2800
/// Return true if \p I must be scalarized with predication at vectorization
/// factor \p VF, i.e. it needs predication and no non-scalar (masked or
/// safe-divisor) lowering is available.
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
                                                         ElementCount VF) {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    // A call can avoid scalarization if it has a vector widening decision.
    if (VF.isScalar())
      return true;
    return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    // Memory accesses can avoid scalarization via masked load/store or
    // masked gather/scatter, when legal for the target.
    auto *Ptr = getLoadStorePointerOperand(V: I);
    auto *Ty = getLoadStoreType(I);
    unsigned AS = getLoadStoreAddressSpace(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(ElementType: Ty, EC: VF);
    const Align Alignment = getLoadStoreAlignment(I);
    return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                                TTI.isLegalMaskedGather(DataType: VTy, Alignment))
                           : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                               TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
2841
// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
/// Return true if \p I must execute under a mask in the vectorized loop,
/// either because it was conditionally executed in the original loop or
/// because tail-folding introduces a mask whose inactive lanes would make
/// executing \p I unmasked unsafe.
bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
  // TODO: We can use the loop-preheader as context point here and get
  // context sensitive reasoning for isSafeToSpeculativelyExecute.
  // Speculatable instructions, unmasked memory/call instructions, and
  // control-flow/phi/alloca instructions never need predication.
  if (isSafeToSpeculativelyExecute(I) ||
      (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) ||
      isa<UncondBrInst, CondBrInst, SwitchInst, PHINode, AllocaInst>(Val: I))
    return false;

  // If the instruction was executed conditionally in the original scalar loop,
  // predication is needed with a mask whose lanes are all possibly inactive.
  if (Legal->blockNeedsPredication(BB: I->getParent()))
    return true;

  // If we're not folding the tail by masking, predication is unnecessary.
  if (!foldTailByMasking())
    return false;

  // All that remain are instructions with side-effects originally executed in
  // the loop unconditionally, but now execute under a tail-fold mask (only)
  // having at least one active lane (the first). If the side-effects of the
  // instruction are invariant, executing it w/o (the tail-folding) mask is safe
  // - it will cause the same side-effects as when masked.
  switch(I->getOpcode()) {
  default:
    llvm_unreachable(
        "instruction should have been considered by earlier checks");
  case Instruction::Call:
    // Side-effects of a Call are assumed to be non-invariant, needing a
    // (fold-tail) mask.
    assert(Legal->isMaskRequired(I) &&
           "should have returned earlier for calls not needing a mask");
    return true;
  case Instruction::Load:
    // If the address is loop invariant no predication is needed.
    return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
  case Instruction::Store: {
    // For stores, we need to prove both speculation safety (which follows from
    // the same argument as loads), but also must prove the value being stored
    // is correct. The easiest form of the later is to require that all values
    // stored are the same.
    return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
             TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
  }
  case Instruction::UDiv:
  case Instruction::URem:
    // If the divisor is loop-invariant no predication is needed.
    return !Legal->isInvariant(V: I->getOperand(i: 1));
  case Instruction::SDiv:
  case Instruction::SRem:
    // Conservative for now, since masked-off lanes may be poison and could
    // trigger signed overflow.
    return true;
  }
}
2897
2898uint64_t LoopVectorizationCostModel::getPredBlockCostDivisor(
2899 TargetTransformInfo::TargetCostKind CostKind, const BasicBlock *BB) {
2900 if (CostKind == TTI::TCK_CodeSize)
2901 return 1;
2902 // If the block wasn't originally predicated then return early to avoid
2903 // computing BlockFrequencyInfo unnecessarily.
2904 if (!Legal->blockNeedsPredication(BB))
2905 return 1;
2906
2907 uint64_t HeaderFreq =
2908 getBFI().getBlockFreq(BB: TheLoop->getHeader()).getFrequency();
2909 uint64_t BBFreq = getBFI().getBlockFreq(BB).getFrequency();
2910 assert(HeaderFreq >= BBFreq &&
2911 "Header has smaller block freq than dominated BB?");
2912 return std::round(x: (double)HeaderFreq / BBFreq);
2913}
2914
/// Return the pair (scalarization cost, safe-divisor cost) for lowering the
/// predicated division/remainder \p I at vectorization factor \p VF. The
/// scalarization cost is invalid for scalable VFs, where scalarization is
/// not legal.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  assert(!isSafeToSpeculativelyExecute(I));

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost +=
        VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost +=
        VF.getFixedValue() *
        TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost =
        ScalarizationCost / getPredBlockCostDivisor(CostKind, BB: I->getParent());
  }

  InstructionCost SafeDivisorCost = 0;
  auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);
  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost +=
      TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
                             CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
                             VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // The unpredicated vector div/rem itself, fed by the guarded divisor.
  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      Opcode: I->getOpcode(), Ty: VecTy, CostKind,
      Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Args: Operands, CxtI: I);
  return {ScalarizationCost, SafeDivisorCost};
}
2972
// Returns true if the interleave group containing \p I can be executed as a
// set of wide (possibly masked) loads/stores for the given VF, i.e. without
// scalarizing the group's members. The checks are ordered: irregular types
// and oversized scalable factors bail out first, then non-integral pointer
// mixing, and finally masking legality when predication or gaps require it.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) const {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(Instr: I);
  assert(Group && "Must have a group.");
  unsigned InterleaveFactor = Group->getFactor();

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(Ty: ScalarTy, DL))
    return false;

  // For scalable vectors, the interleave factors must be <= 8 since we require
  // the (de)interleaveN intrinsics instead of shufflevectors.
  if (VF.isScalable() && InterleaveFactor > 8)
    return false;

  // If the group involves a non-integral pointer, we may not be able to
  // losslessly cast all values to a common type. Compare every member of the
  // group against the insertion point's type.
  bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
  for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
    // Groups may have gaps; a missing member at this index is fine here.
    Instruction *Member = Group->getMember(Index: Idx);
    if (!Member)
      continue;
    auto *MemberTy = getLoadStoreType(I: Member);
    bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
    // Don't coerce non-integral pointers to integers or vice versa.
    if (MemberNI != ScalarNI)
      // TODO: Consider adding special nullptr value case here
      return false;
    // Two non-integral pointers must also live in the same address space.
    if (MemberNI && ScalarNI &&
        ScalarTy->getPointerAddressSpace() !=
            MemberTy->getPointerAddressSpace())
      return false;
  }

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(Val: I) && !Group->isFull();
  // If no masking is needed at all, the group can be widened unconditionally.
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  // Reversed masked interleave groups are not supported.
  if (Group->isReverse())
    return false;

  // TODO: Support interleaved access that requires a gap mask for scalable VFs.
  bool NeedsMaskForGaps = LoadAccessWithGapsRequiresEpilogMasking ||
                          StoreAccessWithGapsRequiresMasking;
  if (VF.isScalable() && NeedsMaskForGaps)
    return false;

  // Finally, masking is needed and legal only if the target supports masked
  // loads/stores of this type, alignment, and address space.
  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  unsigned AS = getLoadStoreAddressSpace(I);
  return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
                       : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
}
3052
3053bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3054 Instruction *I, ElementCount VF) {
3055 // Get and ensure we have a valid memory instruction.
3056 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3057
3058 auto *Ptr = getLoadStorePointerOperand(V: I);
3059 auto *ScalarTy = getLoadStoreType(I);
3060
3061 // In order to be widened, the pointer should be consecutive, first of all.
3062 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3063 return false;
3064
3065 // If the instruction is a store located in a predicated block, it will be
3066 // scalarized.
3067 if (isScalarWithPredication(I, VF))
3068 return false;
3069
3070 // If the instruction's allocated size doesn't equal it's type size, it
3071 // requires padding and will be scalarized.
3072 auto &DL = I->getDataLayout();
3073 if (hasIrregularType(Ty: ScalarTy, DL))
3074 return false;
3075
3076 return true;
3077}
3078
// Collect the instructions that remain uniform-after-vectorization for the
// given vector VF and cache them in Uniforms[VF]. "Uniform" here means only
// lane 0 of the unrolled iterations is demanded; it does NOT mean all lanes
// produce the same value. The algorithm seeds a worklist with known-uniform
// roots (exit compares, uniform memory ops), then expands it backwards in
// topological order, and finally handles induction phi cycles separately.
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again; Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  SmallVector<BasicBlock *> Exiting;
  TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
    if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  // Uniformity is monotone in VF: use the cached result for the next-smaller
  // VF to prune the check below.
  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  // Return true if the memory instruction \p I will be executed as a single
  // (widened, interleaved, or uniform) operation rather than scalarized.
  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        // Side-effect-only intrinsics with invariant operands need just one
        // instance per vector iteration.
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // If the pointer can be proven to be uniform, always add it to the
      // worklist.
      if (isa<Instruction>(Val: Ptr) && Legal->isUniform(V: Ptr, VF))
        AddToWorklistIfAllowed(cast<Instruction>(Val: Ptr));

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
      auto *UI = cast<Instruction>(Val: U);
      return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert_range(R&: Worklist);
}
3318
3319bool LoopVectorizationCostModel::runtimeChecksRequired() {
3320 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3321
3322 if (Legal->getRuntimePointerChecking()->Need) {
3323 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3324 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3325 "loop with '#pragma clang loop vectorize(enable)' when "
3326 "compiling with -Os/-Oz",
3327 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3328 return true;
3329 }
3330
3331 if (!PSE.getPredicate().isAlwaysTrue()) {
3332 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3333 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3334 "loop with '#pragma clang loop vectorize(enable)' when "
3335 "compiling with -Os/-Oz",
3336 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3337 return true;
3338 }
3339
3340 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3341 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3342 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3343 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3344 "this loop without such check by compiling with -Os/-Oz",
3345 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3346 return true;
3347 }
3348
3349 return false;
3350}
3351
// Determine whether scalable vectorization may be attempted for this loop,
// caching the answer in IsScalableVectorizationAllowed. Checks target
// support, user hints, reduction/element-type legality at the maximal
// scalable VF, and the availability of a max vscale for safe-distance
// analysis. Emits an informational remark for each reason it is disallowed.
bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
  // Return the cached answer if we have been here before.
  if (IsScalableVectorizationAllowed)
    return *IsScalableVectorizationAllowed;

  // Pessimistically cache "false"; flipped to true only if all checks pass.
  IsScalableVectorizationAllowed = false;
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return false;

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
                            ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
    return false;
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  auto MaxScalableVF = ElementCount::getScalable(
      MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(VF: MaxScalableVF)) {
    reportVectorizationInfo(
        Msg: "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  // Without a known max vscale, a safe dependence distance cannot be bounded
  // for scalable VFs; getMaxLegalScalableVF relies on this rejection.
  if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
    reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
                            "for safe distance analysis.",
                            ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
    return false;
  }

  IsScalableVectorizationAllowed = true;
  return true;
}
3408
3409ElementCount
3410LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
3411 if (!isScalableVectorizationAllowed())
3412 return ElementCount::getScalable(MinVal: 0);
3413
3414 auto MaxScalableVF = ElementCount::getScalable(
3415 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3416 if (Legal->isSafeForAnyVectorWidth())
3417 return MaxScalableVF;
3418
3419 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3420 // Limit MaxScalableVF by the maximum safe dependence distance.
3421 MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);
3422
3423 if (!MaxScalableVF)
3424 reportVectorizationInfo(
3425 Msg: "Max legal vector width too small, scalable vectorization "
3426 "unfeasible.",
3427 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3428
3429 return MaxScalableVF;
3430}
3431
// Computes the maximum feasible fixed and scalable VFs, bounded by the
// dependence-distance limits from LAA, store-load forwarding distance, any
// user-specified VF (which is honored when safe, clamped or ignored with a
// remark when not), and the target's register/trip-count constraints.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, unsigned UserIC,
    bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElementsPowerOf2 =
      bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
  // Additionally clamp by the store-load forwarding distance, if restricted.
  if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
    MaxSafeElementsPowerOf2 =
        std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
  }
  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);

  // Record the element limit only when some vector width is actually unsafe.
  if (!Legal->isSafeForAnyVectorWidth())
    this->MaxSafeElements = MaxSafeElementsPowerOf2;

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // Scalable UserVF that is unsafe or unsupported: emit the appropriate
    // remark and fall through to the target-based computation below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Default result: scalar fixed VF, no scalable VF; each side is upgraded
  // below if the target can maximize it.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, UserIC, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, UserIC, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
3544
3545FixedScalableVFPair
3546LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3547 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3548 // TODO: It may be useful to do since it's still likely to be dynamically
3549 // uniform if the target can skip.
3550 reportVectorizationFailure(
3551 DebugMsg: "Not inserting runtime ptr check for divergent target",
3552 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3553 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3554 return FixedScalableVFPair::getNone();
3555 }
3556
3557 ScalarEvolution *SE = PSE.getSE();
3558 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3559 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3560 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3561 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3562 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3563 if (TC.isScalar()) {
3564 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3565 OREMsg: "loop trip count is one, irrelevant for vectorization",
3566 ORETag: "SingleIterationLoop", ORE, TheLoop);
3567 return FixedScalableVFPair::getNone();
3568 }
3569
3570 // If BTC matches the widest induction type and is -1 then the trip count
3571 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3572 // to vectorize.
3573 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3574 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3575 BTC->getType()->getScalarSizeInBits() >=
3576 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3577 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3578 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3579 reportVectorizationFailure(
3580 DebugMsg: "Trip count computation wrapped",
3581 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3582 ORETag: "TripCountWrapped", ORE, TheLoop);
3583 return FixedScalableVFPair::getNone();
3584 }
3585
3586 switch (ScalarEpilogueStatus) {
3587 case CM_ScalarEpilogueAllowed:
3588 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: false);
3589 case CM_ScalarEpilogueNotAllowedUsePredicate:
3590 [[fallthrough]];
3591 case CM_ScalarEpilogueNotNeededUsePredicate:
3592 LLVM_DEBUG(
3593 dbgs() << "LV: vector predicate hint/switch found.\n"
3594 << "LV: Not allowing scalar epilogue, creating predicated "
3595 << "vector loop.\n");
3596 break;
3597 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3598 // fallthrough as a special case of OptForSize
3599 case CM_ScalarEpilogueNotAllowedOptSize:
3600 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3601 LLVM_DEBUG(
3602 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3603 else
3604 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3605 << "count.\n");
3606
3607 // Bail if runtime checks are required, which are not good when optimising
3608 // for size.
3609 if (runtimeChecksRequired())
3610 return FixedScalableVFPair::getNone();
3611
3612 break;
3613 }
3614
3615 // Now try the tail folding
3616
3617 // Invalidate interleave groups that require an epilogue if we can't mask
3618 // the interleave-group.
3619 if (!useMaskedInterleavedAccesses(TTI)) {
3620 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3621 "No decisions should have been taken at this point");
3622 // Note: There is no need to invalidate any cost modeling decisions here, as
3623 // none were taken so far.
3624 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3625 }
3626
3627 FixedScalableVFPair MaxFactors =
3628 computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, UserIC, FoldTailByMasking: true);
3629
3630 // Avoid tail folding if the trip count is known to be a multiple of any VF
3631 // we choose.
3632 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3633 MaxFactors.FixedVF.getFixedValue();
3634 if (MaxFactors.ScalableVF) {
3635 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3636 if (MaxVScale) {
3637 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3638 a: *MaxPowerOf2RuntimeVF,
3639 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3640 } else
3641 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3642 }
3643
3644 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3645 // Return false if the loop is neither a single-latch-exit loop nor an
3646 // early-exit loop as tail-folding is not supported in that case.
3647 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3648 !Legal->hasUncountableEarlyExit())
3649 return false;
3650 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3651 ScalarEvolution *SE = PSE.getSE();
3652 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3653 // with uncountable exits. For countable loops, the symbolic maximum must
3654 // remain identical to the known back-edge taken count.
3655 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3656 assert((Legal->hasUncountableEarlyExit() ||
3657 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3658 "Invalid loop count");
3659 const SCEV *ExitCount = SE->getAddExpr(
3660 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3661 const SCEV *Rem = SE->getURemExpr(
3662 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3663 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3664 return Rem->isZero();
3665 };
3666
3667 if (MaxPowerOf2RuntimeVF > 0u) {
3668 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3669 "MaxFixedVF must be a power of 2");
3670 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3671 // Accept MaxFixedVF if we do not have a tail.
3672 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3673 return MaxFactors;
3674 }
3675 }
3676
3677 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3678 if (ExpectedTC && ExpectedTC->isFixed() &&
3679 ExpectedTC->getFixedValue() <=
3680 TTI.getMinTripCountTailFoldingThreshold()) {
3681 if (MaxPowerOf2RuntimeVF > 0u) {
3682 // If we have a low-trip-count, and the fixed-width VF is known to divide
3683 // the trip count but the scalable factor does not, use the fixed-width
3684 // factor in preference to allow the generation of a non-predicated loop.
3685 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3686 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3687 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3688 "remain for any chosen VF.\n");
3689 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3690 return MaxFactors;
3691 }
3692 }
3693
3694 reportVectorizationFailure(
3695 DebugMsg: "The trip count is below the minial threshold value.",
3696 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3697 ORE, TheLoop);
3698 return FixedScalableVFPair::getNone();
3699 }
3700
3701 // If we don't know the precise trip count, or if the trip count that we
3702 // found modulo the vectorization factor is not zero, try to fold the tail
3703 // by masking.
3704 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3705 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3706 setTailFoldingStyle(IsScalableVF: ContainsScalableVF, UserIC);
3707 if (foldTailByMasking()) {
3708 if (foldTailWithEVL()) {
3709 LLVM_DEBUG(
3710 dbgs()
3711 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3712 "try to generate VP Intrinsics with scalable vector "
3713 "factors only.\n");
3714 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3715 // for now.
3716 // TODO: extend it for fixed vectors, if required.
3717 assert(ContainsScalableVF && "Expected scalable vector factor.");
3718
3719 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3720 }
3721 return MaxFactors;
3722 }
3723
3724 // If there was a tail-folding hint/switch, but we can't fold the tail by
3725 // masking, fallback to a vectorization with a scalar epilogue.
3726 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3727 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3728 "scalar epilogue instead.\n");
3729 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3730 return MaxFactors;
3731 }
3732
3733 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3734 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3735 return FixedScalableVFPair::getNone();
3736 }
3737
3738 if (TC.isZero()) {
3739 reportVectorizationFailure(
3740 DebugMsg: "unable to calculate the loop count due to complex control flow",
3741 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3742 return FixedScalableVFPair::getNone();
3743 }
3744
3745 reportVectorizationFailure(
3746 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3747 OREMsg: "cannot optimize for size and vectorize at the same time. "
3748 "Enable vectorization of this loop with '#pragma clang loop "
3749 "vectorize(enable)' when compiling with -Os/-Oz",
3750 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3751 return FixedScalableVFPair::getNone();
3752}
3753
3754bool LoopVectorizationCostModel::shouldConsiderRegPressureForVF(
3755 ElementCount VF) {
3756 if (ConsiderRegPressure.getNumOccurrences())
3757 return ConsiderRegPressure;
3758
3759 // TODO: We should eventually consider register pressure for all targets. The
3760 // TTI hook is temporary whilst target-specific issues are being fixed.
3761 if (TTI.shouldConsiderVectorizationRegPressure())
3762 return true;
3763
3764 if (!useMaxBandwidth(RegKind: VF.isScalable()
3765 ? TargetTransformInfo::RGK_ScalableVector
3766 : TargetTransformInfo::RGK_FixedWidthVector))
3767 return false;
3768 // Only calculate register pressure for VFs enabled by MaxBandwidth.
3769 return ElementCount::isKnownGT(
3770 LHS: VF, RHS: VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF
3771 : MaxPermissibleVFWithoutMaxBW.FixedVF);
3772}
3773
3774bool LoopVectorizationCostModel::useMaxBandwidth(
3775 TargetTransformInfo::RegisterKind RegKind) {
3776 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3777 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3778 (UseWiderVFIfCallVariantsPresent &&
3779 Legal->hasVectorCallVariants())));
3780}
3781
3782ElementCount LoopVectorizationCostModel::clampVFByMaxTripCount(
3783 ElementCount VF, unsigned MaxTripCount, unsigned UserIC,
3784 bool FoldTailByMasking) const {
3785 unsigned EstimatedVF = VF.getKnownMinValue();
3786 if (VF.isScalable() && TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
3787 auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
3788 auto Min = Attr.getVScaleRangeMin();
3789 EstimatedVF *= Min;
3790 }
3791
3792 // When a scalar epilogue is required, at least one iteration of the scalar
3793 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3794 // max VF that results in a dead vector loop.
3795 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
3796 MaxTripCount -= 1;
3797
3798 // When the user specifies an interleave count, we need to ensure that
3799 // VF * UserIC <= MaxTripCount to avoid a dead vector loop.
3800 unsigned IC = UserIC > 0 ? UserIC : 1;
3801 unsigned EstimatedVFTimesIC = EstimatedVF * IC;
3802
3803 if (MaxTripCount && MaxTripCount <= EstimatedVFTimesIC &&
3804 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
3805 // If upper bound loop trip count (TC) is known at compile time there is no
3806 // point in choosing VF greater than TC / IC (as done in the loop below).
3807 // Select maximum power of two which doesn't exceed TC / IC. If VF is
3808 // scalable, we only fall back on a fixed VF when the TC is less than or
3809 // equal to the known number of lanes.
3810 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount / IC);
3811 if (ClampedUpperTripCount == 0)
3812 ClampedUpperTripCount = 1;
3813 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3814 "exceeding the constant trip count"
3815 << (UserIC > 0 ? " divided by UserIC" : "") << ": "
3816 << ClampedUpperTripCount << "\n");
3817 return ElementCount::get(MinVal: ClampedUpperTripCount,
3818 Scalable: FoldTailByMasking ? VF.isScalable() : false);
3819 }
3820 return VF;
3821}
3822
3823ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3824 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3825 ElementCount MaxSafeVF, unsigned UserIC, bool FoldTailByMasking) {
3826 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3827 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3828 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3829 : TargetTransformInfo::RGK_FixedWidthVector);
3830
3831 // Convenience function to return the minimum of two ElementCounts.
3832 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3833 assert((LHS.isScalable() == RHS.isScalable()) &&
3834 "Scalable flags must match");
3835 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3836 };
3837
3838 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3839 // Note that both WidestRegister and WidestType may not be a powers of 2.
3840 auto MaxVectorElementCount = ElementCount::get(
3841 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
3842 Scalable: ComputeScalableMaxVF);
3843 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3844 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3845 << (MaxVectorElementCount * WidestType) << " bits.\n");
3846
3847 if (!MaxVectorElementCount) {
3848 LLVM_DEBUG(dbgs() << "LV: The target has no "
3849 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3850 << " vector registers.\n");
3851 return ElementCount::getFixed(MinVal: 1);
3852 }
3853
3854 ElementCount MaxVF = clampVFByMaxTripCount(
3855 VF: MaxVectorElementCount, MaxTripCount, UserIC, FoldTailByMasking);
3856 // If the MaxVF was already clamped, there's no point in trying to pick a
3857 // larger one.
3858 if (MaxVF != MaxVectorElementCount)
3859 return MaxVF;
3860
3861 TargetTransformInfo::RegisterKind RegKind =
3862 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3863 : TargetTransformInfo::RGK_FixedWidthVector;
3864
3865 if (MaxVF.isScalable())
3866 MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF;
3867 else
3868 MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF;
3869
3870 if (useMaxBandwidth(RegKind)) {
3871 auto MaxVectorElementCountMaxBW = ElementCount::get(
3872 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
3873 Scalable: ComputeScalableMaxVF);
3874 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3875
3876 if (ElementCount MinVF =
3877 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
3878 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
3879 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3880 << ") with target's minimum: " << MinVF << '\n');
3881 MaxVF = MinVF;
3882 }
3883 }
3884
3885 MaxVF =
3886 clampVFByMaxTripCount(VF: MaxVF, MaxTripCount, UserIC, FoldTailByMasking);
3887
3888 if (MaxVectorElementCount != MaxVF) {
3889 // Invalidate any widening decisions we might have made, in case the loop
3890 // requires prediction (decided later), but we have already made some
3891 // load/store widening decisions.
3892 invalidateCostModelingDecisions();
3893 }
3894 }
3895 return MaxVF;
3896}
3897
3898bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3899 const VectorizationFactor &B,
3900 const unsigned MaxTripCount,
3901 bool HasTail,
3902 bool IsEpilogue) const {
3903 InstructionCost CostA = A.Cost;
3904 InstructionCost CostB = B.Cost;
3905
3906 // Improve estimate for the vector width if it is scalable.
3907 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3908 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3909 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3910 if (A.Width.isScalable())
3911 EstimatedWidthA *= *VScale;
3912 if (B.Width.isScalable())
3913 EstimatedWidthB *= *VScale;
3914 }
3915
3916 // When optimizing for size choose whichever is smallest, which will be the
3917 // one with the smallest cost for the whole loop. On a tie pick the larger
3918 // vector width, on the assumption that throughput will be greater.
3919 if (CM.CostKind == TTI::TCK_CodeSize)
3920 return CostA < CostB ||
3921 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3922
3923 // Assume vscale may be larger than 1 (or the value being tuned for),
3924 // so that scalable vectorization is slightly favorable over fixed-width
3925 // vectorization.
3926 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost(IsEpilogue) &&
3927 A.Width.isScalable() && !B.Width.isScalable();
3928
3929 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3930 const InstructionCost &RHS) {
3931 return PreferScalable ? LHS <= RHS : LHS < RHS;
3932 };
3933
3934 // To avoid the need for FP division:
3935 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3936 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3937 if (!MaxTripCount)
3938 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3939
3940 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3941 InstructionCost VectorCost,
3942 InstructionCost ScalarCost) {
3943 // If the trip count is a known (possibly small) constant, the trip count
3944 // will be rounded up to an integer number of iterations under
3945 // FoldTailByMasking. The total cost in that case will be
3946 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3947 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3948 // some extra overheads, but for the purpose of comparing the costs of
3949 // different VFs we can use this to compare the total loop-body cost
3950 // expected after vectorization.
3951 if (HasTail)
3952 return VectorCost * (MaxTripCount / VF) +
3953 ScalarCost * (MaxTripCount % VF);
3954 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
3955 };
3956
3957 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3958 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3959 return CmpFn(RTCostA, RTCostB);
3960}
3961
3962bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3963 const VectorizationFactor &B,
3964 bool HasTail,
3965 bool IsEpilogue) const {
3966 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
3967 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount, HasTail,
3968 IsEpilogue);
3969}
3970
/// Emit one optimization remark per recipe whose cost is invalid at some VF,
/// listing all offending VFs for that recipe together.
void LoopVectorizationPlanner::emitInvalidCostRemarks(
    OptimizationRemarkEmitter *ORE) {
  using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
  // Collect every (recipe, VF) pair whose cost query is invalid, across all
  // candidate plans.
  SmallVector<RecipeVFPair> InvalidCosts;
  for (const auto &Plan : VPlans) {
    for (ElementCount VF : Plan->vectorFactors()) {
      // The VPlan-based cost model is designed for computing vector cost.
      // Querying VPlan-based cost model with a scalar VF will cause some
      // errors because we expect the VF is vector for most of the widen
      // recipes.
      if (VF.isScalar())
        continue;

      VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      precomputeCosts(Plan&: *Plan, VF, CostCtx);
      auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
        for (auto &R : *VPBB) {
          if (!R.cost(VF, Ctx&: CostCtx).isValid())
            InvalidCosts.emplace_back(Args: &R, Args&: VF);
        }
      }
    }
  }
  if (InvalidCosts.empty())
    return;

  // Emit a report of VFs with invalid costs in the loop.

  // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
  // Each distinct recipe gets a sequence number in order of first appearance.
  DenseMap<VPRecipeBase *, unsigned> Numbering;
  unsigned I = 0;
  for (auto &Pair : InvalidCosts)
    if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
      ++I;

  // Sort the list, first on recipe(number) then on VF.
  sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
    unsigned NA = Numbering[A.first];
    unsigned NB = Numbering[B.first];
    if (NA != NB)
      return NA < NB;
    return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
  });

  // For a list of ordered recipe-VF pairs:
  //   [(load, VF1), (load, VF2), (store, VF1)]
  // group the recipes together to emit separate remarks for:
  //   load  (VF1, VF2)
  //   store (VF1)
  // Subset is a growing window over the front of Tail covering pairs that all
  // refer to the same recipe.
  auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
  auto Subset = ArrayRef<RecipeVFPair>();
  do {
    if (Subset.empty())
      Subset = Tail.take_front(N: 1);

    VPRecipeBase *R = Subset.front().first;

    // Map the recipe to the IR opcode it stands for, for remark text.
    unsigned Opcode =
        TypeSwitch<const VPRecipeBase *, unsigned>(R)
            .Case(caseFn: [](const VPHeaderPHIRecipe *R) { return Instruction::PHI; })
            .Case(
                caseFn: [](const VPWidenStoreRecipe *R) { return Instruction::Store; })
            .Case(caseFn: [](const VPWidenLoadRecipe *R) { return Instruction::Load; })
            .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
                caseFn: [](const auto *R) { return Instruction::Call; })
            .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
                  VPWidenCastRecipe>(
                caseFn: [](const auto *R) { return R->getOpcode(); })
            .Case(caseFn: [](const VPInterleaveRecipe *R) {
              return R->getStoredValues().empty() ? Instruction::Load
                                                  : Instruction::Store;
            })
            .Case(caseFn: [](const VPReductionRecipe *R) {
              return RecurrenceDescriptor::getOpcode(Kind: R->getRecurrenceKind());
            });

    // If the next recipe is different, or if there are no other pairs,
    // emit a remark for the collated subset. e.g.
    //   [(load, VF1), (load, VF2))]
    // to emit:
    //   remark: invalid costs for 'load' at VF=(VF1, VF2)
    if (Subset == Tail || Tail[Subset.size()].first != R) {
      std::string OutString;
      raw_string_ostream OS(OutString);
      assert(!Subset.empty() && "Unexpected empty range");
      OS << "Recipe with invalid costs prevented vectorization at VF=(";
      for (const auto &Pair : Subset)
        OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
      OS << "):";
      if (Opcode == Instruction::Call) {
        // Resolve a human-readable callee name for call-like recipes.
        StringRef Name = "";
        if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
          Name = Int->getIntrinsicName();
        } else {
          auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
          Function *CalledFn =
              WidenCall ? WidenCall->getCalledScalarFunction()
                        : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
                                             ->getLiveInIRValue());
          Name = CalledFn->getName();
        }
        OS << " call to " << Name;
      } else
        OS << " " << Instruction::getOpcodeName(Opcode);
      reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
                              DL: R->getDebugLoc());
      // Drop the emitted group and start collecting the next one.
      Tail = Tail.drop_front(N: Subset.size());
      Subset = {};
    } else
      // Grow the subset by one element
      Subset = Tail.take_front(N: Subset.size() + 1);
  } while (!Tail.empty());
}
4086
/// Check if any recipe of \p Plan will generate a vector value, which will be
/// assigned a vector register. Used to reject VFs whose plans would only ever
/// produce scalar (or scalarized) values.
static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
                                const TargetTransformInfo &TTI) {
  assert(VF.isVector() && "Checking a scalar VF?");
  VPTypeAnalysis TypeInfo(Plan);
  DenseSet<VPRecipeBase *> EphemeralRecipes;
  collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
  // Set of already visited types; each scalar result type only needs to be
  // checked against TTI once.
  DenseSet<Type *> Visited;
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
           Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
    for (VPRecipeBase &R : *VPBB) {
      // Ephemeral recipes (e.g. those only feeding assumes) don't count.
      if (EphemeralRecipes.contains(V: &R))
        continue;
      // Continue early if the recipe is considered to not produce a vector
      // result. Note that this includes VPInstruction where some opcodes may
      // produce a vector, to preserve existing behavior as VPInstructions model
      // aspects not directly mapped to existing IR instructions.
      switch (R.getVPRecipeID()) {
      case VPRecipeBase::VPDerivedIVSC:
      case VPRecipeBase::VPScalarIVStepsSC:
      case VPRecipeBase::VPReplicateSC:
      case VPRecipeBase::VPInstructionSC:
      case VPRecipeBase::VPCanonicalIVPHISC:
      case VPRecipeBase::VPCurrentIterationPHISC:
      case VPRecipeBase::VPVectorPointerSC:
      case VPRecipeBase::VPVectorEndPointerSC:
      case VPRecipeBase::VPExpandSCEVSC:
      case VPRecipeBase::VPPredInstPHISC:
      case VPRecipeBase::VPBranchOnMaskSC:
        continue;
      // The recipes below may produce a vector value; their result (or stored
      // value) type is inspected further down.
      case VPRecipeBase::VPReductionSC:
      case VPRecipeBase::VPActiveLaneMaskPHISC:
      case VPRecipeBase::VPWidenCallSC:
      case VPRecipeBase::VPWidenCanonicalIVSC:
      case VPRecipeBase::VPWidenCastSC:
      case VPRecipeBase::VPWidenGEPSC:
      case VPRecipeBase::VPWidenIntrinsicSC:
      case VPRecipeBase::VPWidenSC:
      case VPRecipeBase::VPBlendSC:
      case VPRecipeBase::VPFirstOrderRecurrencePHISC:
      case VPRecipeBase::VPHistogramSC:
      case VPRecipeBase::VPWidenPHISC:
      case VPRecipeBase::VPWidenIntOrFpInductionSC:
      case VPRecipeBase::VPWidenPointerInductionSC:
      case VPRecipeBase::VPReductionPHISC:
      case VPRecipeBase::VPInterleaveEVLSC:
      case VPRecipeBase::VPInterleaveSC:
      case VPRecipeBase::VPWidenLoadEVLSC:
      case VPRecipeBase::VPWidenLoadSC:
      case VPRecipeBase::VPWidenStoreEVLSC:
      case VPRecipeBase::VPWidenStoreSC:
        break;
      default:
        llvm_unreachable("unhandled recipe");
      }

      // Returns true if the widened form of VectorTy will occupy an actual
      // vector register on the target.
      auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
        unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
        if (!NumLegalParts)
          return false;
        if (VF.isScalable()) {
          // <vscale x 1 x iN> is assumed to be profitable over iN because
          // scalable registers are a distinct register class from scalar
          // ones. If we ever find a target which wants to lower scalable
          // vectors back to scalars, we'll need to update this code to
          // explicitly ask TTI about the register class uses for each part.
          return NumLegalParts <= VF.getKnownMinValue();
        }
        // Two or more elements that share a register - are vectorized.
        return NumLegalParts < VF.getFixedValue();
      };

      // If no def nor is a store, e.g., branches, continue - no value to check.
      if (R.getNumDefinedValues() == 0 &&
          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveBase>(Val: &R))
        continue;
      // For multi-def recipes, currently only interleaved loads, suffice to
      // check first def only.
      // For stores check their stored value; for interleaved stores suffice
      // the check first stored value only. In all cases this is the second
      // operand.
      VPValue *ToCheck =
          R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
      Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
      if (!Visited.insert(V: {ScalarTy}).second)
        continue;
      Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
      if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
        return true;
    }
  }

  return false;
}
4183
4184static bool hasReplicatorRegion(VPlan &Plan) {
4185 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4186 G: Plan.getVectorLoopRegion()->getEntry())),
4187 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4188}
4189
#ifndef NDEBUG
/// Select the most profitable vectorization factor among the candidate VFs of
/// all built VPlans, comparing each candidate's cost against the scalar loop
/// and against the best candidate found so far.
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
  // Cost of the loop at VF=1, the baseline every vector VF competes with.
  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(
      any_of(VPlans,
             [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
      "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
                                       ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization &&
      (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is only computed when at least one of the plan's VFs
    // requires the register-pressure check.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (any_of(VFs, [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The cost for scalar VF=1 is already calculated, so ignore it.
      if (VF.isScalar())
        continue;

      // If the register pressure needs to be considered for VF,
      // don't consider the VF as valid if it exceeds the number
      // of registers for the target.
      if (CM.shouldConsiderRegPressureForVF(VF) &&
          RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs))
        continue;

      InstructionCost C = CM.expectedCost(VF);

      // Add on other costs that are modelled in VPlan, but not in the legacy
      // cost model.
      VPCostContext CostCtx(CM.TTI, *CM.TLI, *P, CM, CM.CostKind, CM.PSE,
                            OrigLoop);
      VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
      assert(VectorRegion && "Expected to have a vector region!");
      for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
               vp_depth_first_shallow(VectorRegion->getEntry()))) {
        for (VPRecipeBase &R : *VPBB) {
          auto *VPI = dyn_cast<VPInstruction>(&R);
          if (!VPI)
            continue;
          switch (VPI->getOpcode()) {
          // Selects are only modelled in the legacy cost model for safe
          // divisors.
          case Instruction::Select: {
            if (auto *WR =
                    dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
              switch (WR->getOpcode()) {
              case Instruction::UDiv:
              case Instruction::SDiv:
              case Instruction::URem:
              case Instruction::SRem:
                // Already accounted for by the legacy model; skip.
                continue;
              default:
                break;
              }
            }
            C += VPI->cost(VF, CostCtx);
            break;
          }
          case VPInstruction::ActiveLaneMask: {
            // The mask may cover Multiplier x VF lanes; cost it at that width.
            unsigned Multiplier =
                cast<VPConstantInt>(VPI->getOperand(2))->getZExtValue();
            C += VPI->cost(VF * Multiplier, CostCtx);
            break;
          }
          case VPInstruction::ExplicitVectorLength:
          case VPInstruction::AnyOf:
            C += VPI->cost(VF, CostCtx);
            break;
          default:
            break;
          }
        }
      }

      VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
      unsigned Width =
          estimateElementCount(Candidate.Width, CM.getVScaleForTuning());
      LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
                        << " costs: " << (Candidate.Cost / Width));
      if (VF.isScalable())
        LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                          << CM.getVScaleForTuning().value_or(1) << ")");
      LLVM_DEBUG(dbgs() << ".\n");

      if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }

      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
        ChosenFactor = Candidate;
    }
  }

  if (!EnableCondStoresVectorization && CM.hasPredStores()) {
    reportVectorizationFailure(
        "There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, OrigLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 !isMoreProfitable(ChosenFactor, ScalarCost,
                                   !CM.foldTailByMasking())) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  return ChosenFactor;
}
#endif
4333
4334/// Returns true if the VPlan contains a VPReductionPHIRecipe with
4335/// FindLast recurrence kind.
4336static bool hasFindLastReductionPhi(VPlan &Plan) {
4337 return any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4338 P: [](VPRecipeBase &R) {
4339 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4340 return RedPhi &&
4341 RecurrenceDescriptor::isFindLastRecurrenceKind(
4342 Kind: RedPhi->getRecurrenceKind());
4343 });
4344}
4345
4346/// Returns true if the VPlan contains header phi recipes that are not currently
4347/// supported for epilogue vectorization.
4348static bool hasUnsupportedHeaderPhiRecipe(VPlan &Plan) {
4349 return any_of(
4350 Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
4351 P: [](VPRecipeBase &R) {
4352 if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: &R))
4353 return !WidenInd->getPHINode();
4354 auto *RedPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R);
4355 return RedPhi && (RecurrenceDescriptor::isFindLastRecurrenceKind(
4356 Kind: RedPhi->getRecurrenceKind()) ||
4357 !RedPhi->getUnderlyingValue());
4358 });
4359}
4360
4361bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4362 ElementCount VF) const {
4363 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4364 // reductions need special handling and are currently unsupported.
4365 if (any_of(Range: OrigLoop->getHeader()->phis(), P: [&](PHINode &Phi) {
4366 if (!Legal->isReductionVariable(PN: &Phi))
4367 return Legal->isFixedOrderRecurrence(Phi: &Phi);
4368 RecurKind Kind =
4369 Legal->getRecurrenceDescriptor(PN: &Phi).getRecurrenceKind();
4370 return RecurrenceDescriptor::isFPMinMaxNumRecurrenceKind(Kind);
4371 }))
4372 return false;
4373
4374 // FindLast reductions and inductions without underlying PHI require special
4375 // handling and are currently not supported for epilogue vectorization.
4376 if (hasUnsupportedHeaderPhiRecipe(Plan&: getPlanFor(VF)))
4377 return false;
4378
4379 // Phis with uses outside of the loop require special handling and are
4380 // currently unsupported.
4381 for (const auto &Entry : Legal->getInductionVars()) {
4382 // Look for uses of the value of the induction at the last iteration.
4383 Value *PostInc =
4384 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
4385 for (User *U : PostInc->users())
4386 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4387 return false;
4388 // Look for uses of penultimate value of the induction.
4389 for (User *U : Entry.first->users())
4390 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4391 return false;
4392 }
4393
4394 // Epilogue vectorization code has not been auditted to ensure it handles
4395 // non-latch exits properly. It may be fine, but it needs auditted and
4396 // tested.
4397 // TODO: Add support for loops with an early exit.
4398 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4399 return false;
4400
4401 return true;
4402}
4403
4404bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4405 const ElementCount VF, const unsigned IC) const {
4406 // FIXME: We need a much better cost-model to take different parameters such
4407 // as register pressure, code size increase and cost of extra branches into
4408 // account. For now we apply a very crude heuristic and only consider loops
4409 // with vectorization factors larger than a certain value.
4410
4411 // Allow the target to opt out.
4412 if (!TTI.preferEpilogueVectorization(Iters: VF * IC))
4413 return false;
4414
4415 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4416 ? EpilogueVectorizationMinVF
4417 : TTI.getEpilogueVectorizationMinVF();
4418 return estimateElementCount(VF: VF * IC, VScale: VScaleForTuning) >= MinVFThreshold;
4419}
4420
4421VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4422 const ElementCount MainLoopVF, unsigned IC) {
4423 VectorizationFactor Result = VectorizationFactor::Disabled();
4424 if (!EnableEpilogueVectorization) {
4425 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
4426 return Result;
4427 }
4428
4429 if (!CM.isScalarEpilogueAllowed()) {
4430 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
4431 "epilogue is allowed.\n");
4432 return Result;
4433 }
4434
4435 // Not really a cost consideration, but check for unsupported cases here to
4436 // simplify the logic.
4437 if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
4438 LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
4439 "is not a supported candidate.\n");
4440 return Result;
4441 }
4442
4443 if (EpilogueVectorizationForceVF > 1) {
4444 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
4445 ElementCount ForcedEC = ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
4446 if (hasPlanWithVF(VF: ForcedEC))
4447 return {ForcedEC, 0, 0};
4448
4449 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
4450 "viable.\n");
4451 return Result;
4452 }
4453
4454 if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
4455 LLVM_DEBUG(
4456 dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
4457 return Result;
4458 }
4459
4460 if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
4461 LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
4462 "this loop\n");
4463 return Result;
4464 }
4465
4466 // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4467 // the main loop handles 8 lanes per iteration. We could still benefit from
4468 // vectorizing the epilogue loop with VF=4.
4469 ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4470 MinVal: estimateElementCount(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));
4471
4472 Type *TCType = Legal->getWidestInductionType();
4473 const SCEV *RemainingIterations = nullptr;
4474 unsigned MaxTripCount = 0;
4475 const SCEV *TC = vputils::getSCEVExprForVPValue(
4476 V: getPlanFor(VF: MainLoopVF).getTripCount(), PSE);
4477 assert(!isa<SCEVCouldNotCompute>(TC) && "Trip count SCEV must be computable");
4478 const SCEV *KnownMinTC;
4479 bool ScalableTC = match(S: TC, P: m_scev_c_Mul(Op0: m_SCEV(V&: KnownMinTC), Op1: m_SCEVVScale()));
4480 bool ScalableRemIter = false;
4481 ScalarEvolution &SE = *PSE.getSE();
4482 // Use versions of TC and VF in which both are either scalable or fixed.
4483 if (ScalableTC == MainLoopVF.isScalable()) {
4484 ScalableRemIter = ScalableTC;
4485 RemainingIterations =
4486 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
4487 } else if (ScalableTC) {
4488 const SCEV *EstimatedTC = SE.getMulExpr(
4489 LHS: KnownMinTC,
4490 RHS: SE.getConstant(Ty: TCType, V: CM.getVScaleForTuning().value_or(u: 1)));
4491 RemainingIterations = SE.getURemExpr(
4492 LHS: EstimatedTC, RHS: SE.getElementCount(Ty: TCType, EC: MainLoopVF * IC));
4493 } else
4494 RemainingIterations =
4495 SE.getURemExpr(LHS: TC, RHS: SE.getElementCount(Ty: TCType, EC: EstimatedRuntimeVF * IC));
4496
4497 // No iterations left to process in the epilogue.
4498 if (RemainingIterations->isZero())
4499 return Result;
4500
4501 if (MainLoopVF.isFixed()) {
4502 MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
4503 if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
4504 RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
4505 MaxTripCount = SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
4506 }
4507 LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
4508 << MaxTripCount << "\n");
4509 }
4510
4511 auto SkipVF = [&](const SCEV *VF, const SCEV *RemIter) -> bool {
4512 return SE.isKnownPredicate(Pred: CmpInst::ICMP_UGT, LHS: VF, RHS: RemIter);
4513 };
4514 for (auto &NextVF : ProfitableVFs) {
4515 // Skip candidate VFs without a corresponding VPlan.
4516 if (!hasPlanWithVF(VF: NextVF.Width))
4517 continue;
4518
4519 // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4520 // vectors) or > the VF of the main loop (fixed vectors).
4521 if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
4522 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
4523 (NextVF.Width.isScalable() &&
4524 ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) ||
4525 (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
4526 ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF)))
4527 continue;
4528
4529 // If NextVF is greater than the number of remaining iterations, the
4530 // epilogue loop would be dead. Skip such factors.
4531 // TODO: We should also consider comparing against a scalable
4532 // RemainingIterations when SCEV be able to evaluate non-canonical
4533 // vscale-based expressions.
4534 if (!ScalableRemIter) {
4535 // Handle the case where NextVF and RemainingIterations are in different
4536 // numerical spaces.
4537 ElementCount EC = NextVF.Width;
4538 if (NextVF.Width.isScalable())
4539 EC = ElementCount::getFixed(
4540 MinVal: estimateElementCount(VF: NextVF.Width, VScale: CM.getVScaleForTuning()));
4541 if (SkipVF(SE.getElementCount(Ty: TCType, EC), RemainingIterations))
4542 continue;
4543 }
4544
4545 if (Result.Width.isScalar() ||
4546 isMoreProfitable(A: NextVF, B: Result, MaxTripCount, HasTail: !CM.foldTailByMasking(),
4547 /*IsEpilogue*/ true))
4548 Result = NextVF;
4549 }
4550
4551 if (Result != VectorizationFactor::Disabled())
4552 LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
4553 << Result.Width << "\n");
4554 return Result;
4555}
4556
4557std::pair<unsigned, unsigned>
4558LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4559 unsigned MinWidth = -1U;
4560 unsigned MaxWidth = 8;
4561 const DataLayout &DL = TheFunction->getDataLayout();
4562 // For in-loop reductions, no element types are added to ElementTypesInLoop
4563 // if there are no loads/stores in the loop. In this case, check through the
4564 // reduction variables to determine the maximum width.
4565 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4566 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4567 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4568 // When finding the min width used by the recurrence we need to account
4569 // for casts on the input operands of the recurrence.
4570 MinWidth = std::min(
4571 a: MinWidth,
4572 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4573 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4574 MaxWidth = std::max(a: MaxWidth,
4575 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4576 }
4577 } else {
4578 for (Type *T : ElementTypesInLoop) {
4579 MinWidth = std::min<unsigned>(
4580 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4581 MaxWidth = std::max<unsigned>(
4582 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4583 }
4584 }
4585 return {MinWidth, MaxWidth};
4586}
4587
4588void LoopVectorizationCostModel::collectElementTypesForWidening() {
4589 ElementTypesInLoop.clear();
4590 // For each block.
4591 for (BasicBlock *BB : TheLoop->blocks()) {
4592 // For each instruction in the loop.
4593 for (Instruction &I : BB->instructionsWithoutDebug()) {
4594 Type *T = I.getType();
4595
4596 // Skip ignored values.
4597 if (ValuesToIgnore.count(Ptr: &I))
4598 continue;
4599
4600 // Only examine Loads, Stores and PHINodes.
4601 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4602 continue;
4603
4604 // Examine PHI nodes that are reduction variables. Update the type to
4605 // account for the recurrence type.
4606 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4607 if (!Legal->isReductionVariable(PN))
4608 continue;
4609 const RecurrenceDescriptor &RdxDesc =
4610 Legal->getRecurrenceDescriptor(PN);
4611 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4612 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4613 Ty: RdxDesc.getRecurrenceType()))
4614 continue;
4615 T = RdxDesc.getRecurrenceType();
4616 }
4617
4618 // Examine the stored values.
4619 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4620 T = ST->getValueOperand()->getType();
4621
4622 assert(T->isSized() &&
4623 "Expected the load/store/recurrence type to be sized");
4624
4625 ElementTypesInLoop.insert(Ptr: T);
4626 }
4627 }
4628}
4629
/// Select the interleave (unroll) count to use when executing the loop in
/// \p Plan with vectorization factor \p VF. \p LoopCost is the previously
/// computed cost of the loop body for \p VF, or 0 if it has not been computed
/// yet (e.g. when the user forced the VF). Returns a count >= 1, where 1
/// means "do not interleave".
unsigned
LoopVectorizationPlanner::selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                                InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave tail-folded loops if wide lane masks are requested, as the
  // overhead of multiple instructions to calculate the predicate is likely
  // not beneficial. If a scalar epilogue is not allowed for any other reason,
  // do not interleave.
  if (!CM.isScalarEpilogueAllowed() &&
      !(CM.preferPredicatedLoop() && CM.useWideActiveLaneMask()))
    return 1;

  // Loops with a current-iteration PHI need a variable-length step and cannot
  // be unrolled.
  if (any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPCurrentIterationPHIRecipe>)) {
    LLVM_DEBUG(dbgs() << "LV: Loop requires variable-length step. "
                         "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Plan.hasEarlyExit())
    return 1;

  const bool HasReductions =
      any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
             P: IsaPred<VPReductionPHIRecipe>);

  // FIXME: implement interleaving for FindLast transform correctly.
  if (hasFindLastReductionPhi(Plan))
    return 1;

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    if (VF.isScalar())
      LoopCost = CM.expectedCost(VF);
    else
      LoopCost = cost(Plan, VF);
    assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore: CM.ValuesToIgnore)[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(a: Pair.second, b: 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // Per register class: how many interleaved copies fit without spilling?
  // The final IC is the minimum over all classes.
  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    // Command-line overrides for the register budget, scalar vs. vector.
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Key: Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
                                    MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(a: 1U, b: (MaxLocalUsers - 1)));
    }

    IC = std::min(a: IC, b: TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
  LLVM_DEBUG(dbgs() << "LV: MaxInterleaveFactor for the target is "
                    << MaxInterleaveCount << "\n");

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  auto BestKnownTC = getSmallBestKnownTC(PSE, L: OrigLoop);

  // For fixed length VFs treat a scalable trip count as unknown.
  if (BestKnownTC && (BestKnownTC->isFixed() || VF.isScalable())) {
    // Re-evaluate trip counts and VFs to be in the same numerical space.
    unsigned AvailableTC =
        estimateElementCount(VF: *BestKnownTC, VScale: CM.getVScaleForTuning());
    unsigned EstimatedVF = estimateElementCount(VF, VScale: CM.getVScaleForTuning());

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    if (CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()))
      --AvailableTC;

    unsigned InterleaveCountLB = bit_floor(Value: std::max(
        a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));

    if (getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(Value: std::max(
          a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce same scalar tail, maximize the IC to do the same work
        // in fewer vector loop iterations
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(a: 1u, b: IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(Range: OrigLoop->blocks(), P: [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleave =
      TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
                                   Value: SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = 0;
    unsigned NumLoads = 0;
    // Count memory operations recipe-by-recipe; interleave groups contribute
    // one operation per member, histograms count as both a load and a store.
    for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
             Range: vp_depth_first_deep(G: Plan.getVectorLoopRegion()->getEntry()))) {
      for (VPRecipeBase &R : *VPBB) {
        if (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(Val: &R)) {
          NumLoads++;
          continue;
        }
        if (isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(Val: &R)) {
          NumStores++;
          continue;
        }

        if (auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(Val: &R)) {
          if (unsigned StoreOps = InterleaveR->getNumStoreOperands())
            NumStores += StoreOps;
          else
            NumLoads += InterleaveR->getNumDefinedValues();
          continue;
        }
        if (auto *RepR = dyn_cast<VPReplicateRecipe>(Val: &R)) {
          NumLoads += isa<LoadInst>(Val: RepR->getUnderlyingInstr());
          NumStores += isa<StoreInst>(Val: RepR->getUnderlyingInstr());
          continue;
        }
        if (isa<VPHistogramRecipe>(Val: &R)) {
          NumLoads++;
          NumStores++;
          continue;
        }
      }
    }
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
               P: [](VPRecipeBase &R) {
                 auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
                 return RedR && (RecurrenceDescriptor::isAnyOfRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()) ||
                                 RecurrenceDescriptor::isFindIVRecurrenceKind(
                                     Kind: RedR->getRecurrenceKind()));
               });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && OrigLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Range: Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
                 P: [](VPRecipeBase &R) {
                   auto *RedR = dyn_cast<VPReductionPHIRecipe>(Val: &R);

                   return RedR && RedR->isOrdered();
                 });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(a: SmallIC, b: F);
      StoresIC = std::min(a: StoresIC, b: F);
      LoadsIC = std::min(a: LoadsIC, b: F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(a: StoresIC, b: LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleave) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(a: IC / 2, b: SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleave) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4971
4972bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4973 ElementCount VF) {
4974 // TODO: Cost model for emulated masked load/store is completely
4975 // broken. This hack guides the cost model to use an artificially
4976 // high enough value to practically disable vectorization with such
4977 // operations, except where previously deployed legality hack allowed
4978 // using very low cost values. This is to avoid regressions coming simply
4979 // from moving "masked load/store" check from legality to cost model.
4980 // Masked Load/Gather emulation was previously never allowed.
4981 // Limited number of Masked Store/Scatter emulation was allowed.
4982 assert((isPredicatedInst(I)) &&
4983 "Expecting a scalar emulated instruction");
4984 return isa<LoadInst>(Val: I) ||
4985 (isa<StoreInst>(Val: I) &&
4986 NumPredStores > NumberOfStoresToPredicate);
4987}
4988
4989void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
4990 assert(VF.isVector() && "Expected VF >= 2");
4991
4992 // If we've already collected the instructions to scalarize or the predicated
4993 // BBs after vectorization, there's nothing to do. Collection may already have
4994 // occurred if we have a user-selected VF and are now computing the expected
4995 // cost for interleaving.
4996 if (InstsToScalarize.contains(Key: VF) ||
4997 PredicatedBBsAfterVectorization.contains(Val: VF))
4998 return;
4999
5000 // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
5001 // not profitable to scalarize any instructions, the presence of VF in the
5002 // map will indicate that we've analyzed it already.
5003 ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5004
5005 // Find all the instructions that are scalar with predication in the loop and
5006 // determine if it would be better to not if-convert the blocks they are in.
5007 // If so, we also record the instructions to scalarize.
5008 for (BasicBlock *BB : TheLoop->blocks()) {
5009 if (!blockNeedsPredicationForAnyReason(BB))
5010 continue;
5011 for (Instruction &I : *BB)
5012 if (isScalarWithPredication(I: &I, VF)) {
5013 ScalarCostsTy ScalarCosts;
5014 // Do not apply discount logic for:
5015 // 1. Scalars after vectorization, as there will only be a single copy
5016 // of the instruction.
5017 // 2. Scalable VF, as that would lead to invalid scalarization costs.
5018 // 3. Emulated masked memrefs, if a hacked cost is needed.
5019 if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
5020 !useEmulatedMaskMemRefHack(I: &I, VF) &&
5021 computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
5022 for (const auto &[I, IC] : ScalarCosts)
5023 ScalarCostsVF.insert(KV: {I, IC});
5024 // Check if we decided to scalarize a call. If so, update the widening
5025 // decision of the call to CM_Scalarize with the computed scalar cost.
5026 for (const auto &[I, Cost] : ScalarCosts) {
5027 auto *CI = dyn_cast<CallInst>(Val: I);
5028 if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
5029 continue;
5030 CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
5031 CallWideningDecisions[{CI, VF}].Cost = Cost;
5032 }
5033 }
5034 // Remember that BB will remain after vectorization.
5035 PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
5036 for (auto *Pred : predecessors(BB)) {
5037 if (Pred->getSingleSuccessor() == BB)
5038 PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
5039 }
5040 }
5041 }
5042}
5043
/// Compute the cost discount obtained by scalarizing the single-use
/// expression chain feeding the predicated instruction \p PredInst under
/// vectorization factor \p VF. A non-negative result means the vector form
/// costs at least as much, so scalarizing is beneficial; the per-instruction
/// scalar costs are recorded in \p ScalarCosts. Note the use of
/// VF.getFixedValue() below — callers are expected to pass a fixed-width VF
/// (collectInstsToScalarize excludes scalable VFs before calling this).
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto CanBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Key: I))
      continue;

    // Cannot scalarize fixed-order recurrence phis at the moment.
    if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF);

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() * getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
      for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
        ScalarCost += TTI.getScalarizationOverhead(
            Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
            /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      // One phi per scalarized lane to merge the predicated results.
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
          for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
            ScalarCost += TTI.getScalarizationOverhead(
                Ty: cast<VectorType>(Val: VectorTy),
                DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()), /*Insert*/ false,
                /*Extract*/ true, CostKind);
          }
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5165
5166InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
5167 InstructionCost Cost;
5168
5169 // If the vector loop gets executed exactly once with the given VF, ignore the
5170 // costs of comparison and induction instructions, as they'll get simplified
5171 // away.
5172 SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
5173 auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
5174 if (TC == VF && !foldTailByMasking())
5175 addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
5176 InstsToIgnore&: ValuesToIgnoreForVF);
5177
5178 // For each block.
5179 for (BasicBlock *BB : TheLoop->blocks()) {
5180 InstructionCost BlockCost;
5181
5182 // For each instruction in the old loop.
5183 for (Instruction &I : BB->instructionsWithoutDebug()) {
5184 // Skip ignored values.
5185 if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
5186 (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
5187 continue;
5188
5189 InstructionCost C = getInstructionCost(I: &I, VF);
5190
5191 // Check if we should override the cost.
5192 if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0) {
5193 // For interleave groups, use ForceTargetInstructionCost once for the
5194 // whole group.
5195 if (VF.isVector() && getWideningDecision(I: &I, VF) == CM_Interleave) {
5196 if (getInterleavedAccessGroup(Instr: &I)->getInsertPos() == &I)
5197 C = InstructionCost(ForceTargetInstructionCost);
5198 else
5199 C = InstructionCost(0);
5200 } else {
5201 C = InstructionCost(ForceTargetInstructionCost);
5202 }
5203 }
5204
5205 BlockCost += C;
5206 LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5207 << VF << " For instruction: " << I << '\n');
5208 }
5209
5210 // If we are vectorizing a predicated block, it will have been
5211 // if-converted. This means that the block's instructions (aside from
5212 // stores and instructions that may divide by zero) will now be
5213 // unconditionally executed. For the scalar case, we may not always execute
5214 // the predicated block, if it is an if-else block. Thus, scale the block's
5215 // cost by the probability of executing it.
5216 // getPredBlockCostDivisor will return 1 for blocks that are only predicated
5217 // by the header mask when folding the tail.
5218 if (VF.isScalar())
5219 BlockCost /= getPredBlockCostDivisor(CostKind, BB);
5220
5221 Cost += BlockCost;
5222 }
5223
5224 return Cost;
5225}
5226
5227/// Gets the address access SCEV for Ptr, if it should be used for cost modeling
5228/// according to isAddressSCEVForCost.
5229///
5230/// This SCEV can be sent to the Target in order to estimate the address
5231/// calculation cost.
5232static const SCEV *getAddressAccessSCEV(
5233 Value *Ptr,
5234 PredicatedScalarEvolution &PSE,
5235 const Loop *TheLoop) {
5236 const SCEV *Addr = PSE.getSCEV(V: Ptr);
5237 return vputils::isAddressSCEVForCost(Addr, SE&: *PSE.getSE(), L: TheLoop) ? Addr
5238 : nullptr;
5239}
5240
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
                                                        ElementCount VF) {
  assert(VF.isVector() &&
         "Scalarization cost of instruction implies vectorization.");
  // There is no mechanism yet to emit a scalarization loop for scalable
  // vectors, so the cost cannot be computed.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *SE = PSE.getSE();

  unsigned AS = getLoadStoreAddressSpace(I);
  Value *Ptr = getLoadStorePointerOperand(V: I);
  Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  // that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time
  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, PSE, TheLoop);

  // Get the cost of the scalar memory instruction and address computation.
  // Every lane computes its own address, hence the VF multiplier.
  InstructionCost Cost = VF.getFixedValue() * TTI.getAddressComputationCost(
                                                  PtrTy, SE, Ptr: PtrSCEV, CostKind);

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.
  const Align Alignment = getLoadStoreAlignment(I);
  TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
  // One scalar load/store is issued per lane.
  Cost += VF.getFixedValue() *
          TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy->getScalarType(), Alignment,
                              AddressSpace: AS, CostKind, OpdInfo: OpInfo);

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.
  if (isPredicatedInst(I)) {
    Cost /= getPredBlockCostDivisor(CostKind, BB: I->getParent());

    // Add the cost of an i1 extract and a branch
    auto *VecI1Ty =
        VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
    Cost += TTI.getScalarizationOverhead(
        Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
        /*Insert=*/false, /*Extract=*/true, CostKind);
    Cost += TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.
      Cost = 3000000;
  }

  return Cost;
}
5300
5301InstructionCost
5302LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5303 ElementCount VF) {
5304 Type *ValTy = getLoadStoreType(I);
5305 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5306 Value *Ptr = getLoadStorePointerOperand(V: I);
5307 unsigned AS = getLoadStoreAddressSpace(I);
5308 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5309
5310 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5311 "Stride should be 1 or -1 for consecutive memory access");
5312 const Align Alignment = getLoadStoreAlignment(I);
5313 InstructionCost Cost = 0;
5314 if (Legal->isMaskRequired(I)) {
5315 unsigned IID = I->getOpcode() == Instruction::Load
5316 ? Intrinsic::masked_load
5317 : Intrinsic::masked_store;
5318 Cost += TTI.getMemIntrinsicInstrCost(
5319 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Alignment, AS), CostKind);
5320 } else {
5321 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5322 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5323 CostKind, OpdInfo: OpInfo, I);
5324 }
5325
5326 bool Reverse = ConsecutiveStride < 0;
5327 if (Reverse)
5328 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5329 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5330 return Cost;
5331}
5332
5333InstructionCost
5334LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5335 ElementCount VF) {
5336 assert(Legal->isUniformMemOp(*I, VF));
5337
5338 Type *ValTy = getLoadStoreType(I);
5339 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5340 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5341 const Align Alignment = getLoadStoreAlignment(I);
5342 unsigned AS = getLoadStoreAddressSpace(I);
5343 if (isa<LoadInst>(Val: I)) {
5344 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5345 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5346 CostKind) +
5347 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5348 SrcTy: VectorTy, Mask: {}, CostKind);
5349 }
5350 StoreInst *SI = cast<StoreInst>(Val: I);
5351
5352 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5353 // TODO: We have existing tests that request the cost of extracting element
5354 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5355 // the actual generated code, which involves extracting the last element of
5356 // a scalable vector where the lane to extract is unknown at compile time.
5357 InstructionCost Cost =
5358 TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5359 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS, CostKind);
5360 if (!IsLoopInvariantStoreValue)
5361 Cost += TTI.getIndexedVectorInstrCostFromEnd(Opcode: Instruction::ExtractElement,
5362 Val: VectorTy, CostKind, Index: 0);
5363 return Cost;
5364}
5365
5366InstructionCost
5367LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5368 ElementCount VF) {
5369 Type *ValTy = getLoadStoreType(I);
5370 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5371 const Align Alignment = getLoadStoreAlignment(I);
5372 Value *Ptr = getLoadStorePointerOperand(V: I);
5373 Type *PtrTy = Ptr->getType();
5374
5375 if (!Legal->isUniform(V: Ptr, VF))
5376 PtrTy = toVectorTy(Scalar: PtrTy, EC: VF);
5377
5378 unsigned IID = I->getOpcode() == Instruction::Load
5379 ? Intrinsic::masked_gather
5380 : Intrinsic::masked_scatter;
5381 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5382 TTI.getMemIntrinsicInstrCost(
5383 MICA: MemIntrinsicCostAttributes(IID, VectorTy, Ptr,
5384 Legal->isMaskRequired(I), Alignment, I),
5385 CostKind);
5386}
5387
5388InstructionCost
5389LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5390 ElementCount VF) {
5391 const auto *Group = getInterleavedAccessGroup(Instr: I);
5392 assert(Group && "Fail to get an interleaved access group.");
5393
5394 Instruction *InsertPos = Group->getInsertPos();
5395 Type *ValTy = getLoadStoreType(I: InsertPos);
5396 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5397 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
5398
5399 unsigned InterleaveFactor = Group->getFactor();
5400 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
5401
5402 // Holds the indices of existing members in the interleaved group.
5403 SmallVector<unsigned, 4> Indices;
5404 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5405 if (Group->getMember(Index: IF))
5406 Indices.push_back(Elt: IF);
5407
5408 // Calculate the cost of the whole interleaved group.
5409 bool UseMaskForGaps =
5410 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5411 (isa<StoreInst>(Val: I) && !Group->isFull());
5412 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5413 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
5414 Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I),
5415 UseMaskForGaps);
5416
5417 if (Group->isReverse()) {
5418 // TODO: Add support for reversed masked interleaved access.
5419 assert(!Legal->isMaskRequired(I) &&
5420 "Reverse masked interleaved access not supported.");
5421 Cost += Group->getNumMembers() *
5422 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5423 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5424 }
5425 return Cost;
5426}
5427
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                    ElementCount VF,
                                                    Type *Ty) const {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Val: Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  // Walk from I down to the (candidate) reduction root: first through a
  // single-user extend, then through a single-use mul feeding an add.
  Instruction *RetI = I;
  if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
    if (!RetI->hasOneUser())
      return std::nullopt;
    RetI = RetI->user_back();
  }

  if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
  if (!LastChain)
    return std::nullopt;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(Val: ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);

  const RecurrenceDescriptor &RdxDesc =
      Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));

  // Baseline: the cost of the plain (min/max or arithmetic) reduction with no
  // folded extends or muls.
  InstructionCost BaseCost;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
    Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
    BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
                                          FMF: RdxDesc.getFastMathFlags(), CostKind);
  } else {
    BaseCost = TTI.getArithmeticReductionCost(
        Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
  }

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RK == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
                           ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
                           : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));

  // From here on, VectorTy is the vectorized type of I's own operand.
  VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
      match(V: RedOp,
            P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
      match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
      !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Val: Op0);
    auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
    auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);

    // Component costs: two inner extends, the mul, and the outer extend.
    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);

    InstructionCost RedCost = TTI.getMulAccReductionCost(
        IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        CostKind);

    // Prefer the fused mul-acc reduction if it beats the sum of components.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
             !TheLoop->isLoopInvariant(V: RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
    auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
    InstructionCost RedCost = TTI.getExtendedReductionCost(
        Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        FMF: RdxDesc.getFastMathFlags(), CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
             match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
    if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Val: Op0);
      Type *Op0Ty = Op0->getOperand(i: 0)->getType();
      Type *Op1Ty = Op1->getOperand(i: 0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);

      // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
          CostKind);
      // The smaller operand needs an extra extend up to the largest type.
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
            Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
            CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
      // Matched reduce.add(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned: true, RedOpcode: RdxDesc.getOpcode(), ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No fused pattern won: the root gets the base reduction cost; all other
  // instructions in the pattern fall back to the default cost model.
  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
5612
5613InstructionCost
5614LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5615 ElementCount VF) {
5616 // Calculate scalar cost only. Vectorization cost should be ready at this
5617 // moment.
5618 if (VF.isScalar()) {
5619 Type *ValTy = getLoadStoreType(I);
5620 Type *PtrTy = getLoadStorePointerOperand(V: I)->getType();
5621 const Align Alignment = getLoadStoreAlignment(I);
5622 unsigned AS = getLoadStoreAddressSpace(I);
5623
5624 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5625 return TTI.getAddressComputationCost(PtrTy, SE: nullptr, Ptr: nullptr, CostKind) +
5626 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5627 OpdInfo: OpInfo, I);
5628 }
5629 return getWideningCost(I, VF);
5630}
5631
5632InstructionCost
5633LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5634 ElementCount VF) const {
5635
5636 // There is no mechanism yet to create a scalable scalarization loop,
5637 // so this is currently Invalid.
5638 if (VF.isScalable())
5639 return InstructionCost::getInvalid();
5640
5641 if (VF.isScalar())
5642 return 0;
5643
5644 InstructionCost Cost = 0;
5645 Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5646 if (!RetTy->isVoidTy() &&
5647 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5648
5649 TTI::VectorInstrContext VIC = TTI::VectorInstrContext::None;
5650 if (isa<LoadInst>(Val: I))
5651 VIC = TTI::VectorInstrContext::Load;
5652 else if (isa<StoreInst>(Val: I))
5653 VIC = TTI::VectorInstrContext::Store;
5654
5655 for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
5656 Cost += TTI.getScalarizationOverhead(
5657 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5658 /*Insert=*/true, /*Extract=*/false, CostKind,
5659 /*ForPoisonSrc=*/true, VL: {}, VIC);
5660 }
5661 }
5662
5663 // Some targets keep addresses scalar.
5664 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
5665 return Cost;
5666
5667 // Some targets support efficient element stores.
5668 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
5669 return Cost;
5670
5671 // Collect operands to consider.
5672 CallInst *CI = dyn_cast<CallInst>(Val: I);
5673 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5674
5675 // Skip operands that do not require extraction/scalarization and do not incur
5676 // any overhead.
5677 SmallVector<Type *> Tys;
5678 for (auto *V : filterExtractingOperands(Ops, VF))
5679 Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));
5680
5681 TTI::VectorInstrContext OperandVIC = isa<StoreInst>(Val: I)
5682 ? TTI::VectorInstrContext::Store
5683 : TTI::VectorInstrContext::None;
5684 return Cost + TTI.getOperandsScalarizationOverhead(Tys, CostKind, VIC: OperandVIC);
5685}
5686
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // Decide, for every memory instruction in the loop and this VF, how it will
  // be widened (wide op, reverse, interleave, gather/scatter, or scalarize)
  // and record the decision together with its cost.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores.  Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts  and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't necessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF) ?
          getGatherScatterCost(I: &I, VF) : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
                                 : InstructionCost::getInvalid();

        // Choose better solution for the current VF,  Note that Invalid
        // costs compare as maximally large.  If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        const auto *Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      // Gather/scatter and scalarization costs are per-access, so multiply by
      // the number of group members to compare against the group-wide
      // interleave cost.
      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (const auto *Group = getInterleavedAccessGroup(Instr: &I)) {
        if (Decision == CM_Scalarize) {
          // When scalarizing, each member is costed individually.
          for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
            if (auto *I = Group->getMember(Index: Idx)) {
              setWideningDecision(I, VF, W: Decision,
                                  Cost: getMemInstScalarizationCost(I, VF));
            }
          }
        } else {
          setWideningDecision(Grp: Group, VF, W: Decision, Cost);
        }
      } else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // (Transitive closure over in-loop operands, stopping at phis.)
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if (TheLoop->contains(Inst: InstOp) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  auto UpdateMemOpUserCost = [this, VF](LoadInst *LI) {
    // If there are direct memory op users of the newly scalarized load,
    // their cost may have changed because there's no scalarization
    // overhead for the operand. Update it.
    for (User *U : LI->users()) {
      if (!isa<LoadInst, StoreInst>(Val: U))
        continue;
      if (getWideningDecision(I: cast<Instruction>(Val: U), VF) != CM_Scalarize)
        continue;
      setWideningDecision(
          I: cast<Instruction>(Val: U), VF, W: CM_Scalarize,
          Cost: getMemInstScalarizationCost(I: cast<Instruction>(Val: U), VF));
    }
  };
  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (!isPredicatedInst(I) &&
          (Decision == CM_Widen || Decision == CM_Widen_Reverse ||
           (!Legal->isUniformMemOp(I&: *I, VF) && Decision == CM_Scalarize))) {
        // Scalarize a widened load of address or update the cost of a scalar
        // load of an address. The cost is VF scalar loads (no scalarization
        // overhead, since the loaded values feed scalar address computation).
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
        UpdateMemOpUserCost(cast<LoadInst>(Val: I));
      } else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize all members of this interleaved group when any member
        // is used as an address. The address-used load skips scalarization
        // overhead, other members include it.
        for (unsigned Idx = 0; Idx < Group->getFactor(); ++Idx) {
          if (Instruction *Member = Group->getMember(Index: Idx)) {
            InstructionCost Cost =
                AddrDefs.contains(Ptr: Member)
                    ? (VF.getKnownMinValue() *
                       getMemoryInstructionCost(I: Member,
                                                VF: ElementCount::getFixed(MinVal: 1)))
                    : getMemInstScalarizationCost(I: Member, VF);
            setWideningDecision(I: Member, VF, W: CM_Scalarize, Cost);
            UpdateMemOpUserCost(cast<LoadInst>(Val: Member));
          }
        }
      }
    } else {
      // Cannot scalarize fixed-order recurrence phis at the moment.
      if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
        continue;

      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
    }
  }
}
5912
/// Pre-compute and cache the widening decision for every call instruction in
/// the loop at vectorization factor \p VF: scalarize the call, call a vector
/// library variant, or call a vector intrinsic — whichever valid option is
/// cheapest. Results are recorded via setCallWideningDecision.
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF");

  auto ForcedScalar = ForcedScalars.find(Val: VF);
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(Val: &I);

      if (!CI)
        continue;

      // Costs for the three candidate strategies; invalid until computed.
      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(Elt: ArgOp->getType());

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      if (VF.isFixed()) {
        InstructionCost ScalarCallCost =
            TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);

        // Compute costs of unpacking argument values for the scalar calls and
        // packing the return values to a vector.
        InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);
        ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
      } else {
        // There is no point attempting to calculate the scalar cost for a
        // scalable VF as we know it will be Invalid.
        assert(!getScalarizationOverhead(CI, VF).isValid() &&
               "Unexpected valid cost for scalarizing scalable vectors");
        ScalarCost = InstructionCost::getInvalid();
      }

      // Honor ForcedScalars and UniformAfterVectorization decisions.
      // TODO: For calls, it might still be more profitable to widen. Use
      // VPlan-based cost model to compare different options.
      if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                             ForcedScalar->second.contains(Ptr: CI)) ||
                            isUniformAfterVectorization(I: CI, VF))) {
        setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
                                IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
                                Cost: ScalarCost);
        continue;
      }

      bool MaskRequired = Legal->isMaskRequired(I: CI);
      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
        if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
          setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
                                  IID: getVectorIntrinsicIDForCall(CI, TLI),
                                  MaskPos: std::nullopt, Cost: *RedCost);
          continue;
        }

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
                                              L: TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            if (!match(S: SE->getSCEV(V: ScalarParam),
                       P: m_scev_AffineAddRec(
                           Op0: m_SCEV(), Op1: m_scev_SpecificSInt(V: Param.LinearStepOrPos),
                           L: m_SpecificLoop(L: TheLoop))))
              ParamsOk = false;
            break;
          }
          case VFParamKind::GlobalPredicate:
            break;
          default:
            // Any other parameter kind is unsupported by the cost model.
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
        FuncInfo = Info;
        break;
      }

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Start from scalarization and let the cheaper valid alternatives win.
      // Note the use of <= below: on a tie the vector call / intrinsic is
      // preferred over scalarization.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost.isValid() && VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost.isValid() && IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
                              MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
6069
6070bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
6071 if (!Legal->isInvariant(V: Op))
6072 return false;
6073 // Consider Op invariant, if it or its operands aren't predicated
6074 // instruction in the loop. In that case, it is not trivially hoistable.
6075 auto *OpI = dyn_cast<Instruction>(Val: Op);
6076 return !OpI || !TheLoop->contains(Inst: OpI) ||
6077 (!isPredicatedInst(I: OpI) &&
6078 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
6079 all_of(Range: OpI->operands(),
6080 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
6081}
6082
/// Estimate the cost of vectorizing instruction \p I at vectorization factor
/// \p VF, taking into account earlier uniformity, scalarization-profitability
/// and widening decisions. Returns an invalid cost when the instruction
/// cannot be vectorized at this VF.
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      // Cost is the scalar cost replicated once per lane.
      return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
             VF.getKnownMinValue();
  }

  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto *SE = PSE.getSE();

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    // Sanity-check helper: true when only a single scalar copy of I will be
    // emitted (used only in the assertion below).
    [[maybe_unused]] auto HasSingleCopyAfterVectorization =
        [this](Instruction *I, ElementCount VF) -> bool {
      if (VF.isScalar())
        return true;

      auto Scalarized = InstsToScalarize.find(Key: VF);
      assert(Scalarized != InstsToScalarize.end() &&
             "VF not yet analyzed for scalarization profitability");
      return !Scalarized->second.count(Key: I) &&
             llvm::all_of(Range: I->users(), P: [&](User *U) {
               auto *UI = cast<Instruction>(Val: U);
               return !Scalarized->second.count(Key: UI);
             });
    };

    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);

  // Legalization may not be able to split this vector type; bail out.
  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(Tp: VectorTy))
    return InstructionCost::getInvalid();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::UncondBr:
  case Instruction::CondBr: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced by
    // a single branch controlling the loop, so there is no extra overhead from
    // scalarization.
    bool ScalarPredicatedBB = false;
    CondBrInst *BI = dyn_cast<CondBrInst>(Val: I);
    if (VF.isVector() && BI &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (TTI.getScalarizationOverhead(
                  Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
                  /*Insert*/ false, /*Extract*/ true, CostKind) +
              (TTI.getCFInstrCost(Opcode: Instruction::CondBr, CostKind) *
               VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::UncondBr, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    if (VF.isScalar())
      return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
    // A vectorized switch becomes one vector compare per case.
    auto *Switch = cast<SwitchInst>(Val: I);
    return Switch->getNumCases() *
           TTI.getCmpSelInstrCost(
               Opcode: Instruction::ICmp,
               ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
               CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
               VecPred: CmpInst::ICMP_EQ, CostKind);
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Splice mask selecting the last lane of the previous iteration's
      // vector followed by the first VF-1 lanes of the current one.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                DstTy: cast<VectorType>(Val: VectorTy),
                                SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(Val: U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(Key: HeaderUser);
        if (Iter != ReductionVars.end() &&
            RecurrenceDescriptor::isAnyOfRecurrenceKind(
                Kind: Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(C&: Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
                 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }

    // When tail folding with EVL, if the phi is part of an out of loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
        Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
      IntrinsicCostAttributes ICA(
          Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
          {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    if (VF.isVector() && isPredicatedInst(I)) {
      // Choose between scalarizing the div/rem and using a safe divisor,
      // whichever the speculation analysis deems cheaper.
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      InstructionCost MulCost = TTI::TCC_Free;
      ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(C&: I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
          PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
         (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
          PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
        PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
        isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
      // SCEV proved Op2 constant; use the constant for operand-info purposes.
      Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        shouldConsiderInvariant(Op: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);

      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And,
          Ty: VectorTy, CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: {Op0, Op1}, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
                                  Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();

    if (canTruncateToMinimalBitwidth(I, VF)) {
      [[maybe_unused]] Instruction *Op0AsInstruction =
          dyn_cast<Instruction>(Val: I->getOperand(i: 0));
      assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
              MinBWs[I] == MinBWs[Op0AsInstruction]) &&
             "if both the operand and the compare are marked for "
             "truncation, they must have the same bitwidth");
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
    }

    VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(
        Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
        VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free; they disappear after vectorization.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return isPredicatedInst(I) ? TTI::CastContextHint::Masked
                                   : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;

    if (canTruncateToMinimalBitwidth(I, VF)) {
      // If the result type is <= the source type, there will be no extend
      // after truncating the users to the minimal required bitwidth.
      if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
          (I->getOpcode() == Instruction::ZExt ||
           I->getOpcode() == Instruction::SExt))
        return 0;
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(U: I, CostKind);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: RetTy, CostKind);
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
6520
/// Populate ValuesToIgnore / VecValuesToIgnore with instructions that will
/// not generate code in the vectorized loop: ephemeral values, pointer
/// operands of non-insert-position interleave-group members, trivially-dead
/// computations (propagated via a worklist), branches around dead blocks,
/// and reduction/induction cast instructions.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);

  // Worklists seeded below; both grow while being iterated, which is why the
  // loops over them use an index instead of iterators.
  SmallVector<Value *, 4> DeadInterleavePointerOps;
  SmallVector<Value *, 4> DeadOps;

  // If a scalar epilogue is required, users outside the loop won't use
  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
  // that is the case.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
  };

  // Visit blocks in reverse RPO and instructions bottom-up so users are seen
  // before their operands.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
    for (Instruction &I : reverse(C&: *BB)) {
      if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
        continue;

      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps to seed worklist.
      if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
          all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
            return VecValuesToIgnore.contains(Ptr: U) ||
                   ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
          }))
        DeadOps.push_back(Elt: &I);

      // For interleave groups, we only create a pointer for the start of the
      // interleave group. Queue up addresses of group members except the insert
      // position for further processing.
      if (isAccessInterleaved(Instr: &I)) {
        auto *Group = getInterleavedAccessGroup(Instr: &I);
        if (Group->getInsertPos() == &I)
          continue;
        Value *PointerOp = getLoadStorePointerOperand(V: &I);
        DeadInterleavePointerOps.push_back(Elt: PointerOp);
      }

      // Queue branches for analysis. They are dead, if their successors only
      // contain dead instructions.
      if (isa<CondBrInst>(Val: &I))
        DeadOps.push_back(Elt: &I);
    }

  // Mark ops feeding interleave group members as free, if they are only used
  // by other dead computations.
  for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
    if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
          Instruction *UI = cast<Instruction>(Val: U);
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 (!isAccessInterleaved(Instr: UI) ||
                  getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
        }))
      continue;
    VecValuesToIgnore.insert(Ptr: Op);
    // Op's operands may now be dead too; extend the worklist.
    append_range(C&: DeadInterleavePointerOps, R: Op->operands());
  }

  // Mark ops that would be trivially dead and are only used by ignored
  // instructions as free.
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if the block contains only dead instructions. Such blocks will
  // be removed by VPlan-to-VPlan transforms and won't be considered by the
  // VPlan-based cost model, so skip them in the legacy cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(Range&: *BB, P: [this](Instruction &I) {
      return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
             isa<UncondBrInst>(Val: &I);
    });
  };
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<CondBrInst>(Val: Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(i: 0);
      BasicBlock *ElseBB = Br->getSuccessor(i: 1);
      // Don't consider branches leaving the loop for simplification.
      if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      // The branch is dead if both successors are empty, or one is empty and
      // simply falls through to the other (which must not have phis).
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Ptr: Br);
        DeadOps.push_back(Elt: Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Inst: Op) ||
        (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
        any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
        }))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Range: Op->users(),
               P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
      ValuesToIgnore.insert(Ptr: Op);

    VecValuesToIgnore.insert(Ptr: Op);
    append_range(C&: DeadOps, R: Op->operands());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    VecValuesToIgnore.insert_range(R: IndDes.getCastInsts());
  }
}
6656
// Decide, for each reduction PHI in the loop, whether the reduction should be
// performed "in-loop" (accumulated via a chain of scalar/ordered reduction ops
// inside the vector loop body) instead of as a wide out-of-loop reduction.
// Populates InLoopReductions and InLoopReductionImmediateChains, which the
// cost model consults later.
void LoopVectorizationCostModel::collectInLoopReductions() {
  // Avoid duplicating work finding in-loop reductions.
  if (!InLoopReductions.empty())
    return;

  for (const auto &Reduction : Legal->getReductionVars()) {
    PHINode *Phi = Reduction.first;
    const RecurrenceDescriptor &RdxDesc = Reduction.second;

    // Multi-use reductions (e.g., used in FindLastIV patterns) are handled
    // separately and should not be considered for in-loop reductions.
    if (RdxDesc.hasUsesOutsideReductionChain())
      continue;

    // We don't collect reductions that are type promoted (yet).
    if (RdxDesc.getRecurrenceType() != Phi->getType())
      continue;

    // In-loop AnyOf and FindIV reductions are not yet supported.
    RecurKind Kind = RdxDesc.getRecurrenceKind();
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) ||
        RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) ||
        RecurrenceDescriptor::isFindLastRecurrenceKind(Kind))
      continue;

    // If the target would prefer this reduction to happen "in-loop", then we
    // want to record it as such. Ordered (strict FP) reductions and the
    // PreferInLoopReductions flag also force the in-loop form.
    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
        !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
      continue;

    // Check that we can correctly put the reductions into the loop, by
    // finding the chain of operations that leads from the phi to the loop
    // exit value. An empty chain means the in-loop form is not possible.
    SmallVector<Instruction *, 4> ReductionOperations =
        RdxDesc.getReductionOpChain(Phi, L: TheLoop);
    bool InLoop = !ReductionOperations.empty();

    if (InLoop) {
      InLoopReductions.insert(Ptr: Phi);
      // Add the elements to InLoopReductionImmediateChains for cost modelling.
      // Each entry maps a reduction op to its immediate predecessor in the
      // chain, with the PHI as the chain's root.
      Instruction *LastChain = Phi;
      for (auto *I : ReductionOperations) {
        InLoopReductionImmediateChains[I] = LastChain;
        LastChain = I;
      }
    }
    LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
                      << " reduction for phi: " << *Phi << "\n");
  }
}
6708
6709// This function will select a scalable VF if the target supports scalable
6710// vectors and a fixed one otherwise.
6711// TODO: we could return a pair of values that specify the max VF and
6712// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6713// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6714// doesn't have a cost model that can choose which plan to execute if
6715// more than one is generated.
6716static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6717 LoopVectorizationCostModel &CM) {
6718 unsigned WidestType;
6719 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6720
6721 TargetTransformInfo::RegisterKind RegKind =
6722 TTI.enableScalableVectorization()
6723 ? TargetTransformInfo::RGK_ScalableVector
6724 : TargetTransformInfo::RGK_FixedWidthVector;
6725
6726 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6727 unsigned N = RegSize.getKnownMinValue() / WidestType;
6728 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6729}
6730
// Plan vectorization for the VPlan-native (outer-loop) path. Returns a
// VectorizationFactor wrapping the chosen VF (user-provided or computed), or
// Disabled() when no plan could be built or the request is unsupported. Cost
// values are dummies: the native path has no cost model yet.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // A scalable user VF cannot be honored on a target without scalable
      // vectors (unless testing forces it); report and bail out.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target",
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(MinVF: VF, MaxVF: VF);

    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
             "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
6785
// Build VPlans for the candidate vectorization factors of the innermost loop:
// compute the maximum safe fixed/scalable VFs, prime the cost-model state
// (values to ignore, widening decisions, in-loop reductions), then construct
// plans either for the user-provided VF or for all power-of-two candidates.
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  // A user VF is honored only if it does not exceed the maximum safe VF of
  // the matching kind (fixed vs. scalable) and has valid costs.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
      reportVectorizationInfo(
          Msg: "UserVF ignored because it may be larger than the maximal safe VF",
          ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
                              ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates: every power of two up to the
  // maximum fixed and scalable factors.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}
6856
6857InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6858 ElementCount VF) const {
6859 InstructionCost Cost = CM.getInstructionCost(I: UI, VF);
6860 if (Cost.isValid() && ForceTargetInstructionCost.getNumOccurrences())
6861 return InstructionCost(ForceTargetInstructionCost);
6862 return Cost;
6863}
6864
6865bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6866 ElementCount VF) const {
6867 return CM.isUniformAfterVectorization(I, VF);
6868}
6869
6870bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6871 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6872 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6873 SkipCostComputation.contains(Ptr: UI);
6874}
6875
6876unsigned VPCostContext::getPredBlockCostDivisor(BasicBlock *BB) const {
6877 return CM.getPredBlockCostDivisor(CostKind, BB);
6878}
6879
// Pre-compute, via the legacy cost model, the cost of instructions whose
// VPlan recipes would otherwise be costed inaccurately (induction phis and
// increments, exit conditions, branches, forced-scalar and
// profitable-to-scalarize instructions). Each costed instruction is added to
// CostCtx.SkipCostComputation so the VPlan-based walk does not count it twice.
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
    // Worklist walk: the increment plus any single-use, in-loop instruction
    // operands transitively feeding it.
    SmallVector<Instruction *> IVInsts = {IVInc};
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Val: Op);
        if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(Elt: OpI);
      }
    }
    IVInsts.push_back(Elt: IV);
    // Also include truncates of the IV that will be folded into the widened
    // induction recipe.
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(Val: U);
      if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
        continue;
      IVInsts.push_back(Elt: CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
                                           InstsToIgnore&: CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(Ptr: IVInst);
    }
  }

  // Compute the cost of all exiting conditions of the loop using the legacy
  // cost model. This is to match the legacy behavior, which adds the cost of
  // all exit conditions. Note that this over-estimates the cost, as there will
  // be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<CondBrInst>(Val: EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
      ExitInstrs.insert(X: CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(Inst: CondI) ||
        !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    // Grow the worklist with operands used exclusively by exit conditions.
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Val: Op);
      if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
          any_of(Range: OpI->users(), P: [&ExitInstrs](User *U) {
            return !ExitInstrs.contains(key: cast<Instruction>(Val: U));
          }))
        continue;
      ExitInstrs.insert(X: OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Don't apply special costs when instruction cost is forced to make sure the
  // forced cost is used for each recipe.
  if (ForceTargetInstructionCost.getNumOccurrences())
    return Cost;

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}
7025
// Return the total cost of \p Plan at factor \p VF: the legacy-model
// pre-computed costs (inductions, exit conditions, branches, scalarized
// instructions) plus the VPlan-based recipe costs.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Plan, CM, CM.CostKind, PSE, OrigLoop);
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, Ctx&: CostCtx);
#ifndef NDEBUG
  // Debug-only reporting of the estimated cost per lane, using the tuning
  // vscale to estimate the element count for scalable VFs.
  unsigned EstimatedWidth = estimateElementCount(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}
7046
7047#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplification that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  using namespace VPlanPatternMatch;
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  // Check if a select for a safe divisor was hoisted to the pre-header. If so,
  // the select doesn't need to be considered for the vector loop cost; go with
  // the more accurate VPlan-based cost model.
  for (VPRecipeBase &R : *Plan.getVectorPreheader()) {
    auto *VPI = dyn_cast<VPInstruction>(&R);
    if (!VPI || VPI->getOpcode() != Instruction::Select)
      continue;

    if (auto *WR = dyn_cast_or_null<VPWidenRecipe>(VPI->getSingleUser())) {
      switch (WR->getOpcode()) {
      case Instruction::UDiv:
      case Instruction::SDiv:
      case Instruction::URem:
      case Instruction::SRem:
        return true;
      default:
        break;
      }
    }
  }

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      // All members of an interleave group are covered by the group's recipe.
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost it whilst the legacy will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        if (none_of(FOR->users(),
                    match_fn(m_VPInstruction<
                             VPInstruction::FirstOrderRecurrenceSplice>())))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reductions and
      // comparing against the legacy cost isn't desirable.
      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R))
        if (VPR->isPartialReduction())
          return true;

      // The VPlan-based cost model can analyze if recipes are scalar
      // recursively, but the legacy cost model cannot.
      if (auto *WidenMemR = dyn_cast<VPWidenMemoryRecipe>(&R)) {
        auto *AddrI = dyn_cast<Instruction>(
            getLoadStorePointerOperand(&WidenMemR->getIngredient()));
        if (AddrI && vputils::isSingleScalar(WidenMemR->getAddr()) !=
                         CostCtx.isLegacyUniformAfterVectorization(AddrI, VF))
          return true;

        if (WidenMemR->isReverse()) {
          // If the stored value of a reverse store is invariant, LICM will
          // hoist the reverse operation to the preheader. In this case, the
          // result of the VPlan-based cost model will diverge from that of
          // the legacy model.
          if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;

          if (auto *StoreR = dyn_cast<VPWidenStoreEVLRecipe>(WidenMemR))
            if (StoreR->getStoredValue()->isDefinedOutsideLoopRegions())
              return true;
        }
      }

      // The legacy cost model costs non-header phis with a scalar VF as a phi,
      // but scalar unrolled VPlans will have VPBlendRecipes which emit selects.
      if (isa<VPBlendRecipe>(&R) &&
          vputils::onlyFirstLaneUsed(R.getVPSingleValue()))
        return true;

      // If a VPlan transform folded a recipe to one producing a single-scalar,
      // but the original instruction wasn't uniform-after-vectorization in the
      // legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        CmpPredicate Pred;
        if (match(&R, m_Cmp(Pred, m_VPValue(), m_VPValue())) &&
            cast<VPRecipeWithIRFlags>(R).getPredicate() !=
                cast<CmpInst>(UI)->getPredicate())
          return true;

        // Recipes with underlying instructions being moved out of the loop
        // region by LICM may cause discrepancies between the legacy cost model
        // and the VPlan-based cost model.
        if (!VPBB->getEnclosingLoopRegion())
          return true;

        SeenInstrs.insert(UI);
      }
    }
  }

  // If a reverse recipe has been sunk to the middle block (e.g., for a load
  // whose result is only used as a live-out), VPlan avoids the per-iteration
  // reverse shuffle cost that the legacy model accounts for.
  if (any_of(*Plan.getMiddleBlock(), [](const VPRecipeBase &R) {
        return match(&R, m_VPInstruction<VPInstruction::Reverse>());
      }))
    return true;

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may
      // not be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
7196#endif
7197
// Select the most profitable vectorization factor across all built VPlans
// using the VPlan-based cost model, falling back to the scalar factor when no
// vector factor beats it (unless vectorization is forced). In assert builds,
// the decision is cross-checked against the legacy cost model.
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                    << (CM.CostKind == TTI::TCK_RecipThroughput
                            ? "Reciprocal Throughput\n"
                        : CM.CostKind == TTI::TCK_Latency
                            ? "Instruction Latency\n"
                        : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                        : CM.CostKind == TTI::TCK_SizeAndLatency
                            ? "Code Size and Latency\n"
                            : "Unknown\n"));

  ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Only compute register usage when at least one candidate VF requires a
    // register-pressure check.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (any_of(Range&: VFs, P: [this](ElementCount VF) {
          return CM.shouldConsiderRegPressureForVF(VF);
        }))
      RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      InstructionCost Cost = cost(Plan&: *P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

      if (CM.shouldConsiderRegPressureForVF(VF) &&
          RUs[I].exceedsMaxNumRegs(TTI, OverrideMaxNumRegs: ForceTargetNumVectorRegs)) {
        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                          << VF << " because it uses too many registers\n");
        continue;
      }

      if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
        BestFactor = CurrentFactor;

      // If profitable add it to ProfitableVF list.
      if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
        ProfitableVFs.push_back(Elt: CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost-model.
  // This is now only used to verify the decisions by the new VPlan-based
  // cost-model and will be retired once the VPlan-based cost-model is
  // stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause a
  // different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, BestPlan, CM, CM.CostKind, CM.PSE,
                        OrigLoop);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // Verify that the VPlan-based and legacy cost models agree, except for
  // * VPlans with early exits,
  // * VPlans with additional VPlan simplifications,
  // * EVL-based VPlans with gather/scatters (the VPlan-based cost model uses
  //   vp_scatter/vp_gather).
  // The legacy cost model doesn't properly model costs for such loops.
  bool UsesEVLGatherScatter =
      any_of(VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(
                 BestPlan.getVectorLoopRegion()->getEntry())),
             [](VPBasicBlock *VPBB) {
               return any_of(*VPBB, [](VPRecipeBase &R) {
                 return isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R) &&
                        !cast<VPWidenMemoryRecipe>(&R)->isConsecutive();
               });
             });
  assert(
      (BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
       !Legal->getLAI()->getSymbolicStrides().empty() || UsesEVLGatherScatter ||
       planContainsAdditionalSimplifications(
           getPlanFor(BestFactor.Width), CostCtx, OrigLoop, BestFactor.Width) ||
       planContainsAdditionalSimplifications(
           getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
      " VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}
7328
7329DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7330 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7331 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7332 assert(BestVPlan.hasVF(BestVF) &&
7333 "Trying to execute plan with unsupported VF");
7334 assert(BestVPlan.hasUF(BestUF) &&
7335 "Trying to execute plan with unsupported UF");
7336 if (BestVPlan.hasEarlyExit())
7337 ++LoopsEarlyExitVectorized;
7338 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7339 // cost model is complete for better cost estimates.
7340 RUN_VPLAN_PASS(VPlanTransforms::unrollByUF, BestVPlan, BestUF);
7341 RUN_VPLAN_PASS(VPlanTransforms::materializePacksAndUnpacks, BestVPlan);
7342 RUN_VPLAN_PASS(VPlanTransforms::materializeBroadcasts, BestVPlan);
7343 RUN_VPLAN_PASS(VPlanTransforms::replicateByVF, BestVPlan, BestVF);
7344 bool HasBranchWeights =
7345 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
7346 if (HasBranchWeights) {
7347 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7348 RUN_VPLAN_PASS(VPlanTransforms::addBranchWeightToMiddleTerminator,
7349 BestVPlan, BestVF, VScale);
7350 }
7351
7352 // Checks are the same for all VPlans, added to BestVPlan only for
7353 // compactness.
7354 attachRuntimeChecks(Plan&: BestVPlan, RTChecks&: ILV.RTChecks, HasBranchWeights);
7355
7356 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7357 VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());
7358
7359 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7360 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
7361 VPlanTransforms::removeBranchOnConst(Plan&: BestVPlan);
7362 if (BestVPlan.getEntry()->getSingleSuccessor() ==
7363 BestVPlan.getScalarPreheader()) {
7364 // TODO: The vector loop would be dead, should not even try to vectorize.
7365 ORE->emit(RemarkBuilder: [&]() {
7366 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationDead",
7367 OrigLoop->getStartLoc(),
7368 OrigLoop->getHeader())
7369 << "Created vector loop never executes due to insufficient trip "
7370 "count.";
7371 });
7372 return DenseMap<const SCEV *, Value *>();
7373 }
7374
7375 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7376
7377 VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan);
7378 // Convert the exit condition to AVLNext == 0 for EVL tail folded loops.
7379 VPlanTransforms::convertEVLExitCond(Plan&: BestVPlan);
7380 // Regions are dissolved after optimizing for VF and UF, which completely
7381 // removes unneeded loop regions first.
7382 VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
7383 // Expand BranchOnTwoConds after dissolution, when latch has direct access to
7384 // its successors.
7385 VPlanTransforms::expandBranchOnTwoConds(Plan&: BestVPlan);
7386 // Convert loops with variable-length stepping after regions are dissolved.
7387 VPlanTransforms::convertToVariableLengthStep(Plan&: BestVPlan);
7388 VPlanTransforms::materializeBackedgeTakenCount(Plan&: BestVPlan, VectorPH);
7389 VPlanTransforms::materializeVectorTripCount(
7390 Plan&: BestVPlan, VectorPHVPBB: VectorPH, TailByMasking: CM.foldTailByMasking(),
7391 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: BestVF.isVector()), Step: &BestVPlan.getVFxUF());
7392 VPlanTransforms::materializeFactors(Plan&: BestVPlan, VectorPH, VF: BestVF);
7393 VPlanTransforms::cse(Plan&: BestVPlan);
7394 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan);
7395
7396 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7397 // making any changes to the CFG.
7398 DenseMap<const SCEV *, Value *> ExpandedSCEVs =
7399 VPlanTransforms::expandSCEVs(Plan&: BestVPlan, SE&: *PSE.getSE());
7400 if (!ILV.getTripCount()) {
7401 ILV.setTripCount(BestVPlan.getTripCount()->getLiveInIRValue());
7402 } else {
7403 assert(VectorizingEpilogue && "should only re-use the existing trip "
7404 "count during epilogue vectorization");
7405 }
7406
7407 // Perform the actual loop transformation.
7408 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7409 OrigLoop->getParentLoop(),
7410 Legal->getWidestInductionType());
7411
7412#ifdef EXPENSIVE_CHECKS
7413 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7414#endif
7415
7416 // 1. Set up the skeleton for vectorization, including vector pre-header and
7417 // middle block. The vector loop is created during VPlan execution.
7418 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7419 replaceVPBBWithIRVPBB(VPBB: BestVPlan.getScalarPreheader(),
7420 IRBB: State.CFG.PrevBB->getSingleSuccessor(), Plan: &BestVPlan);
7421 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7422
7423 assert(verifyVPlanIsValid(BestVPlan) && "final VPlan is invalid");
7424
7425 // After vectorization, the exit blocks of the original loop will have
7426 // additional predecessors. Invalidate SCEVs for the exit phis in case SE
7427 // looked through single-entry phis.
7428 ScalarEvolution &SE = *PSE.getSE();
7429 for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) {
7430 if (!Exit->hasPredecessors())
7431 continue;
7432 for (VPRecipeBase &PhiR : Exit->phis())
7433 SE.forgetLcssaPhiWithNewPredecessor(L: OrigLoop,
7434 V: &cast<VPIRPhi>(Val&: PhiR).getIRPhi());
7435 }
7436 // Forget the original loop and block dispositions.
7437 SE.forgetLoop(L: OrigLoop);
7438 SE.forgetBlockAndLoopDispositions();
7439
7440 ILV.printDebugTracesAtStart();
7441
7442 //===------------------------------------------------===//
7443 //
7444 // Notice: any optimization or new instruction that go
7445 // into the code below should also be implemented in
7446 // the cost-model.
7447 //
7448 //===------------------------------------------------===//
7449
7450 // Retrieve loop information before executing the plan, which may remove the
7451 // original loop, if it becomes unreachable.
7452 MDNode *LID = OrigLoop->getLoopID();
7453 unsigned OrigLoopInvocationWeight = 0;
7454 std::optional<unsigned> OrigAverageTripCount =
7455 getLoopEstimatedTripCount(L: OrigLoop, EstimatedLoopInvocationWeight: &OrigLoopInvocationWeight);
7456
7457 BestVPlan.execute(State: &State);
7458
7459 // 2.6. Maintain Loop Hints
7460 // Keep all loop hints from the original loop on the vector loop (we'll
7461 // replace the vectorizer-specific hints below).
7462 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
7463 // Add metadata to disable runtime unrolling a scalar loop when there
7464 // are no runtime checks about strides and memory. A scalar loop that is
7465 // rarely used is not worth unrolling.
7466 bool DisableRuntimeUnroll = !ILV.RTChecks.hasChecks() && !BestVF.isScalar();
7467 updateLoopMetadataAndProfileInfo(
7468 VectorLoop: HeaderVPBB ? LI->getLoopFor(BB: State.CFG.VPBB2IRBB.lookup(Val: HeaderVPBB))
7469 : nullptr,
7470 HeaderVPBB, Plan: BestVPlan, VectorizingEpilogue, OrigLoopID: LID, OrigAverageTripCount,
7471 OrigLoopInvocationWeight,
7472 EstimatedVFxUF: estimateElementCount(VF: BestVF * BestUF, VScale: CM.getVScaleForTuning()),
7473 DisableRuntimeUnroll);
7474
7475 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7476 // predication, updating analyses.
7477 ILV.fixVectorizedLoop(State);
7478
7479 ILV.printDebugTracesAtEnd();
7480
7481 return ExpandedSCEVs;
7482}
7483
7484//===--------------------------------------------------------------------===//
7485// EpilogueVectorizerMainLoop
7486//===--------------------------------------------------------------------===//
7487
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// It emits two iteration-count checks in front of the vector preheader: the
/// epilogue check first (so the short path through the vector epilogue stays
/// short), then the main-loop check. Returns the block that will serve as the
/// main vector loop's preheader predecessor.
BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
  BasicBlock *ScalarPH = createScalarPreheader(Prefix: "");
  BasicBlock *VectorPH = ScalarPH->getSinglePredecessor();

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Successor 1 is the branch's false destination, i.e. the block that does
  // not bypass to the scalar preheader; emit the main-loop check there.
  VectorPH = cast<CondBrInst>(Val: EPI.EpilogueIterationCountCheck->getTerminator())
                 ->getSuccessor(i: 1);
  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(VectorPH, Bypass: ScalarPH, ForEpilogue: false);

  // Again take the false (non-bypass) successor of the main-loop check.
  return cast<CondBrInst>(Val: EPI.MainLoopIterationCountCheck->getTerminator())
      ->getSuccessor(i: 1);
}
7514
7515void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7516 LLVM_DEBUG({
7517 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7518 << "Main Loop VF:" << EPI.MainLoopVF
7519 << ", Main Loop UF:" << EPI.MainLoopUF
7520 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7521 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7522 });
7523}
7524
7525void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7526 DEBUG_WITH_TYPE(VerboseDebug, {
7527 dbgs() << "intermediate fn:\n"
7528 << *OrigLoop->getHeader()->getParent() << "\n";
7529 });
7530}
7531
/// Emit a minimum-iteration-count check for either the main vector loop
/// (\p ForEpilogue == false) or the vector epilogue loop
/// (\p ForEpilogue == true) in \p VectorPH, branching to \p Bypass when the
/// check fails. Returns the block containing the check; the actual vector
/// preheader is split off below it.
BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
    BasicBlock *VectorPH, BasicBlock *Bypass, bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block.");
  Value *Count = getTripCount();
  // NOTE(review): MinProfitableTripCount is reset to a fixed 0 here,
  // presumably so only the VF/UF-based minimum is enforced by the check
  // below -- confirm against createIterationCountCheck.
  MinProfitableTripCount = ElementCount::getFixed(MinVal: 0);
  Value *CheckMinIters = createIterationCountCheck(
      VectorPH, VF: ForEpilogue ? EPI.EpilogueVF : EPI.MainLoopVF,
      UF: ForEpilogue ? EPI.EpilogueUF : EPI.MainLoopUF);

  BasicBlock *const TCCheckBlock = VectorPH;
  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  VectorPH = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                        DT: static_cast<DominatorTree *>(nullptr), LI, MSSAU: nullptr,
                        BBName: "vector.ph");
  if (ForEpilogue) {
    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  } else {
    VectorPHVPBB = replaceVPBBWithIRVPBB(VPBB: VectorPHVPBB, IRBB: VectorPH);
  }

  // Replace the split block's unconditional terminator: branch to Bypass when
  // the minimum-iterations check fails, otherwise into the new vector.ph.
  CondBrInst &BI = *CondBrInst::Create(Cond: CheckMinIters, IfTrue: Bypass, IfFalse: VectorPH);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  // When vectorizing the main loop, its trip-count check is placed in a new
  // block, whereas the overall trip-count check is placed in the VPlan entry
  // block. When vectorizing the epilogue loop, its trip-count check is placed
  // in the VPlan entry block.
  if (!ForEpilogue)
    introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock);
  return TCCheckBlock;
}
7571
7572//===--------------------------------------------------------------------===//
7573// EpilogueVectorizerEpilogueLoop
7574//===--------------------------------------------------------------------===//
7575
/// This function creates a new scalar preheader, using the previous one as
/// entry block to the epilogue VPlan. The minimum iteration check is being
/// represented in VPlan. Returns the old scalar preheader, renamed to
/// "vec.epilog.iter.check".
BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
  BasicBlock *NewScalarPH = createScalarPreheader(Prefix: "vec.epilog.");
  BasicBlock *OriginalScalarPH = NewScalarPH->getSinglePredecessor();
  OriginalScalarPH->setName("vec.epilog.iter.check");
  // Wrap the original scalar preheader in a VPIRBasicBlock and migrate all
  // movable recipes from the plan's current entry into it.
  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: OriginalScalarPH);
  VPBasicBlock *OldEntry = Plan.getEntry();
  for (auto &R : make_early_inc_range(Range&: *OldEntry)) {
    // Skip moving VPIRInstructions (including VPIRPhis), which are unmovable
    // by definition: they wrap fixed IR in their home block.
    if (isa<VPIRInstruction>(Val: &R))
      continue;
    R.moveBefore(BB&: *NewEntry, I: NewEntry->end());
  }

  // Rewire successors/predecessors from OldEntry to NewEntry and make the
  // latter the plan's entry block.
  VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
  Plan.setEntry(NewEntry);
  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.

  return OriginalScalarPH;
}
7599
7600void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7601 LLVM_DEBUG({
7602 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7603 << "Epilogue Loop VF:" << EPI.EpilogueVF
7604 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7605 });
7606}
7607
7608void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7609 DEBUG_WITH_TYPE(VerboseDebug, {
7610 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7611 });
7612}
7613
/// Try to build a widened recipe for the load/store \p VPI, clamping \p Range
/// to the VFs for which the cost model chose widening. Returns nullptr when
/// the access should be handled differently (e.g. scalarized).
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI,
                                                VFRange &Range) {
  assert((VPI->getOpcode() == Instruction::Load ||
          VPI->getOpcode() == Instruction::Store) &&
         "Must be called with either a load or store");
  Instruction *I = VPI->getUnderlyingInstr();

  // Widen iff the cost model did not decide to scalarize this access.
  // CM_Interleave counts as "will widen" here; interleave-group members are
  // later replaced by a single VPInterleaveRecipe (see buildVPlans...).
  auto WillWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
    return nullptr;

  // If a mask is not required, drop it - use unmasked version for safe loads.
  // TODO: Determine if mask is needed in VPlan.
  VPValue *Mask = Legal->isMaskRequired(I) ? VPI->getMask() : nullptr;

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive. The decision for Range.Start is representative for
  // the whole (already clamped) range.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // The pointer is operand 0 for loads and operand 1 for stores.
  VPValue *Ptr = VPI->getOpcode() == Instruction::Load ? VPI->getOperand(N: 0)
                                                       : VPI->getOperand(N: 1);
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop: drop the GEP no-wrap flags in this case.
      // Otherwise preserve existing flags without no-unsigned-wrap, as we will
      // emit negative indices.
      GEPNoWrapFlags Flags =
          CM.foldTailByMasking() || !GEP
              ? GEPNoWrapFlags::none()
              : GEP->getNoWrapFlags().withoutNoUnsignedWrap();
      VectorPtr = new VPVectorEndPointerRecipe(
          Ptr, &Plan.getVF(), getLoadStoreType(I),
          /*Stride*/ -1, Flags, VPI->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            VPI->getDebugLoc());
    }
    Builder.insert(R: VectorPtr);
    Ptr = VectorPtr;
  }

  if (VPI->getOpcode() == Instruction::Load) {
    auto *Load = cast<LoadInst>(Val: I);
    auto *LoadR = new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                        *VPI, Load->getDebugLoc());
    if (Reverse) {
      // Reverse accesses load in vector order and then flip the result with an
      // explicit VPInstruction::Reverse.
      Builder.insert(R: LoadR);
      return new VPInstruction(VPInstruction::Reverse, LoadR, {}, {},
                               LoadR->getDebugLoc());
    }
    return LoadR;
  }

  // Stores: for reverse accesses, flip the value to be stored first.
  StoreInst *Store = cast<StoreInst>(Val: I);
  VPValue *StoredVal = VPI->getOperand(N: 0);
  if (Reverse)
    StoredVal = Builder.createNaryOp(Opcode: VPInstruction::Reverse, Operands: StoredVal,
                                     DL: Store->getDebugLoc());
  return new VPWidenStoreRecipe(*Store, Ptr, StoredVal, Mask, Consecutive,
                                Reverse, *VPI, Store->getDebugLoc());
}
7697
/// Try to fold the truncate \p VPI of a widened induction into a new
/// VPWidenIntOrFpInductionRecipe, for the VFs in \p Range where the cost model
/// deems the truncate optimizable. Returns nullptr otherwise.
VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionTruncate(VPInstruction *VPI,
                                                VFRange &Range) {
  auto *I = cast<TruncInst>(Val: VPI->getUnderlyingInstr());
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto IsOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(I: K, VF);
    };
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: IsOptimizableIVTruncate(I), Range))
    return nullptr;

  // The truncate's single operand is expected to be defined by a widened
  // induction recipe; reuse its phi, start value and induction descriptor.
  auto *WidenIV = cast<VPWidenIntOrFpInductionRecipe>(
      Val: VPI->getOperand(N: 0)->getDefiningRecipe());
  PHINode *Phi = WidenIV->getPHINode();
  VPIRValue *Start = WidenIV->getStartValue();
  const InductionDescriptor &IndDesc = WidenIV->getInductionDescriptor();

  // It is always safe to copy over the NoWrap and FastMath flags. In
  // particular, when folding tail by masking, the masked-off lanes are never
  // used, so it is safe.
  VPIRFlags Flags = vputils::getFlagsFromIndDesc(ID: IndDesc);
  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep());
  return new VPWidenIntOrFpInductionRecipe(
      Phi, Start, Step, &Plan.getVF(), IndDesc, I, Flags, VPI->getDebugLoc());
}
7735
/// Try to widen the call \p VPI either as a vector intrinsic or as a call to a
/// vector library variant, for the VFs in \p Range where the cost model chose
/// such a widening. Returns nullptr if the call should be handled otherwise
/// (e.g. scalarized).
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
                                                   VFRange &Range) {
  CallInst *CI = cast<CallInst>(Val: VPI->getUnderlyingInstr());
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  // Calls that must be scalarized with predication are not widened here.
  if (IsPredicated)
    return nullptr;

  // Don't widen these intrinsics here; handleReplication has dedicated logic
  // for e.g. assume and lifetime markers.
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Collect only the call's argument operands (the VPInstruction may carry
  // extra operands beyond the arguments).
  SmallVector<VPValue *, 4> Ops(VPI->op_begin(),
                                VPI->op_begin() + CI->arg_size());

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, *VPI,
                                      VPI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The call needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the call instruction, but the only
      //      available vector variant at this VF requires a mask, so we
      //      synthesize an all-true mask.
      VPValue *Mask = VPI->isMasked() ? VPI->getMask() : Plan.getTrue();

      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    // Forward the trailing non-mask operand as well -- presumably the callee
    // operand of the call VPInstruction; confirm against its layout.
    Ops.push_back(Elt: VPI->getOperand(N: VPI->getNumOperandsWithoutMask() - 1));
    return new VPWidenCallRecipe(CI, Variant, Ops, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  return nullptr;
}
7822
7823bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7824 assert((!isa<UncondBrInst, CondBrInst, PHINode, LoadInst, StoreInst>(I)) &&
7825 "Instruction should have been handled earlier");
7826 // Instruction should be widened, unless it is scalar after vectorization,
7827 // scalarization is profitable or it is predicated.
7828 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7829 return CM.isScalarAfterVectorization(I, VF) ||
7830 CM.isProfitableToScalarize(I, VF) ||
7831 CM.isScalarWithPredication(I, VF);
7832 };
7833 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7834 Range);
7835}
7836
/// Build a generic VPWidenRecipe for \p VPI when its opcode is widenable;
/// returns nullptr for unsupported opcodes. Predicated div/rem first get a
/// safe divisor via a select that substitutes 1 on masked-off lanes.
VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) {
  auto *I = VPI->getUnderlyingInstr();
  switch (VPI->getOpcode()) {
  default:
    return nullptr;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // If not provably safe, use a select to form a safe divisor before
    // widening the div/rem operation itself. Otherwise fall through to the
    // general handling below.
    if (CM.isPredicatedInst(I)) {
      SmallVector<VPValue *> Ops(VPI->operandsWithoutMask());
      VPValue *Mask = VPI->getMask();
      VPValue *One = Plan.getConstantInt(Ty: I->getType(), Val: 1u);
      auto *SafeRHS =
          Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: VPI->getDebugLoc());
      Ops[1] = SafeRHS;
      return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc());
    }
    [[fallthrough]];
  }
  case Instruction::Add:
  case Instruction::And:
  case Instruction::AShr:
  case Instruction::FAdd:
  case Instruction::FCmp:
  case Instruction::FDiv:
  case Instruction::FMul:
  case Instruction::FNeg:
  case Instruction::FRem:
  case Instruction::FSub:
  case Instruction::ICmp:
  case Instruction::LShr:
  case Instruction::Mul:
  case Instruction::Or:
  case Instruction::Select:
  case Instruction::Shl:
  case Instruction::Sub:
  case Instruction::Xor:
  case Instruction::Freeze:
    return new VPWidenRecipe(*I, VPI->operandsWithoutMask(), *VPI, *VPI,
                             VPI->getDebugLoc());
  case Instruction::ExtractValue: {
    SmallVector<VPValue *> NewOps(VPI->operandsWithoutMask());
    auto *EVI = cast<ExtractValueInst>(Val: I);
    assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
    unsigned Idx = EVI->getIndices()[0];
    // Encode the single extractvalue index as an extra i32 constant operand.
    NewOps.push_back(Elt: Plan.getConstantInt(BitWidth: 32, Val: Idx));
    return new VPWidenRecipe(*I, NewOps, *VPI, *VPI, VPI->getDebugLoc());
  }
  };
}
7890
7891VPHistogramRecipe *VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7892 VPInstruction *VPI) {
7893 // FIXME: Support other operations.
7894 unsigned Opcode = HI->Update->getOpcode();
7895 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7896 "Histogram update operation must be an Add or Sub");
7897
7898 SmallVector<VPValue *, 3> HGramOps;
7899 // Bucket address.
7900 HGramOps.push_back(Elt: VPI->getOperand(N: 1));
7901 // Increment value.
7902 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
7903
7904 // In case of predicated execution (due to tail-folding, or conditional
7905 // execution, or both), pass the relevant mask.
7906 if (Legal->isMaskRequired(I: HI->Store))
7907 HGramOps.push_back(Elt: VPI->getMask());
7908
7909 return new VPHistogramRecipe(Opcode, HGramOps, VPI->getDebugLoc());
7910}
7911
/// Build a VPReplicateRecipe that executes \p VPI's underlying instruction per
/// lane (or only for the first lane when uniform), attaching the block mask
/// when the instruction must be predicated.
VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI,
                                                      VFRange &Range) {
  auto *I = VPI->getUnderlyingInstr();
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //  1. For the assume intrinsic generating the instruction for the first
      //     lane is still better than not generating any at all. For
      //     example, the input may be a splat across all lanes.
      //  2. For the lifetime start/end intrinsics the pointer operand only
      //     does anything useful when the input comes from a stack object,
      //     which suggests it should always be uniform. For non-stack objects
      //     the effect is to poison the object, which still allows us to
      //     remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = VPI->getMask();
  }

  // Note that there is some custom logic to mark some intrinsics as uniform
  // manually above for scalable vectors, which this assert needs to account for
  // as well.
  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
         "Should not predicate a uniform recipe");
  auto *Recipe =
      new VPReplicateRecipe(I, VPI->operandsWithoutMask(), IsUniform,
                            BlockInMask, *VPI, *VPI, VPI->getDebugLoc());
  return Recipe;
}
7975
/// Dispatch the non-phi VPInstruction \p R to the matching widening builder:
/// induction-truncate, call, histogram, memory access, GEP, cast, or the
/// generic tryToWiden. Returns nullptr when no widening applies for the
/// (clamped) \p Range.
VPRecipeBase *
VPRecipeBuilder::tryToCreateWidenNonPhiRecipe(VPSingleDefRecipe *R,
                                              VFRange &Range) {
  assert(!R->isPhi() && "phis must be handled earlier");
  // First, check for specific widening recipes that deal with optimizing
  // truncates, calls and memory operations.

  VPRecipeBase *Recipe;
  auto *VPI = cast<VPInstruction>(Val: R);
  if (VPI->getOpcode() == Instruction::Trunc &&
      (Recipe = tryToOptimizeInductionTruncate(VPI, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::Call)
    return tryToWidenCall(VPI, Range);

  // Stores feeding a recognized histogram pattern take precedence over the
  // generic memory widening below.
  Instruction *Instr = R->getUnderlyingInstr();
  if (VPI->getOpcode() == Instruction::Store)
    if (auto HistInfo = Legal->getHistogramInfo(I: cast<StoreInst>(Val: Instr)))
      return tryToWidenHistogram(HI: *HistInfo, VPI);

  if (VPI->getOpcode() == Instruction::Load ||
      VPI->getOpcode() == Instruction::Store)
    return tryToWidenMemory(VPI, Range);

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (VPI->getOpcode() == Instruction::GetElementPtr)
    return new VPWidenGEPRecipe(cast<GetElementPtrInst>(Val: Instr),
                                VPI->operandsWithoutMask(), *VPI,
                                VPI->getDebugLoc());

  if (Instruction::isCast(Opcode: VPI->getOpcode())) {
    auto *CI = cast<CastInst>(Val: Instr);
    auto *CastR = cast<VPInstructionWithType>(Val: VPI);
    return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(N: 0),
                                 CastR->getResultType(), CI, *VPI, *VPI,
                                 VPI->getDebugLoc());
  }

  return tryToWiden(VPI);
}
8024
/// Build and optimize VPlans covering the VF range [MinVF, MaxVF], appending
/// the candidates to VPlans. Each plan covers a sub-range of VFs.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

  // Create initial base VPlan0, to serve as common starting point for all
  // candidates built later for specific VF ranges.
  auto VPlan0 = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE, LVer: &LVer);

  // Shared preprocessing applied once to VPlan0, before per-range duplication.
  VPlanTransforms::simplifyRecipes(Plan&: *VPlan0);
  VPlanTransforms::handleEarlyExits(Plan&: *VPlan0, HasUncountableExit: Legal->hasUncountableEarlyExit());
  VPlanTransforms::addMiddleCheck(Plan&: *VPlan0, TailFolded: CM.foldTailByMasking());
  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::createLoopRegions, *VPlan0);

  // Create recipes for header phis.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *VPlan0, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: Legal->getReductionVars(), FixedOrderRecurrences: Legal->getFixedOrderRecurrences(),
      InLoopReductions: CM.getInLoopReductions(), AllowReordering: Hints.allowReordering());

  if (CM.foldTailByMasking())
    RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *VPlan0);
  RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize,
                           *VPlan0);

  // Build one plan per VF sub-range; tryToBuildVPlanWithVPRecipes may clamp
  // SubRange.End, and the next iteration resumes from the clamped end.
  auto MaxVFTimes2 = MaxVF * 2;
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
      // Now optimize the initial VPlan.
      VPlanTransforms::hoistPredicatedLoads(Plan&: *Plan, PSE, L: OrigLoop);
      VPlanTransforms::sinkPredicatedStores(Plan&: *Plan, PSE, L: OrigLoop);
      RUN_VPLAN_PASS(VPlanTransforms::truncateToMinimalBitwidths, *Plan,
                     CM.getMinimalBitwidths());
      RUN_VPLAN_PASS(VPlanTransforms::optimize, *Plan);
      // TODO: try to put addExplicitVectorLength close to addActiveLaneMask
      if (CM.foldTailWithEVL()) {
        RUN_VPLAN_PASS(VPlanTransforms::addExplicitVectorLength, *Plan,
                       CM.getMaxSafeElements());
        RUN_VPLAN_PASS(VPlanTransforms::optimizeEVLMasks, *Plan);
      }

      // narrowInterleaveGroups may produce an additional candidate plan; keep
      // both it and the base plan.
      if (auto P = VPlanTransforms::narrowInterleaveGroups(Plan&: *Plan, TTI))
        VPlans.push_back(Elt: std::move(P));

      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(Elt: std::move(Plan));
    }
    VF = SubRange.End;
  }
}
8091
8092VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8093 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8094
8095 using namespace llvm::VPlanPatternMatch;
8096 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8097
8098 // ---------------------------------------------------------------------------
8099 // Build initial VPlan: Scan the body of the loop in a topological order to
8100 // visit each basic block after having visited its predecessor basic blocks.
8101 // ---------------------------------------------------------------------------
8102
8103 bool RequiresScalarEpilogueCheck =
8104 LoopVectorizationPlanner::getDecisionAndClampRange(
8105 Predicate: [this](ElementCount VF) {
8106 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
8107 },
8108 Range);
8109 // Update the branch in the middle block if a scalar epilogue is required.
8110 VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
8111 if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
8112 auto *BranchOnCond = cast<VPInstruction>(Val: MiddleVPBB->getTerminator());
8113 assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
8114 "second successor must be scalar preheader");
8115 BranchOnCond->setOperand(I: 0, New: Plan->getFalse());
8116 }
8117
8118 // Don't use getDecisionAndClampRange here, because we don't know the UF
8119 // so this function is better to be conservative, rather than to split
8120 // it up into different VPlans.
8121 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8122 bool IVUpdateMayOverflow = false;
8123 for (ElementCount VF : Range)
8124 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8125
8126 TailFoldingStyle Style = CM.getTailFoldingStyle();
8127 // Use NUW for the induction increment if we proved that it won't overflow in
8128 // the vector loop or when not folding the tail. In the later case, we know
8129 // that the canonical induction increment will not overflow as the vector trip
8130 // count is >= increment and a multiple of the increment.
8131 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8132 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8133 if (!HasNUW) {
8134 auto *IVInc =
8135 LoopRegion->getExitingBasicBlock()->getTerminator()->getOperand(N: 0);
8136 assert(match(IVInc,
8137 m_VPInstruction<Instruction::Add>(
8138 m_Specific(LoopRegion->getCanonicalIV()), m_VPValue())) &&
8139 "Did not find the canonical IV increment");
8140 cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
8141 }
8142
8143 // ---------------------------------------------------------------------------
8144 // Pre-construction: record ingredients whose recipes we'll need to further
8145 // process after constructing the initial VPlan.
8146 // ---------------------------------------------------------------------------
8147
8148 // For each interleave group which is relevant for this (possibly trimmed)
8149 // Range, add it to the set of groups to be later applied to the VPlan and add
8150 // placeholders for its members' Recipes which we'll be replacing with a
8151 // single VPInterleaveRecipe.
8152 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8153 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8154 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8155 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8156 LoopVectorizationCostModel::CM_Interleave);
8157 // For scalable vectors, the interleave factors must be <= 8 since we
8158 // require the (de)interleaveN intrinsics instead of shufflevectors.
8159 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8160 "Unsupported interleave factor for scalable vectors");
8161 return Result;
8162 };
8163 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
8164 continue;
8165 InterleaveGroups.insert(Ptr: IG);
8166 }
8167
8168 // ---------------------------------------------------------------------------
8169 // Construct wide recipes and apply predication for original scalar
8170 // VPInstructions in the loop.
8171 // ---------------------------------------------------------------------------
8172 VPRecipeBuilder RecipeBuilder(*Plan, TLI, Legal, CM, Builder);
8173
8174 // Scan the body of the loop in a topological order to visit each basic block
8175 // after having visited its predecessor basic blocks.
8176 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8177 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8178 HeaderVPBB);
8179
8180 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8181
8182 // Collect blocks that need predication for in-loop reduction recipes.
8183 DenseSet<BasicBlock *> BlocksNeedingPredication;
8184 for (BasicBlock *BB : OrigLoop->blocks())
8185 if (CM.blockNeedsPredicationForAnyReason(BB))
8186 BlocksNeedingPredication.insert(V: BB);
8187
8188 VPlanTransforms::createInLoopReductionRecipes(Plan&: *Plan, BlocksNeedingPredication,
8189 MinVF: Range.Start);
8190
8191 // Now process all other blocks and instructions.
8192 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
8193 // Convert input VPInstructions to widened recipes.
8194 for (VPRecipeBase &R : make_early_inc_range(
8195 Range: make_range(x: VPBB->getFirstNonPhi(), y: VPBB->end()))) {
8196 // Skip recipes that do not need transforming.
8197 if (isa<VPWidenCanonicalIVRecipe, VPBlendRecipe, VPReductionRecipe>(Val: &R))
8198 continue;
8199 auto *VPI = cast<VPInstruction>(Val: &R);
8200 if (!VPI->getUnderlyingValue())
8201 continue;
8202
8203 // TODO: Gradually replace uses of underlying instruction by analyses on
8204 // VPlan. Migrate code relying on the underlying instruction from VPlan0
8205 // to construct recipes below to not use the underlying instruction.
8206 Instruction *Instr = cast<Instruction>(Val: VPI->getUnderlyingValue());
8207 Builder.setInsertPoint(VPI);
8208
8209 // The stores with invariant address inside the loop will be deleted, and
8210 // in the exit block, a uniform store recipe will be created for the final
8211 // invariant store of the reduction.
8212 StoreInst *SI;
8213 if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
8214 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
8215 // Only create recipe for the final invariant store of the reduction.
8216 if (Legal->isInvariantStoreOfReduction(SI)) {
8217 auto *Recipe = new VPReplicateRecipe(
8218 SI, VPI->operandsWithoutMask(), true /* IsUniform */,
8219 nullptr /*Mask*/, *VPI, *VPI, VPI->getDebugLoc());
8220 Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
8221 }
8222 R.eraseFromParent();
8223 continue;
8224 }
8225
8226 VPRecipeBase *Recipe =
8227 RecipeBuilder.tryToCreateWidenNonPhiRecipe(R: VPI, Range);
8228 if (!Recipe)
8229 Recipe =
8230 RecipeBuilder.handleReplication(VPI: cast<VPInstruction>(Val: VPI), Range);
8231
8232 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8233 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
8234 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8235 // moved to the phi section in the header.
8236 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8237 } else {
8238 Builder.insert(R: Recipe);
8239 }
8240 if (Recipe->getNumDefinedValues() == 1) {
8241 VPI->replaceAllUsesWith(New: Recipe->getVPSingleValue());
8242 } else {
8243 assert(Recipe->getNumDefinedValues() == 0 &&
8244 "Unexpected multidef recipe");
8245 }
8246 R.eraseFromParent();
8247 }
8248 }
8249
8250 assert(isa<VPRegionBlock>(LoopRegion) &&
8251 !LoopRegion->getEntryBasicBlock()->empty() &&
8252 "entry block must be set to a VPRegionBlock having a non-empty entry "
8253 "VPBasicBlock");
8254
8255 // TODO: We can't call runPass on these transforms yet, due to verifier
8256 // failures.
8257 VPlanTransforms::addExitUsersForFirstOrderRecurrences(Plan&: *Plan, Range);
8258
8259 // ---------------------------------------------------------------------------
8260 // Transform initial VPlan: Apply previously taken decisions, in order, to
8261 // bring the VPlan to its final state.
8262 // ---------------------------------------------------------------------------
8263
8264 addReductionResultComputation(Plan, RecipeBuilder, MinVF: Range.Start);
8265
8266 // Optimize FindIV reductions to use sentinel-based approach when possible.
8267 RUN_VPLAN_PASS(VPlanTransforms::optimizeFindIVReductions, *Plan, PSE,
8268 *OrigLoop);
8269 VPlanTransforms::optimizeInductionLiveOutUsers(Plan&: *Plan, PSE,
8270 FoldTail: CM.foldTailByMasking());
8271
8272 // Apply mandatory transformation to handle reductions with multiple in-loop
8273 // uses if possible, bail out otherwise.
8274 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMultiUseReductions, *Plan, ORE,
8275 OrigLoop))
8276 return nullptr;
8277 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8278 // NaNs if possible, bail out otherwise.
8279 if (!RUN_VPLAN_PASS(VPlanTransforms::handleMaxMinNumReductions, *Plan))
8280 return nullptr;
8281
8282 // Create whole-vector selects for find-last recurrences.
8283 if (!RUN_VPLAN_PASS(VPlanTransforms::handleFindLastReductions, *Plan))
8284 return nullptr;
8285
8286 // Create partial reduction recipes for scaled reductions and transform
8287 // recipes to abstract recipes if it is legal and beneficial and clamp the
8288 // range for better cost estimation.
8289 // TODO: Enable following transform when the EVL-version of extended-reduction
8290 // and mulacc-reduction are implemented.
8291 if (!CM.foldTailWithEVL()) {
8292 VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind, CM.PSE,
8293 OrigLoop);
8294 RUN_VPLAN_PASS(VPlanTransforms::createPartialReductions, *Plan, CostCtx,
8295 Range);
8296 RUN_VPLAN_PASS(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx,
8297 Range);
8298 }
8299
8300 for (ElementCount VF : Range)
8301 Plan->addVF(VF);
8302 Plan->setName("Initial VPlan");
8303
8304 // Interleave memory: for each Interleave Group we marked earlier as relevant
8305 // for this VPlan, replace the Recipes widening its memory instructions with a
8306 // single VPInterleaveRecipe at its insertion point.
8307 RUN_VPLAN_PASS(VPlanTransforms::createInterleaveGroups, *Plan,
8308 InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());
8309
8310 // Replace VPValues for known constant strides.
8311 RUN_VPLAN_PASS(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
8312 Legal->getLAI()->getSymbolicStrides());
8313
8314 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8315 return Legal->blockNeedsPredication(BB);
8316 };
8317 RUN_VPLAN_PASS(VPlanTransforms::dropPoisonGeneratingRecipes, *Plan,
8318 BlockNeedsPredication);
8319
8320 // Sink users of fixed-order recurrence past the recipe defining the previous
8321 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8322 if (!RUN_VPLAN_PASS(VPlanTransforms::adjustFixedOrderRecurrences, *Plan,
8323 Builder))
8324 return nullptr;
8325
8326 if (useActiveLaneMask(Style)) {
8327 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8328 // TailFoldingStyle is visible there.
8329 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8330 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow);
8331 }
8332
8333 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8334 return Plan;
8335}
8336
/// Build a VPlan for \p Range using the VPlan-native path for outer loops.
/// Returns nullptr if the input VPInstructions cannot all be converted to
/// widened recipes.
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // Build the initial, unoptimized VPlan skeleton directly from the outer
  // loop's IR.
  auto Plan = VPlanTransforms::buildVPlan0(
      TheLoop: OrigLoop, LI&: *LI, InductionTy: Legal->getWidestInductionType(),
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), PSE);

  // The native path is built here without uncountable early exits and without
  // tail folding.
  VPlanTransforms::handleEarlyExits(Plan&: *Plan,
                                    /*HasUncountableExit*/ false);
  VPlanTransforms::addMiddleCheck(Plan&: *Plan, /*TailFolded*/ false);

  VPlanTransforms::createLoopRegions(Plan&: *Plan);

  // Only induction phis are modeled on this path: reductions, fixed-order
  // recurrences and in-loop reductions are passed as empty sets, so no
  // reduction recipes are created here.
  VPlanTransforms::createHeaderPhiRecipes(
      Plan&: *Plan, PSE, OrigLoop&: *OrigLoop, Inductions: Legal->getInductionVars(),
      Reductions: MapVector<PHINode *, RecurrenceDescriptor>(),
      FixedOrderRecurrences: SmallPtrSet<const PHINode *, 1>(), InLoopReductions: SmallPtrSet<PHINode *, 1>(),
      /*AllowReordering=*/false);

  // Record all candidate VFs of the clamped range in the plan.
  for (ElementCount VF : Range)
    Plan->addVF(VF);

  // Bail out (returning no plan) if the widening conversion fails.
  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(Plan&: *Plan, TLI: *TLI))
    return nullptr;

  // Optimize induction live-out users to use precomputed end values.
  VPlanTransforms::optimizeInductionLiveOutUsers(Plan&: *Plan, PSE,
                                                 /*FoldTail=*/false);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
8374
/// Create the final reduction-result computations in the middle block for all
/// reduction header phis, connect them to users outside the vector region,
/// and apply the related adjustments: tail-folding selects, AnyOf boolean
/// phi rewrite, and truncation of reductions computable in a narrower type.
void LoopVectorizationPlanner::addReductionResultComputation(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
  using namespace VPlanPatternMatch;
  VPTypeAnalysis TypeInfo(*Plan);
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
  SmallVector<VPRecipeBase *> ToDelete;
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  // Default insert point: just before the latch terminator, where the
  // tail-folding selects below are created.
  Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    // TODO: Remove check for constant incoming value once removeDeadRecipes is
    // used on VPlan0.
    if (!PhiR || isa<VPIRValue>(Val: PhiR->getOperand(N: 1)))
      continue;

    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
    Type *PhiTy = TypeInfo.inferScalarType(V: PhiR);
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
    auto *RR = dyn_cast<VPReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe());
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        (!RR || !RR->isPartialReduction())) {
      VPValue *Cond = vputils::findHeaderMask(Plan&: *Plan);
      NewExitingVPV =
          Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", Flags: *PhiR);
      // Only the final result computations should see the masked exiting
      // value; other users keep the unmasked one.
      OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
        using namespace VPlanPatternMatch;
        return match(
            U: &U, P: m_CombineOr(
                     L: m_VPInstruction<VPInstruction::ComputeAnyOfResult>(),
                     R: m_VPInstruction<VPInstruction::ComputeReductionResult>()));
      });

      if (CM.usePredicatedReductionSelect(RecurrenceKind))
        PhiR->setOperand(I: 1, New: NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
    // For AnyOf reductions, find the select among PhiR's users. This is used
    // both to find NewVal for ComputeAnyOfResult and to adjust the reduction.
    VPRecipeBase *AnyOfSelect = nullptr;
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      AnyOfSelect = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
        return match(U, P: m_Select(Op0: m_VPValue(), Op1: m_VPValue(), Op2: m_VPValue()));
      }));
    }
    if (AnyOfSelect) {
      VPValue *Start = PhiR->getStartValue();
      // NewVal is the non-phi operand of the select.
      VPValue *NewVal = AnyOfSelect->getOperand(N: 1) == PhiR
                            ? AnyOfSelect->getOperand(N: 2)
                            : AnyOfSelect->getOperand(N: 1);
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                               Operands: {Start, NewVal, NewExitingVPV}, DL: ExitDL);
    } else {
      // All other recurrence kinds use ComputeReductionResult, carrying the
      // kind/ordered/in-loop/FMF information as flags.
      VPIRFlags Flags(RecurrenceKind, PhiR->isOrdered(), PhiR->isInLoop(),
                      PhiR->getFastMathFlags());
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                               Operands: {NewExitingVPV}, Flags, DL: ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      VPWidenCastRecipe *Trunc;
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      VPWidenCastRecipe *Extnd;
      {
        // Emit the trunc/extend pair directly after the recipe defining the
        // exiting value, restoring the outer insert point afterwards.
        VPBuilder::InsertPointGuard Guard(Builder);
        Builder.setInsertPoint(
            TheBB: NewExitingVPV->getDefiningRecipe()->getParent(),
            IP: std::next(x: NewExitingVPV->getDefiningRecipe()->getIterator()));
        Trunc =
            Builder.createWidenCast(Opcode: Instruction::Trunc, Op: NewExitingVPV, ResultTy: RdxTy);
        Extnd = Builder.createWidenCast(Opcode: ExtendOpc, Op: Trunc, ResultTy: PhiTy);
      }
      if (PhiR->getOperand(N: 1) == NewExitingVPV)
        PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result. Operand 0 provides the values to be reduced.
      FinalReductionResult->setOperand(I: 0, New: Trunc);
      FinalReductionResult =
          Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
    }

    // Update all users outside the vector region. Also replace redundant
    // extracts.
    for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
      // Skip the result computation itself and any user still inside a
      // region (i.e. inside the vector loop).
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      // Skip FindIV reduction chain recipes (ComputeReductionResult, icmp).
      if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind) &&
          match(U, P: m_CombineOr(
                       L: m_VPInstruction<VPInstruction::ComputeReductionResult>(),
                       R: m_VPInstruction<Instruction::ICmp>())))
        continue;
      U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);

      // Look through ExtractLastPart.
      if (match(U, P: m_ExtractLastPart(Op0: m_VPValue())))
        U = cast<VPInstruction>(Val: U)->getSingleUser();

      // Extract recipes applied to the result are redundant after the
      // rewrite; forward the final result to their users directly.
      if (match(U, P: m_CombineOr(L: m_ExtractLane(Op0: m_VPValue(), Op1: m_VPValue()),
                               R: m_ExtractLastLane(Op0: m_VPValue()))))
        cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
    if (AnyOfSelect) {
      VPValue *Cmp = AnyOfSelect->getOperand(N: 0);
      // If the compare is checking the reduction PHI node, adjust it to check
      // the start value.
      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
        CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
      Builder.setInsertPoint(AnyOfSelect);

      // If the true value of the select is the reduction phi, the new value is
      // selected if the negated condition is true in any iteration.
      if (AnyOfSelect->getOperand(N: 1) == PhiR)
        Cmp = Builder.createNot(Operand: Cmp);
      VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
      AnyOfSelect->getVPSingleValue()->replaceAllUsesWith(New: Or);
      // Delete AnyOfSelect now that it has invalid types.
      ToDelete.push_back(Elt: AnyOfSelect);

      // Convert the reduction phi to operate on bools.
      PhiR->setOperand(I: 0, New: Plan->getFalse());
      continue;
    }

    RecurKind RK = PhiR->getRecurrenceKind();
    // For the remaining (plain) reduction kinds, seed the phi with a
    // ReductionStartVector built in the preheader from the start value, the
    // recurrence identity, and a scale factor of 1.
    if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindLastRecurrenceKind(Kind: RK))) {
      VPBuilder PHBuilder(Plan->getVectorPreheader());
      VPValue *Iden = Plan->getOrAddLiveIn(
          V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: PhiR->getFastMathFlags()));
      auto *ScaleFactorVPV = Plan->getConstantInt(BitWidth: 32, Val: 1);
      VPValue *StartV = PHBuilder.createNaryOp(
          Opcode: VPInstruction::ReductionStartVector,
          Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV}, Flags: *PhiR);
      PhiR->setOperand(I: 0, New: StartV);
    }
  }
  // Deferred deletion: recipes could not be erased while still iterating
  // over the phis / users above.
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();

  RUN_VPLAN_PASS(VPlanTransforms::clearReductionWrapFlags, *Plan);
}
8564
8565void LoopVectorizationPlanner::attachRuntimeChecks(
8566 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
8567 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
8568 if (SCEVCheckBlock && SCEVCheckBlock->hasNPredecessors(N: 0)) {
8569 assert((!CM.OptForSize ||
8570 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
8571 "Cannot SCEV check stride or overflow when optimizing for size");
8572 VPlanTransforms::attachCheckBlock(Plan, Cond: SCEVCheckCond, CheckBlock: SCEVCheckBlock,
8573 AddBranchWeights: HasBranchWeights);
8574 }
8575 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
8576 if (MemCheckBlock && MemCheckBlock->hasNPredecessors(N: 0)) {
8577 // VPlan-native path does not do any analysis for runtime checks
8578 // currently.
8579 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
8580 "Runtime checks are not supported for outer loops yet");
8581
8582 if (CM.OptForSize) {
8583 assert(
8584 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
8585 "Cannot emit memory checks when optimizing for size, unless forced "
8586 "to vectorize.");
8587 ORE->emit(RemarkBuilder: [&]() {
8588 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
8589 OrigLoop->getStartLoc(),
8590 OrigLoop->getHeader())
8591 << "Code-size may be reduced by not forcing "
8592 "vectorization, or by source-code modifications "
8593 "eliminating the need for runtime checks "
8594 "(e.g., adding 'restrict').";
8595 });
8596 }
8597 VPlanTransforms::attachCheckBlock(Plan, Cond: MemCheckCond, CheckBlock: MemCheckBlock,
8598 AddBranchWeights: HasBranchWeights);
8599 }
8600}
8601
8602void LoopVectorizationPlanner::addMinimumIterationCheck(
8603 VPlan &Plan, ElementCount VF, unsigned UF,
8604 ElementCount MinProfitableTripCount) const {
8605 const uint32_t *BranchWeights =
8606 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())
8607 ? &MinItersBypassWeights[0]
8608 : nullptr;
8609 VPlanTransforms::addMinimumIterationCheck(
8610 Plan, VF, UF, MinProfitableTripCount,
8611 RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: VF.isVector()), TailFolded: CM.foldTailByMasking(),
8612 OrigLoop, MinItersBypassWeights: BranchWeights,
8613 DL: OrigLoop->getLoopPredecessor()->getTerminator()->getDebugLoc(), PSE);
8614}
8615
8616// Determine how to lower the scalar epilogue, which depends on 1) optimising
8617// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
8618// predication, and 4) a TTI hook that analyses whether the loop is suitable
8619// for predication.
8620static ScalarEpilogueLowering getScalarEpilogueLowering(
8621 Function *F, Loop *L, LoopVectorizeHints &Hints, bool OptForSize,
8622 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
8623 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
8624 // 1) OptSize takes precedence over all other options, i.e. if this is set,
8625 // don't look at hints or options, and don't request a scalar epilogue.
8626 if (F->hasOptSize() ||
8627 (OptForSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled))
8628 return CM_ScalarEpilogueNotAllowedOptSize;
8629
8630 // 2) If set, obey the directives
8631 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
8632 switch (PreferPredicateOverEpilogue) {
8633 case PreferPredicateTy::ScalarEpilogue:
8634 return CM_ScalarEpilogueAllowed;
8635 case PreferPredicateTy::PredicateElseScalarEpilogue:
8636 return CM_ScalarEpilogueNotNeededUsePredicate;
8637 case PreferPredicateTy::PredicateOrDontVectorize:
8638 return CM_ScalarEpilogueNotAllowedUsePredicate;
8639 };
8640 }
8641
8642 // 3) If set, obey the hints
8643 switch (Hints.getPredicate()) {
8644 case LoopVectorizeHints::FK_Enabled:
8645 return CM_ScalarEpilogueNotNeededUsePredicate;
8646 case LoopVectorizeHints::FK_Disabled:
8647 return CM_ScalarEpilogueAllowed;
8648 };
8649
8650 // 4) if the TTI hook indicates this is profitable, request predication.
8651 TailFoldingInfo TFI(TLI, &LVL, IAI);
8652 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
8653 return CM_ScalarEpilogueNotNeededUsePredicate;
8654
8655 return CM_ScalarEpilogueAllowed;
8656}
8657
8658// Process the loop in the VPlan-native vectorization path. This path builds
8659// VPlan upfront in the vectorization pipeline, which allows to apply
8660// VPlan-to-VPlan transformations from the very beginning without modifying the
8661// input LLVM IR.
/// Drive vectorization of the outer loop \p L on the VPlan-native path.
/// Returns true if vector code was generated, false if the loop was left
/// unmodified (unknown trip count, stress-testing mode, or no profitable VF).
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE,
    std::function<BlockFrequencyInfo &()> GetBFI, bool OptForSize,
    LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements) {

  // The native path requires a computable trip count for the outer loop.
  if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL =
      getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL&: *LVL, IAI: &IAI);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE,
                                GetBFI, F, &Hints, IAI, OptForSize);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
                               ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);

  // Scope block: runtime-check state and the vectorizer are destroyed before
  // the remark is emitted and the function is verified. Note that the native
  // path always uses an unroll factor of 1.
  {
    GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
    InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, /*UF=*/1, &CM,
                           Checks, BestPlan);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, /*UF=*/1,
                                 MinProfitableTripCount: VF.MinProfitableTripCount);

    LVP.executePlan(BestVF: VF.Width, /*UF=*/BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
  }

  reportVectorization(ORE, TheLoop: L, VF, IC: 1);

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
8722
8723// Emit a remark if there are stores to floats that required a floating point
8724// extension. If the vectorized loop was generated with floating point there
8725// will be a performance penalty from the conversion overhead and the change in
8726// the vector width.
8727static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
8728 SmallVector<Instruction *, 4> Worklist;
8729 for (BasicBlock *BB : L->getBlocks()) {
8730 for (Instruction &Inst : *BB) {
8731 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
8732 if (S->getValueOperand()->getType()->isFloatTy())
8733 Worklist.push_back(Elt: S);
8734 }
8735 }
8736 }
8737
8738 // Traverse the floating point stores upwards searching, for floating point
8739 // conversions.
8740 SmallPtrSet<const Instruction *, 4> Visited;
8741 SmallPtrSet<const Instruction *, 4> EmittedRemark;
8742 while (!Worklist.empty()) {
8743 auto *I = Worklist.pop_back_val();
8744 if (!L->contains(Inst: I))
8745 continue;
8746 if (!Visited.insert(Ptr: I).second)
8747 continue;
8748
8749 // Emit a remark if the floating point store required a floating
8750 // point conversion.
8751 // TODO: More work could be done to identify the root cause such as a
8752 // constant or a function return type and point the user to it.
8753 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
8754 ORE->emit(RemarkBuilder: [&]() {
8755 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
8756 I->getDebugLoc(), L->getHeader())
8757 << "floating point conversion changes vector width. "
8758 << "Mixed floating point precision requires an up/down "
8759 << "cast that will negatively impact performance.";
8760 });
8761
8762 for (Use &Op : I->operands())
8763 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
8764 Worklist.push_back(Elt: OpI);
8765 }
8766}
8767
8768/// For loops with uncountable early exits, find the cost of doing work when
8769/// exiting the loop early, such as calculating the final exit values of
8770/// variables used outside the loop.
8771/// TODO: This is currently overly pessimistic because the loop may not take
8772/// the early exit, but better to keep this conservative for now. In future,
8773/// it might be possible to relax this by using branch probabilities.
8774static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
8775 VPlan &Plan, ElementCount VF) {
8776 InstructionCost Cost = 0;
8777 for (auto *ExitVPBB : Plan.getExitBlocks()) {
8778 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
8779 // If the predecessor is not the middle.block, then it must be the
8780 // vector.early.exit block, which may contain work to calculate the exit
8781 // values of variables used outside the loop.
8782 if (PredVPBB != Plan.getMiddleBlock()) {
8783 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
8784 << PredVPBB->getName() << ":\n");
8785 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
8786 }
8787 }
8788 }
8789 return Cost;
8790}
8791
/// This function determines whether or not it's still profitable to vectorize
/// the loop given the extra work we have to do outside of the loop:
///  1. Perform the runtime checks before entering the loop to ensure it's safe
///     to vectorize.
///  2. In the case of loops with uncountable early exits, we may have to do
///     extra work when exiting the loop early, such as calculating the final
///     exit values of variables used outside the loop.
///  3. The middle block.
/// As a side effect, computes and stores \p VF.MinProfitableTripCount, the
/// minimum trip count for which the vector loop beats the scalar loop.
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
                                        VectorizationFactor &VF, Loop *L,
                                        PredicatedScalarEvolution &PSE,
                                        VPCostContext &CostCtx, VPlan &Plan,
                                        ScalarEpilogueLowering SEL,
                                        std::optional<unsigned> VScale) {
  // If the cost of the runtime checks is unknown we cannot reason about
  // profitability; conservatively bail out.
  InstructionCost RtC = Checks.getCost();
  if (!RtC.isValid())
    return false;

  // When interleaving only scalar and vector cost will be equal, which in turn
  // would lead to a divide by 0. Fall back to hard threshold.
  if (VF.Width.isScalar()) {
    // TODO: Should we rename VectorizeMemoryCheckThreshold?
    if (RtC > VectorizeMemoryCheckThreshold) {
      LLVM_DEBUG(
          dbgs()
          << "LV: Interleaving only is not profitable due to runtime checks\n");
      return false;
    }
    return true;
  }

  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
  uint64_t ScalarC = VF.ScalarCost.getValue();
  if (ScalarC == 0)
    return true;

  InstructionCost TotalCost = RtC;
  // Add on the cost of any work required in the vector early exit block, if
  // one exists.
  TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
  // The middle block runs once whenever the vector loop is entered.
  TotalCost += Plan.getMiddleBlock()->cost(VF: VF.Width, Ctx&: CostCtx);

  // First, compute the minimum iteration count required so that the vector
  // loop outperforms the scalar loop.
  //  The total cost of the scalar loop is
  //   ScalarC * TC
  //  where
  //  * TC is the actual trip count of the loop.
  //  * ScalarC is the cost of a single scalar iteration.
  //
  //  The total cost of the vector loop is
  //    TotalCost + VecC * (TC / VF) + EpiC
  //  where
  //  * TotalCost is the sum of the costs of
  //    - the generated runtime checks, i.e. RtC
  //    - performing any additional work in the vector.early.exit block for
  //      loops with uncountable early exits.
  //    - the middle block.
  //  * VecC is the cost of a single vector iteration.
  //  * TC is the actual trip count of the loop
  //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
  //
  // Vectorization is profitable once the total vector cost is less than the
  // total scalar cost:
  //   TotalCost + VecC * (TC / VF) + EpiC < ScalarC * TC
  //
  // Now we can compute the minimum required trip count TC as
  //   VF * (TotalCost + EpiC) / (ScalarC * VF - VecC) < TC
  //
  // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
  // the computations are performed on doubles, not integers and the result
  // is rounded up, hence we get an upper estimate of the TC.
  unsigned IntVF = estimateElementCount(VF: VF.Width, VScale);
  uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
  uint64_t MinTC1 =
      Div == 0 ? 0 : divideCeil(Numerator: TotalCost.getValue() * IntVF, Denominator: Div);

  // Second, compute a minimum iteration count so that the cost of the
  // runtime checks is only a fraction of the total scalar loop cost. This
  // adds a loop-dependent bound on the overhead incurred if the runtime
  // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
  // * TC. To bound the runtime check to be a fraction 1/X of the scalar
  // cost, compute
  //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
  // X is hard-coded to 10 below.
  uint64_t MinTC2 = divideCeil(Numerator: RtC.getValue() * 10, Denominator: ScalarC);

  // Now pick the larger minimum. If it is not a multiple of VF and a scalar
  // epilogue is allowed, choose the next closest multiple of VF. This should
  // partly compensate for ignoring the epilogue cost.
  uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
  if (SEL == CM_ScalarEpilogueAllowed)
    MinTC = alignTo(Value: MinTC, Align: IntVF);
  VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);

  LLVM_DEBUG(
      dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
             << VF.MinProfitableTripCount << "\n");

  // Skip vectorization if the expected trip count is less than the minimum
  // required trip count.
  if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
    if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
      LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                           "trip count < minimum profitable VF ("
                        << *ExpectedTC << " < " << VF.MinProfitableTripCount
                        << ")\n");

      return false;
    }
  }
  return true;
}
8907
// Combine the explicit pass options with the global enable flags: if
// EnableLoopInterleaving/EnableLoopVectorization is off, interleaving resp.
// vectorization is performed only when explicitly forced.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
8913
/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
/// vectorization. Returns the ResumeForEpilogue instructions created for the
/// scalar-header phis of \p MainPlan; they are used as resume values when
/// vectorizing the epilogue.
static SmallVector<VPInstruction *>
preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
  using namespace VPlanPatternMatch;
  // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
  // introduce multiple uses of undef/poison. If the reduction start value may
  // be undef or poison it needs to be frozen and the frozen start has to be
  // used when computing the reduction result. We also need to use the frozen
  // value in the resume phi generated by the main vector loop, as this is also
  // used to compute the reduction result after the epilogue vector loop.
  auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
                                             bool UpdateResumePhis) {
    VPBuilder Builder(Plan.getEntry());
    // Scan the middle block for FindIV result computations.
    for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
      auto *VPI = dyn_cast<VPInstruction>(Val: &R);
      if (!VPI)
        continue;
      VPValue *OrigStart;
      if (!matchFindIVResult(VPI, ReducedIV: m_VPValue(), Start: m_VPValue(V&: OrigStart)))
        continue;
      // No freeze needed if the start value cannot be undef or poison.
      if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
        continue;
      VPInstruction *Freeze =
          Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, DL: {}, Name: "fr");
      // Operand 2 of the FindIV result is the start value; replace with the
      // frozen start.
      VPI->setOperand(I: 2, New: Freeze);
      if (UpdateResumePhis)
        // Resume phis must also see the frozen value; skip the freeze's own
        // use of the original start so we don't create a cycle.
        OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
          return Freeze != &U && isa<VPPhi>(Val: &U);
        });
    }
  };
  AddFreezeForFindLastIVReductions(MainPlan, true);
  AddFreezeForFindLastIVReductions(EpiPlan, false);

  // Extract the vector trip count from the branch-on-count terminating the
  // main vector loop region.
  VPValue *VectorTC = nullptr;
  auto *Term =
      MainPlan.getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
  [[maybe_unused]] bool MatchedTC =
      match(V: Term, P: m_BranchOnCount(Op0: m_VPValue(), Op1: m_VPValue(V&: VectorTC)));
  assert(MatchedTC && "must match vector trip count");

  // If there is a suitable resume value for the canonical induction in the
  // scalar (which will become vector) epilogue loop, use it and move it to the
  // beginning of the scalar preheader. Otherwise create it below.
  VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
  auto ResumePhiIter =
      find_if(Range: MainScalarPH->phis(), P: [VectorTC](VPRecipeBase &R) {
        // Look for phi(VectorTC, 0), the canonical IV resume value.
        return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Ops: m_Specific(VPV: VectorTC),
                                                            Ops: m_ZeroInt()));
      });
  VPPhi *ResumePhi = nullptr;
  if (ResumePhiIter == MainScalarPH->phis().end()) {
    using namespace llvm::VPlanPatternMatch;
    assert(
        match(MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue(),
              m_ZeroInt()) &&
        "canonical IV must start at 0");
    Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(V: VectorTC);
    VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
    ResumePhi = ScalarPHBuilder.createScalarPhi(
        IncomingValues: {VectorTC, MainPlan.getZero(Ty)}, DL: {}, Name: "vec.epilog.resume.val");
  } else {
    // Reuse the existing resume phi, making sure it is the first phi in the
    // scalar preheader.
    ResumePhi = cast<VPPhi>(Val: &*ResumePhiIter);
    ResumePhi->setName("vec.epilog.resume.val");
    if (&MainScalarPH->front() != ResumePhi)
      ResumePhi->moveBefore(BB&: *MainScalarPH, I: MainScalarPH->begin());
  }

  // Create a ResumeForEpilogue for the canonical IV resume as the
  // first non-phi, to keep it alive for the epilogue.
  VPBuilder ResumeBuilder(MainScalarPH);
  ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue, Operands: ResumePhi);

  // Create ResumeForEpilogue instructions for the resume phis of the
  // VPIRPhis in the scalar header of the main plan and return them so they can
  // be used as resume values when vectorizing the epilogue.
  return to_vector(
      Range: map_range(C: MainPlan.getScalarHeader()->phis(), F: [&](VPRecipeBase &R) {
        assert(isa<VPIRPhi>(R) &&
               "only VPIRPhis expected in the scalar header");
        return ResumeBuilder.createNaryOp(Opcode: VPInstruction::ResumeForEpilogue,
                                          Operands: R.getOperand(N: 0));
      }));
}
8999
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes. Some
/// reductions require creating new instructions to compute the resume values.
/// They are collected in a vector and returned. They must be moved to the
/// preheader of the vector epilogue loop, after created by the execution of \p
/// Plan.
static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
    VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs,
    EpilogueLoopVectorizationInfo &EPI, LoopVectorizationCostModel &CM,
    ScalarEvolution &SE) {
  VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
  Header->setName("vec.epilog.vector.body");

  VPCanonicalIVPHIRecipe *IV = VectorLoop->getCanonicalIV();
  // When vectorizing the epilogue loop, the canonical induction needs to be
  // adjusted by the value after the main vector loop. Find the resume value
  // created during execution of the main VPlan. It must be the first phi in the
  // loop preheader. Use the value to increment the canonical IV, and update all
  // users in the loop region to use the adjusted value.
  // FIXME: Improve modeling for canonical IV start values in the epilogue
  // loop.
  using namespace llvm::PatternMatch;
  PHINode *EPResumeVal = &*L->getLoopPreheader()->phis().begin();
  // The resume phi has zero incoming values from bypass edges and a single
  // non-zero incoming value: the vector trip count of the main loop.
  for (Value *Inc : EPResumeVal->incoming_values()) {
    if (match(V: Inc, P: m_SpecificInt(V: 0)))
      continue;
    assert(!EPI.VectorTripCount &&
           "Must only have a single non-zero incoming value");
    EPI.VectorTripCount = Inc;
  }
  // If we didn't find a non-zero vector trip count, all incoming values
  // must be zero, which also means the vector trip count is zero. Pick the
  // first zero as vector trip count.
  // TODO: We should not choose VF * UF so the main vector loop is known to
  // be dead.
  if (!EPI.VectorTripCount) {
    assert(EPResumeVal->getNumIncomingValues() > 0 &&
           all_of(EPResumeVal->incoming_values(),
                  [](Value *Inc) { return match(Inc, m_SpecificInt(0)); }) &&
           "all incoming values must be 0");
    EPI.VectorTripCount = EPResumeVal->getOperand(i_nocapture: 0);
  }
  VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
  assert(all_of(IV->users(),
                [](const VPUser *U) {
                  return isa<VPScalarIVStepsRecipe>(U) ||
                         isa<VPDerivedIVRecipe>(U) ||
                         cast<VPRecipeBase>(U)->isScalarCast() ||
                         cast<VPInstruction>(U)->getOpcode() ==
                             Instruction::Add;
                }) &&
         "the canonical IV should only be used by its increment or "
         "ScalarIVSteps when resetting the start value");
  VPBuilder Builder(Header, Header->getFirstNonPhi());
  // Create Add = IV + resume value and redirect all former IV users to it.
  // The RAUW also rewrote Add's own operand, so restore IV as operand 0.
  VPInstruction *Add = Builder.createAdd(LHS: IV, RHS: VPV);
  IV->replaceAllUsesWith(New: Add);
  Add->setOperand(I: 0, New: IV);

  DenseMap<Value *, Value *> ToFrozen;
  SmallVector<Instruction *> InstsToMove;
  // Ensure that the start values for all header phi recipes are updated before
  // vectorizing the epilogue loop. Skip the canonical IV, which has been
  // handled above.
  for (VPRecipeBase &R : drop_begin(RangeOrContainer: Header->phis())) {
    Value *ResumeV = nullptr;
    // TODO: Move setting of resume values to prepareToExecute.
    if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
      // Find the reduction result by searching users of the phi or its backedge
      // value.
      auto IsReductionResult = [](VPRecipeBase *R) {
        auto *VPI = dyn_cast<VPInstruction>(Val: R);
        if (!VPI)
          return false;
        return VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
               VPI->getOpcode() == VPInstruction::ComputeReductionResult;
      };
      auto *RdxResult = cast<VPInstruction>(
          Val: vputils::findRecipe(Start: ReductionPhi->getBackedgeValue(), Pred: IsReductionResult));
      assert(RdxResult && "expected to find reduction result");

      // The main loop's resume value for this reduction lives in the scalar
      // phi feeding the epilogue loop's preheader.
      ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
                    ->getIncomingValueForBlock(BB: L->getLoopPreheader());

      // Check for FindIV pattern by looking for icmp user of RdxResult.
      // The pattern is: select(icmp ne RdxResult, Sentinel), RdxResult, Start
      using namespace VPlanPatternMatch;
      VPValue *SentinelVPV = nullptr;
      bool IsFindIV = any_of(Range: RdxResult->users(), P: [&](VPUser *U) {
        return match(U, P: VPlanPatternMatch::m_SpecificICmp(
                            MatchPred: ICmpInst::ICMP_NE, Op0: m_Specific(VPV: RdxResult),
                            Op1: m_VPValue(V&: SentinelVPV)));
      });

      if (RdxResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
        Value *StartV = RdxResult->getOperand(N: 0)->getLiveInIRValue();
        // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
        // start value; compare the final value from the main vector loop
        // to the start value.
        BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
        ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else if (IsFindIV) {
        assert(SentinelVPV && "expected to find icmp using RdxResult");

        // Get the frozen start value from the main loop.
        Value *FrozenStartV = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
            BB: EPI.MainLoopIterationCountCheck);
        // Remember original -> frozen mapping so Freeze VPInstructions in the
        // epilogue plan can reuse the same frozen value (see loop below).
        if (auto *FreezeI = dyn_cast<FreezeInst>(Val: FrozenStartV))
          ToFrozen[FreezeI->getOperand(i_nocapture: 0)] = FrozenStartV;

        // Adjust resume: select(icmp eq ResumeV, FrozenStartV), Sentinel,
        // ResumeV
        BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
        IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
        Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: FrozenStartV);
        if (auto *I = dyn_cast<Instruction>(Val: Cmp))
          InstsToMove.push_back(Elt: I);
        ResumeV =
            Builder.CreateSelect(C: Cmp, True: SentinelVPV->getLiveInIRValue(), False: ResumeV);
        if (auto *I = dyn_cast<Instruction>(Val: ResumeV))
          InstsToMove.push_back(Elt: I);
      } else {
        VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
        // NOTE(review): PhiR re-derives the same recipe as ReductionPhi above;
        // the two are identical here.
        auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
        if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
          assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
                 "unexpected start value");
          // Partial sub-reductions always start at 0 and account for the
          // reduction start value in a final subtraction. Update it to use the
          // resume value from the main vector loop.
          if (PhiR->getVFScaleFactor() > 1 &&
              PhiR->getRecurrenceKind() == RecurKind::Sub) {
            auto *Sub = cast<VPInstruction>(Val: RdxResult->getSingleUser());
            assert(Sub->getOpcode() == Instruction::Sub && "Unexpected opcode");
            assert(isa<VPIRValue>(Sub->getOperand(0)) &&
                   "Expected operand to match the original start value of the "
                   "reduction");
            assert(VPlanPatternMatch::match(VPI->getOperand(0),
                                            VPlanPatternMatch::m_ZeroInt()) &&
                   "Expected start value for partial sub-reduction to start at "
                   "zero");
            Sub->setOperand(I: 0, New: StartVal);
          } else
            VPI->setOperand(I: 0, New: StartVal);
          // Start value has been consumed by the ReductionStartVector; no
          // further start-value update needed for this phi.
          continue;
        }
      }
    } else {
      // Retrieve the induction resume values for wide inductions from
      // their original phi nodes in the scalar loop.
      PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
      // Hook up to the PHINode generated by a ResumePhi recipe of main
      // loop VPlan, which feeds the scalar loop.
      ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
    }
    assert(ResumeV && "Must have a resume value");
    VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
    cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
  }

  // For some VPValues in the epilogue plan we must re-use the generated IR
  // values from the main plan. Replace them with live-in VPValues.
  // TODO: This is a workaround needed for epilogue vectorization and it
  // should be removed once induction resume value creation is done
  // directly in VPlan.
  for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
    // Re-use frozen values from the main plan for Freeze VPInstructions in the
    // epilogue plan. This ensures all users use the same frozen value.
    auto *VPI = dyn_cast<VPInstruction>(Val: &R);
    if (VPI && VPI->getOpcode() == Instruction::Freeze) {
      VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
          V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
      continue;
    }

    // Re-use the trip count and steps expanded for the main loop, as
    // skeleton creation needs it as a value that dominates both the scalar
    // and vector epilogue loops
    auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
    if (!ExpandR)
      continue;
    VPValue *ExpandedVal =
        Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
    ExpandR->replaceAllUsesWith(New: ExpandedVal);
    if (Plan.getTripCount() == ExpandR)
      Plan.resetTripCount(NewTripCount: ExpandedVal);
    ExpandR->eraseFromParent();
  }

  // Estimate the per-iteration element counts of the main and epilogue loops
  // to size the minimum-iteration check guarding the epilogue vector loop.
  auto VScale = CM.getVScaleForTuning();
  unsigned MainLoopStep =
      estimateElementCount(VF: EPI.MainLoopVF * EPI.MainLoopUF, VScale);
  unsigned EpilogueLoopStep =
      estimateElementCount(VF: EPI.EpilogueVF * EPI.EpilogueUF, VScale);
  VPlanTransforms::addMinimumVectorEpilogueIterationCheck(
      Plan, TripCount: EPI.TripCount, VectorTripCount: EPI.VectorTripCount,
      RequiresScalarEpilogue: CM.requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector()), EpilogueVF: EPI.EpilogueVF,
      EpilogueUF: EPI.EpilogueUF, MainLoopStep, EpilogueLoopStep, SE);

  return InstsToMove;
}
9204
/// Fix up resume values in the scalar preheader of \p L and in the epilogue
/// plan's resume phis so that edges from the additional bypass block \p
/// BypassBlock carry the correct incoming values. \p ResumeValues are the
/// ResumeForEpilogue instructions created for the main plan's scalar-header
/// phis; they correspond 1:1 with the scalar loop header phis of \p
/// BestEpiPlan.
static void
fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
                                VPlan &BestEpiPlan,
                                ArrayRef<VPInstruction *> ResumeValues) {
  // Fix resume values from the additional bypass block.
  BasicBlock *PH = L->getLoopPreheader();
  for (auto *Pred : predecessors(BB: PH)) {
    for (PHINode &Phi : PH->phis()) {
      // Only add incoming values for predecessors the phi does not know
      // about yet, reusing the value flowing in from the bypass block.
      if (Phi.getBasicBlockIndex(BB: Pred) != -1)
        continue;
      Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
    }
  }
  auto *ScalarPH = cast<VPIRBasicBlock>(Val: BestEpiPlan.getScalarPreheader());
  if (ScalarPH->hasPredecessors()) {
    // Fix resume values for inductions and reductions from the additional
    // bypass block using the incoming values from the main loop's resume phis.
    // ResumeValues correspond 1:1 with the scalar loop header phis.
    for (auto [ResumeV, HeaderPhi] :
         zip(t&: ResumeValues, u: BestEpiPlan.getScalarHeader()->phis())) {
      auto *HeaderPhiR = cast<VPIRPhi>(Val: &HeaderPhi);
      // Live-in incoming values need no fixing.
      if (isa<VPIRValue>(Val: HeaderPhiR->getIncomingValueForBlock(VPBB: ScalarPH)))
        continue;
      auto *EpiResumePhi =
          cast<PHINode>(Val: HeaderPhiR->getIRPhi().getIncomingValueForBlock(BB: PH));
      // Skip phis that have no edge from the bypass block.
      if (EpiResumePhi->getBasicBlockIndex(BB: BypassBlock) == -1)
        continue;
      // Forward the main loop's bypass value to the epilogue resume phi.
      auto *MainResumePhi = cast<PHINode>(Val: ResumeV->getUnderlyingValue());
      EpiResumePhi->setIncomingValueForBlock(
          BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
    }
  }
}
9238
/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
/// loop, after both plans have executed, updating branches from the iteration
/// and runtime checks of the main loop, as well as updating various phis. \p
/// InstsToMove contains instructions that need to be moved to the preheader of
/// the epilogue vector loop.
static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
                                      EpilogueLoopVectorizationInfo &EPI,
                                      DominatorTree *DT,
                                      GeneratedRTChecks &Checks,
                                      ArrayRef<Instruction *> InstsToMove,
                                      ArrayRef<VPInstruction *> ResumeValues) {
  BasicBlock *VecEpilogueIterationCountCheck =
      cast<VPIRBasicBlock>(Val: EpiPlan.getEntry())->getIRBasicBlock();

  // The false successor of the epilogue iteration-count check is the epilogue
  // vector preheader.
  BasicBlock *VecEpiloguePreHeader =
      cast<CondBrInst>(Val: VecEpilogueIterationCountCheck->getTerminator())
          ->getSuccessor(i: 1);
  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Redirect the main loop's iteration count check straight to the epilogue
  // preheader, bypassing the epilogue iteration count check; keep the
  // dominator tree in sync eagerly.
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: VecEpiloguePreHeader);

  DTU.applyUpdates(Updates: {{DominatorTree::Delete, EPI.MainLoopIterationCountCheck,
                     VecEpilogueIterationCountCheck},
                    {DominatorTree::Insert, EPI.MainLoopIterationCountCheck,
                     VecEpiloguePreHeader}});

  // The epilogue iteration count check now branches to the scalar preheader
  // instead of the (dead) edge into the epilogue entry block.
  BasicBlock *ScalarPH =
      cast<VPIRBasicBlock>(Val: EpiPlan.getScalarPreheader())->getIRBasicBlock();
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: ScalarPH);
  DTU.applyUpdates(
      Updates: {{DominatorTree::Delete, EPI.EpilogueIterationCountCheck,
        VecEpilogueIterationCountCheck},
       {DominatorTree::Insert, EPI.EpilogueIterationCountCheck, ScalarPH}});

  // Adjust the terminators of runtime check blocks and phis using them.
  BasicBlock *SCEVCheckBlock = Checks.getSCEVChecks().second;
  BasicBlock *MemCheckBlock = Checks.getMemRuntimeChecks().second;
  if (SCEVCheckBlock) {
    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(Updates: {{DominatorTree::Delete, SCEVCheckBlock,
                       VecEpilogueIterationCountCheck},
                      {DominatorTree::Insert, SCEVCheckBlock, ScalarPH}});
  }
  if (MemCheckBlock) {
    MemCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: ScalarPH);
    DTU.applyUpdates(
        Updates: {{DominatorTree::Delete, MemCheckBlock, VecEpilogueIterationCountCheck},
         {DominatorTree::Insert, MemCheckBlock, ScalarPH}});
  }

  // The vec.epilog.iter.check block may contain Phi nodes from inductions
  // or reductions which merge control-flow from the latch block and the
  // middle block. Update the incoming values here and move the Phi into the
  // preheader.
  SmallVector<PHINode *, 4> PhisInBlock(
      llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(InsertPos: VecEpiloguePreHeader->getFirstNonPHIIt());
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the
    // incoming value and also those from other check blocks. This is needed
    // for reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (SCEVCheckBlock)
      Phi->removeIncomingValue(BB: SCEVCheckBlock);
    if (MemCheckBlock)
      Phi->removeIncomingValue(BB: MemCheckBlock);
  }

  // Move the instructions created while computing resume values (see
  // preparePlanForEpilogueVectorLoop) into the epilogue vector preheader.
  auto IP = VecEpiloguePreHeader->getFirstNonPHIIt();
  for (auto *I : InstsToMove)
    I->moveBefore(InsertPos: IP);

  // VecEpilogueIterationCountCheck conditionally skips over the epilogue loop
  // after executing the main loop. We need to update the resume values of
  // inductions and reductions during epilogue vectorization.
  fixScalarResumeValuesFromBypass(BypassBlock: VecEpilogueIterationCountCheck, L, BestEpiPlan&: EpiPlan,
                                  ResumeValues);

  // Remove dead phis that were moved to the epilogue preheader but are unused
  // (e.g., resume phis for inductions not widened in the epilogue vector loop).
  for (PHINode &Phi : make_early_inc_range(Range: VecEpiloguePreHeader->phis()))
    if (Phi.use_empty())
      Phi.eraseFromParent();
}
9340
9341bool LoopVectorizePass::processLoop(Loop *L) {
9342 assert((EnableVPlanNativePath || L->isInnermost()) &&
9343 "VPlan-native path is not enabled. Only process inner loops.");
9344
9345 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9346 << L->getHeader()->getParent()->getName() << "' from "
9347 << L->getLocStr() << "\n");
9348
9349 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9350
9351 LLVM_DEBUG(
9352 dbgs() << "LV: Loop hints:"
9353 << " force="
9354 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9355 ? "disabled"
9356 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9357 ? "enabled"
9358 : "?"))
9359 << " width=" << Hints.getWidth()
9360 << " interleave=" << Hints.getInterleave() << "\n");
9361
9362 // Function containing loop
9363 Function *F = L->getHeader()->getParent();
9364
9365 // Looking at the diagnostic output is the only way to determine if a loop
9366 // was vectorized (other than looking at the IR or machine code), so it
9367 // is important to generate an optimization remark for each loop. Most of
9368 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9369 // generated as OptimizationRemark and OptimizationRemarkMissed are
9370 // less verbose reporting vectorized loops and unvectorized loops that may
9371 // benefit from vectorization, respectively.
9372
9373 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9374 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9375 return false;
9376 }
9377
9378 PredicatedScalarEvolution PSE(*SE, *L);
9379
9380 // Query this against the original loop and save it here because the profile
9381 // of the original loop header may change as the transformation happens.
9382 bool OptForSize = llvm::shouldOptimizeForSize(
9383 BB: L->getHeader(), PSI,
9384 BFI: PSI && PSI->hasProfileSummary() ? &GetBFI() : nullptr,
9385 QueryType: PGSOQueryType::IRPass);
9386
9387 // Check if it is legal to vectorize the loop.
9388 LoopVectorizationRequirements Requirements;
9389 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9390 &Requirements, &Hints, DB, AC,
9391 /*AllowRuntimeSCEVChecks=*/!OptForSize, AA);
9392 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9393 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9394 Hints.emitRemarkWithHints();
9395 return false;
9396 }
9397
9398 if (LVL.hasUncountableEarlyExit()) {
9399 if (!EnableEarlyExitVectorization) {
9400 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
9401 "early exit is not enabled",
9402 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
9403 return false;
9404 }
9405 }
9406
9407 if (!LVL.getPotentiallyFaultingLoads().empty()) {
9408 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with potentially "
9409 "faulting load is not supported",
9410 ORETag: "PotentiallyFaultingLoadsNotSupported", ORE, TheLoop: L);
9411 return false;
9412 }
9413
9414 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9415 // here. They may require CFG and instruction level transformations before
9416 // even evaluating whether vectorization is profitable. Since we cannot modify
9417 // the incoming IR, we need to build VPlan upfront in the vectorization
9418 // pipeline.
9419 if (!L->isInnermost())
9420 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9421 ORE, GetBFI, OptForSize, Hints,
9422 Requirements);
9423
9424 assert(L->isInnermost() && "Inner loop expected.");
9425
9426 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9427 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9428
9429 // If an override option has been passed in for interleaved accesses, use it.
9430 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9431 UseInterleaved = EnableInterleavedMemAccesses;
9432
9433 // Analyze interleaved memory accesses.
9434 if (UseInterleaved)
9435 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9436
9437 if (LVL.hasUncountableEarlyExit()) {
9438 BasicBlock *LoopLatch = L->getLoopLatch();
9439 if (IAI.requiresScalarEpilogue() ||
9440 any_of(Range: LVL.getCountableExitingBlocks(),
9441 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9442 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
9443 "requiring a scalar epilogue is unsupported",
9444 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
9445 return false;
9446 }
9447 }
9448
9449 // Check the function attributes and profiles to find out if this function
9450 // should be optimized for size.
9451 ScalarEpilogueLowering SEL =
9452 getScalarEpilogueLowering(F, L, Hints, OptForSize, TTI, TLI, LVL, IAI: &IAI);
9453
9454 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9455 // count by optimizing for size, to minimize overheads.
9456 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9457 if (ExpectedTC && ExpectedTC->isFixed() &&
9458 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
9459 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9460 << "This loop is worth vectorizing only if no scalar "
9461 << "iteration overheads are incurred.");
9462 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9463 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9464 else {
9465 LLVM_DEBUG(dbgs() << "\n");
9466 // Predicate tail-folded loops are efficient even when the loop
9467 // iteration count is low. However, setting the epilogue policy to
9468 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
9469 // with runtime checks. It's more effective to let
9470 // `isOutsideLoopWorkProfitable` determine if vectorization is
9471 // beneficial for the loop.
9472 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
9473 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9474 }
9475 }
9476
9477 // Check the function attributes to see if implicit floats or vectors are
9478 // allowed.
9479 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
9480 reportVectorizationFailure(
9481 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
9482 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
9483 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
9484 Hints.emitRemarkWithHints();
9485 return false;
9486 }
9487
9488 // Check if the target supports potentially unsafe FP vectorization.
9489 // FIXME: Add a check for the type of safety issue (denormal, signaling)
9490 // for the target we're vectorizing for, to make sure none of the
9491 // additional fp-math flags can help.
9492 if (Hints.isPotentiallyUnsafe() &&
9493 TTI->isFPVectorizationPotentiallyUnsafe()) {
9494 reportVectorizationFailure(
9495 DebugMsg: "Potentially unsafe FP op prevents vectorization",
9496 OREMsg: "loop not vectorized due to unsafe FP support.",
9497 ORETag: "UnsafeFP", ORE, TheLoop: L);
9498 Hints.emitRemarkWithHints();
9499 return false;
9500 }
9501
9502 bool AllowOrderedReductions;
9503 // If the flag is set, use that instead and override the TTI behaviour.
9504 if (ForceOrderedReductions.getNumOccurrences() > 0)
9505 AllowOrderedReductions = ForceOrderedReductions;
9506 else
9507 AllowOrderedReductions = TTI->enableOrderedReductions();
9508 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
9509 ORE->emit(RemarkBuilder: [&]() {
9510 auto *ExactFPMathInst = Requirements.getExactFPInst();
9511 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9512 ExactFPMathInst->getDebugLoc(),
9513 ExactFPMathInst->getParent())
9514 << "loop not vectorized: cannot prove it is safe to reorder "
9515 "floating-point operations";
9516 });
9517 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9518 "reorder floating-point operations\n");
9519 Hints.emitRemarkWithHints();
9520 return false;
9521 }
9522
9523 // Use the cost model.
9524 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9525 GetBFI, F, &Hints, IAI, OptForSize);
9526 // Use the planner for vectorization.
9527 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
9528 ORE);
9529
9530 // Get user vectorization factor and interleave count.
9531 ElementCount UserVF = Hints.getWidth();
9532 unsigned UserIC = Hints.getInterleave();
9533 if (UserIC > 1 && !LVL.isSafeForAnyVectorWidth())
9534 UserIC = 1;
9535
9536 // Plan how to best vectorize.
9537 LVP.plan(UserVF, UserIC);
9538 VectorizationFactor VF = LVP.computeBestVF();
9539 unsigned IC = 1;
9540
9541 if (ORE->allowExtraAnalysis(LV_NAME))
9542 LVP.emitInvalidCostRemarks(ORE);
9543
9544 GeneratedRTChecks Checks(PSE, DT, LI, TTI, CM.CostKind);
9545 if (LVP.hasPlanWithVF(VF: VF.Width)) {
9546 // Select the interleave count.
9547 IC = LVP.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
9548
9549 unsigned SelectedIC = std::max(a: IC, b: UserIC);
9550 // Optimistically generate runtime checks if they are needed. Drop them if
9551 // they turn out to not be profitable.
9552 if (VF.Width.isVector() || SelectedIC > 1) {
9553 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC,
9554 ORE&: *ORE);
9555
9556 // Bail out early if either the SCEV or memory runtime checks are known to
9557 // fail. In that case, the vector loop would never execute.
9558 using namespace llvm::PatternMatch;
9559 if (Checks.getSCEVChecks().first &&
9560 match(V: Checks.getSCEVChecks().first, P: m_One()))
9561 return false;
9562 if (Checks.getMemRuntimeChecks().first &&
9563 match(V: Checks.getMemRuntimeChecks().first, P: m_One()))
9564 return false;
9565 }
9566
9567 // Check if it is profitable to vectorize with runtime checks.
9568 bool ForceVectorization =
9569 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
9570 VPCostContext CostCtx(CM.TTI, *CM.TLI, LVP.getPlanFor(VF: VF.Width), CM,
9571 CM.CostKind, CM.PSE, L);
9572 if (!ForceVectorization &&
9573 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
9574 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
9575 VScale: CM.getVScaleForTuning())) {
9576 ORE->emit(RemarkBuilder: [&]() {
9577 return OptimizationRemarkAnalysisAliasing(
9578 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
9579 L->getHeader())
9580 << "loop not vectorized: cannot prove it is safe to reorder "
9581 "memory operations";
9582 });
9583 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
9584 Hints.emitRemarkWithHints();
9585 return false;
9586 }
9587 }
9588
9589 // Identify the diagnostic messages that should be produced.
9590 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
9591 bool VectorizeLoop = true, InterleaveLoop = true;
9592 if (VF.Width.isScalar()) {
9593 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
9594 VecDiagMsg = {
9595 "VectorizationNotBeneficial",
9596 "the cost-model indicates that vectorization is not beneficial"};
9597 VectorizeLoop = false;
9598 }
9599
9600 if (UserIC == 1 && Hints.getInterleave() > 1) {
9601 assert(!LVL.isSafeForAnyVectorWidth() &&
9602 "UserIC should only be ignored due to unsafe dependencies");
9603 LLVM_DEBUG(dbgs() << "LV: Ignoring user-specified interleave count.\n");
9604 IntDiagMsg = {"InterleavingUnsafe",
9605 "Ignoring user-specified interleave count due to possibly "
9606 "unsafe dependencies in the loop."};
9607 InterleaveLoop = false;
9608 } else if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
9609 // Tell the user interleaving was avoided up-front, despite being explicitly
9610 // requested.
9611 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
9612 "interleaving should be avoided up front\n");
9613 IntDiagMsg = {"InterleavingAvoided",
9614 "Ignoring UserIC, because interleaving was avoided up front"};
9615 InterleaveLoop = false;
9616 } else if (IC == 1 && UserIC <= 1) {
9617 // Tell the user interleaving is not beneficial.
9618 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
9619 IntDiagMsg = {
9620 "InterleavingNotBeneficial",
9621 "the cost-model indicates that interleaving is not beneficial"};
9622 InterleaveLoop = false;
9623 if (UserIC == 1) {
9624 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
9625 IntDiagMsg.second +=
9626 " and is explicitly disabled or interleave count is set to 1";
9627 }
9628 } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
9630 LLVM_DEBUG(dbgs() << "LV: Interleaving is beneficial but is explicitly "
9631 "disabled.\n");
9632 IntDiagMsg = {"InterleavingBeneficialButDisabled",
9633 "the cost-model indicates that interleaving is beneficial "
9634 "but is explicitly disabled or interleave count is set to 1"};
9635 InterleaveLoop = false;
9636 }
9637
9638 // If there is a histogram in the loop, do not just interleave without
9639 // vectorizing. The order of operations will be incorrect without the
9640 // histogram intrinsics, which are only used for recipes with VF > 1.
9641 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
9642 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
9643 << "to histogram operations.\n");
9644 IntDiagMsg = {
9645 "HistogramPreventsScalarInterleaving",
9646 "Unable to interleave without vectorization due to constraints on "
9647 "the order of histogram operations"};
9648 InterleaveLoop = false;
9649 }
9650
9651 // Override IC if user provided an interleave count.
9652 IC = UserIC > 0 ? UserIC : IC;
9653
9654 // Emit diagnostic messages, if any.
9655 const char *VAPassName = Hints.vectorizeAnalysisPassName();
9656 if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
9658 ORE->emit(RemarkBuilder: [&]() {
9659 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
9660 L->getStartLoc(), L->getHeader())
9661 << VecDiagMsg.second;
9662 });
9663 ORE->emit(RemarkBuilder: [&]() {
9664 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
9665 L->getStartLoc(), L->getHeader())
9666 << IntDiagMsg.second;
9667 });
9668 return false;
9669 }
9670
9671 if (!VectorizeLoop && InterleaveLoop) {
9672 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9673 ORE->emit(RemarkBuilder: [&]() {
9674 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
9675 L->getStartLoc(), L->getHeader())
9676 << VecDiagMsg.second;
9677 });
9678 } else if (VectorizeLoop && !InterleaveLoop) {
9679 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9680 << ") in " << L->getLocStr() << '\n');
9681 ORE->emit(RemarkBuilder: [&]() {
9682 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
9683 L->getStartLoc(), L->getHeader())
9684 << IntDiagMsg.second;
9685 });
9686 } else if (VectorizeLoop && InterleaveLoop) {
9687 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
9688 << ") in " << L->getLocStr() << '\n');
9689 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
9690 }
9691
9692 // Report the vectorization decision.
9693 if (VF.Width.isScalar()) {
9694 using namespace ore;
9695 assert(IC > 1);
9696 ORE->emit(RemarkBuilder: [&]() {
9697 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
9698 L->getHeader())
9699 << "interleaved loop (interleaved count: "
9700 << NV("InterleaveCount", IC) << ")";
9701 });
9702 } else {
9703 // Report the vectorization decision.
9704 reportVectorization(ORE, TheLoop: L, VF, IC);
9705 }
9706 if (ORE->allowExtraAnalysis(LV_NAME))
9707 checkMixedPrecision(L, ORE);
9708
9709 // If we decided that it is *legal* to interleave or vectorize the loop, then
9710 // do it.
9711
9712 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9713 // Consider vectorizing the epilogue too if it's profitable.
9714 VectorizationFactor EpilogueVF =
9715 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
9716 if (EpilogueVF.Width.isVector()) {
9717 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
9718
9719 // The first pass vectorizes the main loop and creates a scalar epilogue
9720 // to be vectorized by executing the plan (potentially with a different
9721 // factor) again shortly afterwards.
9722 VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width);
9723 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
9724 BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
9725 SmallVector<VPInstruction *> ResumeValues =
9726 preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan);
9727 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
9728 BestEpiPlan);
9729 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9730 Checks, *BestMainPlan);
9731 auto ExpandedSCEVs = LVP.executePlan(BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF,
9732 BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, VectorizingEpilogue: false);
9733 ++LoopsVectorized;
9734
9735 // Second pass vectorizes the epilogue and adjusts the control flow
9736 // edges from the first pass.
9737 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
9738 Checks, BestEpiPlan);
9739 SmallVector<Instruction *> InstsToMove = preparePlanForEpilogueVectorLoop(
9740 Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI, CM, SE&: *PSE.getSE());
9741 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV, DT,
9742 VectorizingEpilogue: true);
9743 connectEpilogueVectorLoop(EpiPlan&: BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
9744 ResumeValues);
9745 ++LoopsEpilogueVectorized;
9746 } else {
9747 InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
9748 BestPlan);
9749 // TODO: Move to general VPlan pipeline once epilogue loops are also
9750 // supported.
9751 RUN_VPLAN_PASS(VPlanTransforms::materializeConstantVectorTripCount,
9752 BestPlan, VF.Width, IC, PSE);
9753 LVP.addMinimumIterationCheck(Plan&: BestPlan, VF: VF.Width, UF: IC,
9754 MinProfitableTripCount: VF.MinProfitableTripCount);
9755
9756 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
9757 ++LoopsVectorized;
9758 }
9759
9760 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
9761 "DT not preserved correctly");
9762 assert(!verifyFunction(*F, &dbgs()));
9763
9764 return true;
9765}
9766
9767LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
9768
9769 // Don't attempt if
9770 // 1. the target claims to have no vector registers, and
9771 // 2. interleaving won't help ILP.
9772 //
9773 // The second condition is necessary because, even if the target has no
9774 // vector registers, loop vectorization may still enable scalar
9775 // interleaving.
9776 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
9777 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
9778 return LoopVectorizeResult(false, false);
9779
9780 bool Changed = false, CFGChanged = false;
9781
9782 // The vectorizer requires loops to be in simplified form.
9783 // Since simplification may add new inner loops, it has to run before the
9784 // legality and profitability checks. This means running the loop vectorizer
9785 // will simplify all loops, regardless of whether anything end up being
9786 // vectorized.
9787 for (const auto &L : *LI)
9788 Changed |= CFGChanged |=
9789 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
9790
9791 // Build up a worklist of inner-loops to vectorize. This is necessary as
9792 // the act of vectorizing or partially unrolling a loop creates new loops
9793 // and can invalidate iterators across the loops.
9794 SmallVector<Loop *, 8> Worklist;
9795
9796 for (Loop *L : *LI)
9797 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
9798
9799 LoopsAnalyzed += Worklist.size();
9800
9801 // Now walk the identified inner loops.
9802 while (!Worklist.empty()) {
9803 Loop *L = Worklist.pop_back_val();
9804
9805 // For the inner loops we actually process, form LCSSA to simplify the
9806 // transform.
9807 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
9808
9809 Changed |= CFGChanged |= processLoop(L);
9810
9811 if (Changed) {
9812 LAIs->clear();
9813
9814#ifndef NDEBUG
9815 if (VerifySCEV)
9816 SE->verify();
9817#endif
9818 }
9819 }
9820
9821 // Process each loop nest in the function.
9822 return LoopVectorizeResult(Changed, CFGChanged);
9823}
9824
9825PreservedAnalyses LoopVectorizePass::run(Function &F,
9826 FunctionAnalysisManager &AM) {
9827 LI = &AM.getResult<LoopAnalysis>(IR&: F);
9828 // There are no loops in the function. Return before computing other
9829 // expensive analyses.
9830 if (LI->empty())
9831 return PreservedAnalyses::all();
9832 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
9833 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
9834 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
9835 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
9836 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
9837 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
9838 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
9839 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
9840 AA = &AM.getResult<AAManager>(IR&: F);
9841
9842 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
9843 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
9844 GetBFI = [&AM, &F]() -> BlockFrequencyInfo & {
9845 return AM.getResult<BlockFrequencyAnalysis>(IR&: F);
9846 };
9847 LoopVectorizeResult Result = runImpl(F);
9848 if (!Result.MadeAnyChange)
9849 return PreservedAnalyses::all();
9850 PreservedAnalyses PA;
9851
9852 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
9853 for (auto &BB : F)
9854 RemoveRedundantDbgInstrs(BB: &BB);
9855 }
9856
9857 PA.preserve<LoopAnalysis>();
9858 PA.preserve<DominatorTreeAnalysis>();
9859 PA.preserve<ScalarEvolutionAnalysis>();
9860 PA.preserve<LoopAccessAnalysis>();
9861
9862 if (Result.MadeCFGChange) {
9863 // Making CFG changes likely means a loop got vectorized. Indicate that
9864 // extra simplification passes should be run.
9865 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
9866 // be run if runtime checks have been added.
9867 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
9868 PA.preserve<ShouldRunExtraVectorPasses>();
9869 } else {
9870 PA.preserveSet<CFGAnalyses>();
9871 }
9872 return PA;
9873}
9874
9875void LoopVectorizePass::printPipeline(
9876 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
9877 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
9878 OS, MapClassName2PassName);
9879
9880 OS << '<';
9881 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
9882 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
9883 OS << '>';
9884}
9885